//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
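  // For example (a sketch of the general behaviour, not lifted from this
  // file): a scalar i8 setcc result is either 0 or 1, while a vector compare
  // such as PCMPEQD leaves each lane all-zeros or all-ones, which is why
  // vector "booleans" are modelled here as 0 / -1.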
  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());

  // Bypass expensive divides on Atom when compiling with O2.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 16);
  }
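  // Sketch of what the bypass does (general behaviour of addBypassSlowDiv,
  // not specific to this file): a 32-bit divide is wrapped in a runtime check
  // and, when both operands actually fit in 8 bits, the much cheaper 8-bit
  // divide is used instead; likewise 64-bit divides get a 16-bit fast path.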
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }

  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);
  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);

  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }
  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);

  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
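  // Rough example of the effect described above: for
  //   %q = sdiv i32 %x, %y
  //   %r = srem i32 %x, %y
  // both operations legalize to the two-result ISD::SDIVREM, CSE merges them,
  // and a single IDIV then supplies the quotient and the remainder.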
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
    setOperationAction(ISD::ADDC, VT, Custom);
    setOperationAction(ISD::ADDE, VT, Custom);
    setOperationAction(ISD::SUBC, VT, Custom);
    setOperationAction(ISD::SUBE, VT, Custom);
  }

  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);

  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }

  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
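  // Illustration (an assumption about the default RTLIB helper names, not
  // something this file states): once FP16_TO_FP/FP_TO_FP16 are Expand, a
  // half <-> float conversion becomes a runtime-library call (typically
  // __gnu_h2f_ieee / __gnu_f2h_ieee) instead of a VCVTPH2PS/VCVTPS2PH pair.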
  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);

  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
    setOperationAction(ISD::SETCCE, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP support here is NOT intended to support
  // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, etc. As a result, no
  // other SjLj exception interfaces are implemented and please don't build
  // your own exception handling based on them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
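  // For context (general LLVM behaviour, hedged): these nodes back the
  // @llvm.eh.sjlj.setjmp / @llvm.eh.sjlj.longjmp intrinsics, so a front end
  // can get a cheap buffer-based setjmp/longjmp without pulling in the full
  // SjLj exception-handling machinery.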
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);

  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }

  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
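    // Sketch of the lowering mentioned above (not the verbatim lowering
    // code): (fgetsign x) becomes roughly
    //   (and (movmsk (scalar_to_vector x)), 1)
    // i.e. MOVMSKPS/MOVMSKPD collects the sign bits and the AND keeps bit 0.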
    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);

      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }

  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);

  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FPOWI, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i32, Custom);

    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);

    setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
    setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
    setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
    setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);

    setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
    // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
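    // A rough example of what the custom legalization above allows: a
    // sign-extending load of <4 x i8> into <4 x i32> can be emitted as one
    // 32-bit scalar load followed by an in-register extend (shuffle + shifts,
    // or a PMOVSXBD-style instruction when SSE4.1 is available), rather than
    // four separate element loads.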
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
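    // Illustration of the promotion above (a sketch, not taken from the
    // legalizer itself): (and v4i32 %a, %b) is rewritten as
    //   bitcast (and v2i64 (bitcast %a), (bitcast %b)) back to v4i32,
    // so all the bitwise ops funnel through a single legal 128-bit type.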
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
    for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);

    setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
    setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    // i8 vectors are custom because the source register and source memory
    // operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
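  // Rough idea of the VPPERM trick referenced above (not the literal
  // selection pattern): VPPERM can select a source byte and bit-reverse it in
  // the same instruction, so a full bit reversal only needs a byte shuffle
  // plus that per-byte reversal instead of a long shift/mask sequence.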
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
    setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
    setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
    setOperationAction(ISD::SETCC, MVT::v4i64, Custom);

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }
    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);

      setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
    }
    // In the customized shift lowering, the legal cases in AVX2 will be
    // recognized.
    for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }

    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }
    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    setOperationAction(ISD::BR_CC, MVT::i1, Expand);
    setOperationAction(ISD::SETCC, MVT::i1, Custom);
    setOperationAction(ISD::SETCCE, MVT::i1, Custom);
    setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
    setOperationAction(ISD::XOR, MVT::i1, Legal);
    setOperationAction(ISD::OR, MVT::i1, Legal);
    setOperationAction(ISD::AND, MVT::i1, Legal);
    setOperationAction(ISD::SUB, MVT::i1, Custom);
    setOperationAction(ISD::ADD, MVT::i1, Custom);
    setOperationAction(ISD::MUL, MVT::i1, Custom);

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);

    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
    setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);

      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);

      // FIXME: These commands are available on SSE/AVX2, add relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
    if (Subtarget.hasDQI()) {
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
    }
    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }
1321 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1324 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1325 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1334 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1335 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1337 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1341 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1343 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1344 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1345 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1346 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1347 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1348 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1349 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1350 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1352 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1353 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1354 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1355 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1356 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1357 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1358 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1359 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1361 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1362 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1363 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1364 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1365 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1366 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1368 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1370 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1371 setOperationAction(ISD::SRL, VT, Custom);
1372 setOperationAction(ISD::SHL, VT, Custom);
1373 setOperationAction(ISD::SRA, VT, Custom);
1374 setOperationAction(ISD::CTPOP, VT, Custom);
1375 setOperationAction(ISD::CTTZ, VT, Custom);
1378 // Need to promote to 64-bit even though we have 32-bit masked instructions
1379 // because the IR optimizers rearrange bitcasts around logic ops leaving
1380 // too many variations to handle if we don't promote them.
1381 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1382 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1383 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
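// Illustrative sketch (added for clarity, not part of the original comments):
// with the promotions above, a logic op such as
//   (v16i32 and X, Y)
// is legalized roughly as
//   (v16i32 bitcast (and (v8i64 bitcast X), (v8i64 bitcast Y)))
// so only the v8i64 patterns need to be matched by instruction selection.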
1385 if (Subtarget.hasCDI()) {
1386 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1387 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1389 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1390 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1391 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1392 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1394 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1395 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1397 if (Subtarget.hasVLX()) {
1398 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1399 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1400 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1401 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1403 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1404 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1405 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1406 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1409 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1410 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1411 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1412 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1413 } // Subtarget.hasCDI()
1415 if (Subtarget.hasDQI()) {
1416 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1417 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1418 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1419 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
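// Hedged illustration of the comment above: on an AVX512DQ target without
// VLX, a v4i64 multiply is expected to be handled by patterns that widen the
// operands into 512-bit registers, perform the v8i64 multiply, and then take
// the low 256 bits of the result, which is why the narrower types can still
// be marked Legal here.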
1422 // Custom lower several nodes.
1423 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1424 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1425 setOperationAction(ISD::MGATHER, VT, Custom);
1426 setOperationAction(ISD::MSCATTER, VT, Custom);
1428 // Extract subvector is special because the value type
1429 // (result) is 256-bit but the source is 512-bit wide.
1430 // 128-bit was made Custom under AVX1.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1432 MVT::v8f32, MVT::v4f64 })
1433 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1434 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1435 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1436 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1438 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1439 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1440 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1441 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1442 setOperationAction(ISD::VSELECT, VT, Legal);
1443 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1444 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1445 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1446 setOperationAction(ISD::MLOAD, VT, Legal);
1447 setOperationAction(ISD::MSTORE, VT, Legal);
1448 setOperationAction(ISD::MGATHER, VT, Legal);
1449 setOperationAction(ISD::MSCATTER, VT, Custom);
1451 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1452 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1453 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1458 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1459 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1461 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1462 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1464 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1465 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1466 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1467 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1468 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1469 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1471 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1472 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1473 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1474 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1475 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1476 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1477 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1478 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1482 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1483 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1484 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1485 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1486 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1489 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1490 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1492 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1493 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1494 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1495 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1496 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1497 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1499 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1500 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1501 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1502 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1506 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1507 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1508 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1509 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1510 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1511 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1512 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1513 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1514 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1515 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1516 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1517 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1519 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1520 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1521 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1522 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1523 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1524 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1525 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1526 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1530 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1531 if (Subtarget.hasVLX()) {
1532 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1533 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1536 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1537 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1538 setOperationAction(ISD::MLOAD, VT, Action);
1539 setOperationAction(ISD::MSTORE, VT, Action);
1542 if (Subtarget.hasCDI()) {
1543 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1544 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1547 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::SRL, VT, Custom);
1551 setOperationAction(ISD::SHL, VT, Custom);
1552 setOperationAction(ISD::SRA, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 setOperationAction(ISD::CTPOP, VT, Custom);
1556 setOperationAction(ISD::CTTZ, VT, Custom);
1558 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1559 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1560 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1563 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1564 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1565 if (Subtarget.hasVLX()) {
1566 // FIXME: These instructions are also available on SSE/AVX2; add the relevant patterns.
1567 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1568 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1573 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1574 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1575 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1577 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1578 setOperationAction(ISD::ADD, VT, Expand);
1579 setOperationAction(ISD::SUB, VT, Expand);
1580 setOperationAction(ISD::MUL, VT, Expand);
1581 setOperationAction(ISD::VSELECT, VT, Expand);
1583 setOperationAction(ISD::TRUNCATE, VT, Custom);
1584 setOperationAction(ISD::SETCC, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1587 setOperationAction(ISD::SELECT, VT, Custom);
1588 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1589 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1592 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1593 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1594 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1595 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1597 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1598 setOperationAction(ISD::SMAX, VT, Legal);
1599 setOperationAction(ISD::UMAX, VT, Legal);
1600 setOperationAction(ISD::SMIN, VT, Legal);
1601 setOperationAction(ISD::UMIN, VT, Legal);
1605 // We want to custom lower some of our intrinsics.
1606 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1608 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1609 if (!Subtarget.is64Bit()) {
1610 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1611 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1614 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1615 // handle type legalization for these operations here.
1617 // FIXME: We really should do custom legalization for addition and
1618 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1619 // than generic legalization for 64-bit multiplication-with-overflow, though.
1620 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1621 if (VT == MVT::i64 && !Subtarget.is64Bit())
1623 // Add/Sub/Mul with overflow operations are custom lowered.
1624 setOperationAction(ISD::SADDO, VT, Custom);
1625 setOperationAction(ISD::UADDO, VT, Custom);
1626 setOperationAction(ISD::SSUBO, VT, Custom);
1627 setOperationAction(ISD::USUBO, VT, Custom);
1628 setOperationAction(ISD::SMULO, VT, Custom);
1629 setOperationAction(ISD::UMULO, VT, Custom);
1632 if (!Subtarget.is64Bit()) {
1633 // These libcalls are not available in 32-bit.
1634 setLibcallName(RTLIB::SHL_I128, nullptr);
1635 setLibcallName(RTLIB::SRL_I128, nullptr);
1636 setLibcallName(RTLIB::SRA_I128, nullptr);
1639 // Combine sin / cos into one node or libcall if possible.
1640 if (Subtarget.hasSinCos()) {
1641 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1642 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1643 if (Subtarget.isTargetDarwin()) {
1644 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1645 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1646 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1647 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
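// Rough sketch of the intent (added, hedged): a source-level pair such as
//   sincosf(x, &s, &c);
// would otherwise force both results through memory; custom-lowering FSINCOS
// on Darwin lets us emit a single __sincos_stret call that returns both
// values by value instead.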
1651 if (Subtarget.isTargetWin64()) {
1652 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1654 setOperationAction(ISD::SREM, MVT::i128, Custom);
1655 setOperationAction(ISD::UREM, MVT::i128, Custom);
1656 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1657 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1660 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)` is.
1661 // We should promote the value to 64 bits to solve this.
1662 // This is what the CRT headers do - `fmodf` is an inline header
1663 // function that casts to f64 and calls `fmod`.
1664 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1665 Subtarget.isTargetWindowsItanium()))
1666 for (ISD::NodeType Op :
1667 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1668 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1669 if (isOperationExpand(Op, MVT::f32))
1670 setOperationAction(Op, MVT::f32, Promote);
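// Worked example (illustrative only): with the Promote action above, a call
//   float r = fmodf(a, b);
// is effectively lowered on these targets as
//   float r = (float)fmod((double)a, (double)b);
// which matches what the CRT's inline fmodf wrapper does.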
1672 // We have target-specific dag combine patterns for the following nodes:
1673 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1674 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1675 setTargetDAGCombine(ISD::BITCAST);
1676 setTargetDAGCombine(ISD::VSELECT);
1677 setTargetDAGCombine(ISD::SELECT);
1678 setTargetDAGCombine(ISD::SHL);
1679 setTargetDAGCombine(ISD::SRA);
1680 setTargetDAGCombine(ISD::SRL);
1681 setTargetDAGCombine(ISD::OR);
1682 setTargetDAGCombine(ISD::AND);
1683 setTargetDAGCombine(ISD::ADD);
1684 setTargetDAGCombine(ISD::FADD);
1685 setTargetDAGCombine(ISD::FSUB);
1686 setTargetDAGCombine(ISD::FNEG);
1687 setTargetDAGCombine(ISD::FMA);
1688 setTargetDAGCombine(ISD::FMINNUM);
1689 setTargetDAGCombine(ISD::FMAXNUM);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::TRUNCATE);
1696 setTargetDAGCombine(ISD::ZERO_EXTEND);
1697 setTargetDAGCombine(ISD::ANY_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND);
1699 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::UINT_TO_FP);
1702 setTargetDAGCombine(ISD::SETCC);
1703 setTargetDAGCombine(ISD::MUL);
1704 setTargetDAGCombine(ISD::XOR);
1705 setTargetDAGCombine(ISD::MSCATTER);
1706 setTargetDAGCombine(ISD::MGATHER);
1708 computeRegisterProperties(Subtarget.getRegisterInfo());
1710 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1711 MaxStoresPerMemsetOptSize = 8;
1712 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1713 MaxStoresPerMemcpyOptSize = 4;
1714 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1715 MaxStoresPerMemmoveOptSize = 4;
1716 setPrefLoopAlignment(4); // 2^4 bytes.
1718 // An out-of-order CPU can speculatively execute past a predictable branch,
1719 // but a conditional move could be stalled by an expensive earlier operation.
1720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1724 verifyIntrinsicTables();
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
1742 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1743 LLVMContext& Context,
1746 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1748 if (VT.isSimple()) {
1749 MVT VVT = VT.getSimpleVT();
1750 const unsigned NumElts = VVT.getVectorNumElements();
1751 MVT EltVT = VVT.getVectorElementType();
1752 if (VVT.is512BitVector()) {
1753 if (Subtarget.hasAVX512())
1754 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1755 EltVT == MVT::f32 || EltVT == MVT::f64)
1757 case 8: return MVT::v8i1;
1758 case 16: return MVT::v16i1;
1760 if (Subtarget.hasBWI())
1761 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1763 case 32: return MVT::v32i1;
1764 case 64: return MVT::v64i1;
1768 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1769 return MVT::getVectorVT(MVT::i1, NumElts);
1771 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1772 EVT LegalVT = getTypeToTransformTo(Context, VT);
1773 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1776 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1778 case 2: return MVT::v2i1;
1779 case 4: return MVT::v4i1;
1780 case 8: return MVT::v8i1;
1784 return VT.changeVectorElementTypeToInteger();
1787 /// Helper for getByValTypeAlignment to determine
1788 /// the desired ByVal argument alignment.
1789 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1792 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1793 if (VTy->getBitWidth() == 128)
1795 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1796 unsigned EltAlign = 0;
1797 getMaxByValAlign(ATy->getElementType(), EltAlign);
1798 if (EltAlign > MaxAlign)
1799 MaxAlign = EltAlign;
1800 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1801 for (auto *EltTy : STy->elements()) {
1802 unsigned EltAlign = 0;
1803 getMaxByValAlign(EltTy, EltAlign);
1804 if (EltAlign > MaxAlign)
1805 MaxAlign = EltAlign;
1812 /// Return the desired alignment for ByVal aggregate
1813 /// function arguments in the caller parameter area. For X86, aggregates
1814 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1815 /// are at 4-byte boundaries.
1816 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1817 const DataLayout &DL) const {
1818 if (Subtarget.is64Bit()) {
1819 // Max of 8 and alignment of type.
1820 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1827 if (Subtarget.hasSSE1())
1828 getMaxByValAlign(Ty, Align);
1832 /// Returns the target specific optimal type for load
1833 /// and store operations as a result of memset, memcpy, and memmove
1834 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1835 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1836 /// against the alignment requirement,
1837 /// probably because the source does not need to be loaded. If 'IsMemset' is
1838 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1839 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1840 /// source is constant so it does not need to be loaded.
1841 /// It returns EVT::Other if the type should be determined using generic
1842 /// target-independent logic.
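/// For example (illustrative, not normative): a 64-byte memcpy on an AVX
/// target with acceptable unaligned 16-byte access would typically come back
/// as MVT::v32i8, while an SSE2-only target would get MVT::v16i8.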
1844 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1845 unsigned DstAlign, unsigned SrcAlign,
1846 bool IsMemset, bool ZeroMemset,
1848 MachineFunction &MF) const {
1849 const Function *F = MF.getFunction();
1850 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1852 (!Subtarget.isUnalignedMem16Slow() ||
1853 ((DstAlign == 0 || DstAlign >= 16) &&
1854 (SrcAlign == 0 || SrcAlign >= 16)))) {
1855 // FIXME: Check if unaligned 32-byte accesses are slow.
1856 if (Size >= 32 && Subtarget.hasAVX()) {
1857 // Although this isn't a well-supported type for AVX1, we'll let
1858 // legalization and shuffle lowering produce the optimal codegen. If we
1859 // choose an optimal type with a vector element larger than a byte,
1860 // getMemsetStores() may create an intermediate splat (using an integer
1861 // multiply) before we splat as a vector.
1864 if (Subtarget.hasSSE2())
1866 // TODO: Can SSE1 handle a byte vector?
1867 if (Subtarget.hasSSE1())
1869 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1870 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1871 // Do not use f64 to lower memcpy if source is string constant. It's
1872 // better to use i32 to avoid the loads.
1873 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1874 // The gymnastics of splatting a byte value into an XMM register and then
1875 // only using 8-byte stores (because this is a CPU with slow unaligned
1876 // 16-byte accesses) makes that a loser.
1880 // This is a compromise. If we reach here, unaligned accesses may be slow on
1881 // this target. However, creating smaller, aligned accesses could be even
1882 // slower and would certainly be a lot more code.
1883 if (Subtarget.is64Bit() && Size >= 8)
1888 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1890 return X86ScalarSSEf32;
1891 else if (VT == MVT::f64)
1892 return X86ScalarSSEf64;
1897 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1902 switch (VT.getSizeInBits()) {
1904 // 8-byte and under are always assumed to be fast.
1908 *Fast = !Subtarget.isUnalignedMem16Slow();
1911 *Fast = !Subtarget.isUnalignedMem32Slow();
1913 // TODO: What about AVX-512 (512-bit) accesses?
1916 // Misaligned accesses of any size are always allowed.
1920 /// Return the entry encoding for a jump table in the
1921 /// current function. The returned value is a member of the
1922 /// MachineJumpTableInfo::JTEntryKind enum.
1923 unsigned X86TargetLowering::getJumpTableEncoding() const {
1924 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1926 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1927 return MachineJumpTableInfo::EK_Custom32;
1929 // Otherwise, use the normal jump table encoding heuristics.
1930 return TargetLowering::getJumpTableEncoding();
1933 bool X86TargetLowering::useSoftFloat() const {
1934 return Subtarget.useSoftFloat();
1938 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1939 const MachineBasicBlock *MBB,
1940 unsigned uid,MCContext &Ctx) const{
1941 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1942 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF entries.
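// Illustrative example (added; the basic-block label is hypothetical): with
// this encoding a 32-bit PIC jump table contains entries of the form
//   .long .LBB0_2@GOTOFF
// i.e. each target block symbol is emitted relative to the GOT base.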
1944 return MCSymbolRefExpr::create(MBB->getSymbol(),
1945 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1948 /// Returns relocation base for the given PIC jumptable.
1949 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1950 SelectionDAG &DAG) const {
1951 if (!Subtarget.is64Bit())
1952 // This doesn't have SDLoc associated with it, but is not really the
1953 // same as a Register.
1954 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1955 getPointerTy(DAG.getDataLayout()));
1959 /// This returns the relocation base for the given PIC jumptable,
1960 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1961 const MCExpr *X86TargetLowering::
1962 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1963 MCContext &Ctx) const {
1964 // X86-64 uses RIP relative addressing based on the jump table label.
1965 if (Subtarget.isPICStyleRIPRel())
1966 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1968 // Otherwise, the reference is relative to the PIC base.
1969 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1972 std::pair<const TargetRegisterClass *, uint8_t>
1973 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1975 const TargetRegisterClass *RRC = nullptr;
1977 switch (VT.SimpleTy) {
1979 return TargetLowering::findRepresentativeClass(TRI, VT);
1980 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1981 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1984 RRC = &X86::VR64RegClass;
1986 case MVT::f32: case MVT::f64:
1987 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1988 case MVT::v4f32: case MVT::v2f64:
1989 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1990 case MVT::v8f32: case MVT::v4f64:
1991 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1992 case MVT::v16f32: case MVT::v8f64:
1993 RRC = &X86::VR128XRegClass;
1996 return std::make_pair(RRC, Cost);
1999 unsigned X86TargetLowering::getAddressSpace() const {
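// Note (added, hedged): LLVM's x86 convention maps address space 256 to %gs
// and 257 to %fs, so 64-bit user code uses the %fs-based slot while 32-bit
// code and the kernel code model use %gs.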
2000 if (Subtarget.is64Bit())
2001 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2005 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2006 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
2007 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
2008 if (!Subtarget.isTargetGlibc())
2009 return TargetLowering::getIRStackGuard(IRB);
2011 // %fs:0x28 on x86-64 (%gs: under the Kernel code model), or %gs:0x14 on i386.
2013 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2014 unsigned AddressSpace = getAddressSpace();
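// Sketch of the constant built below (added; assuming the 64-bit, non-kernel
// case): roughly
//   inttoptr (i32 40 to i8* addrspace(257)*)
// i.e. a pointer to the glibc stack-guard slot at %fs:0x28.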
2015 return ConstantExpr::getIntToPtr(
2016 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2017 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2020 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2021 // The MSVC CRT provides functionality for stack protection.
2022 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2023 // The MSVC CRT has a global variable holding the security cookie.
2024 M.getOrInsertGlobal("__security_cookie",
2025 Type::getInt8PtrTy(M.getContext()));
2027 // The MSVC CRT has a function to validate the security cookie.
2028 auto *SecurityCheckCookie = cast<Function>(
2029 M.getOrInsertFunction("__security_check_cookie",
2030 Type::getVoidTy(M.getContext()),
2031 Type::getInt8PtrTy(M.getContext()), nullptr));
2032 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2033 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
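// Hedged note: with the fastcall convention plus the InReg attribute the
// cookie argument is expected to land in ECX, matching the MSVC CRT's
//   void __fastcall __security_check_cookie(uintptr_t StackCookie);
// prototype.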
2036 // glibc has a special slot for the stack guard.
2037 if (Subtarget.isTargetGlibc())
2039 TargetLowering::insertSSPDeclarations(M);
2042 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2043 // The MSVC CRT has a global variable holding the security cookie.
2044 if (Subtarget.getTargetTriple().isOSMSVCRT())
2045 return M.getGlobalVariable("__security_cookie");
2046 return TargetLowering::getSDagStackGuard(M);
2049 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2050 // The MSVC CRT has a function to validate the security cookie.
2051 if (Subtarget.getTargetTriple().isOSMSVCRT())
2052 return M.getFunction("__security_check_cookie");
2053 return TargetLowering::getSSPStackGuardCheck(M);
2056 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2057 if (Subtarget.getTargetTriple().isOSContiki())
2058 return getDefaultSafeStackPointerLocation(IRB, false);
2060 if (!Subtarget.isTargetAndroid())
2061 return TargetLowering::getSafeStackPointerLocation(IRB);
2063 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2064 // definition of TLS_SLOT_SAFESTACK in
2065 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2066 unsigned AddressSpace, Offset;
2068 // %fs:0x48 on x86-64 (%gs: under the Kernel code model), or %gs:0x24 on i386.
2070 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2071 AddressSpace = getAddressSpace();
2072 return ConstantExpr::getIntToPtr(
2073 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2074 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2077 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2078 unsigned DestAS) const {
2079 assert(SrcAS != DestAS && "Expected different address spaces!");
2081 return SrcAS < 256 && DestAS < 256;
2084 //===----------------------------------------------------------------------===//
2085 // Return Value Calling Convention Implementation
2086 //===----------------------------------------------------------------------===//
2088 #include "X86GenCallingConv.inc"
2090 bool X86TargetLowering::CanLowerReturn(
2091 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2092 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2093 SmallVector<CCValAssign, 16> RVLocs;
2094 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2095 return CCInfo.CheckReturn(Outs, RetCC_X86);
2098 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2099 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2103 /// Lowers mask values (v*i1) to the local register values.
2104 /// \returns the DAG node after lowering to the register type.
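/// Example (illustrative): a v16i1 mask returned in an i32 location is lowered
/// in two steps, a bitcast v16i1 -> i16 followed by an any-extend i16 -> i32,
/// while a v32i1 mask in an i32 location only needs the bitcast.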
2105 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2106 const SDLoc &Dl, SelectionDAG &DAG) {
2107 EVT ValVT = ValArg.getValueType();
2109 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2110 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2111 // Two-stage lowering might be required:
2112 // bitcast: v8i1 -> i8 / v16i1 -> i16
2113 // anyextend: i8 -> i32 / i16 -> i32
2114 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2115 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2116 if (ValLoc == MVT::i32)
2117 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2119 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2120 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2121 // One-stage lowering is required:
2122 // bitcast: v32i1 -> i32 / v64i1 -> i64
2123 return DAG.getBitcast(ValLoc, ValArg);
2125 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2128 /// Breaks a v64i1 value into two registers and adds the new nodes to the DAG.
2129 static void Passv64i1ArgInRegs(
2130 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2131 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2132 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2133 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2134 "Expected AVX512BW or AVX512BMI target!");
2135 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2136 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2137 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2138 "The value should reside in two registers");
2140 // Before splitting the value we cast it to i64
2141 Arg = DAG.getBitcast(MVT::i64, Arg);
2143 // Split the value into two i32 halves.
2145 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2146 DAG.getConstant(0, Dl, MVT::i32));
2147 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2148 DAG.getConstant(1, Dl, MVT::i32));
2150 // Attach the two i32 halves to their corresponding registers.
2151 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2152 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2156 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2158 const SmallVectorImpl<ISD::OutputArg> &Outs,
2159 const SmallVectorImpl<SDValue> &OutVals,
2160 const SDLoc &dl, SelectionDAG &DAG) const {
2161 MachineFunction &MF = DAG.getMachineFunction();
2162 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2164 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2165 report_fatal_error("X86 interrupts may not return any value");
2167 SmallVector<CCValAssign, 16> RVLocs;
2168 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2169 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2172 SmallVector<SDValue, 6> RetOps;
2173 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2174 // Operand #1 = Bytes To Pop
2175 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2178 // Copy the result values into the output registers.
2179 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2181 CCValAssign &VA = RVLocs[I];
2182 assert(VA.isRegLoc() && "Can only return in registers!");
2183 SDValue ValToCopy = OutVals[OutsIndex];
2184 EVT ValVT = ValToCopy.getValueType();
2186 // Promote values to the appropriate types.
2187 if (VA.getLocInfo() == CCValAssign::SExt)
2188 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2189 else if (VA.getLocInfo() == CCValAssign::ZExt)
2190 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::AExt) {
2192 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2193 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2195 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2197 else if (VA.getLocInfo() == CCValAssign::BCvt)
2198 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2200 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2201 "Unexpected FP-extend for return value.");
2203 // If this is x86-64, and we disabled SSE, we can't return FP values,
2204 // or SSE or MMX vectors.
2205 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2206 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2207 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2208 report_fatal_error("SSE register return with SSE disabled");
2210 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2211 // llvm-gcc has never done it right and no one has noticed, so this
2212 // should be OK for now.
2213 if (ValVT == MVT::f64 &&
2214 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2215 report_fatal_error("SSE2 register return with SSE2 disabled");
2217 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2218 // the RET instruction and handled by the FP Stackifier.
2219 if (VA.getLocReg() == X86::FP0 ||
2220 VA.getLocReg() == X86::FP1) {
2221 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2222 // change the value to the FP stack register class.
2223 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2224 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2225 RetOps.push_back(ValToCopy);
2226 // Don't emit a copytoreg.
2230 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2231 // which is returned in RAX / RDX.
2232 if (Subtarget.is64Bit()) {
2233 if (ValVT == MVT::x86mmx) {
2234 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2235 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2236 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2238 // If we don't have SSE2 available, convert to v4f32 so the generated
2239 // register is legal.
2240 if (!Subtarget.hasSSE2())
2241 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2246 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2248 if (VA.needsCustom()) {
2249 assert(VA.getValVT() == MVT::v64i1 &&
2250 "Currently the only custom case is when we split v64i1 to 2 regs");
2252 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2255 assert(2 == RegsToPass.size() &&
2256 "Expecting two registers after Pass64BitArgInRegs");
2258 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2261 // Add nodes to the DAG and add the values into the RetOps list
2262 for (auto &Reg : RegsToPass) {
2263 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2264 Flag = Chain.getValue(1);
2265 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2269 // Swift calling convention does not require we copy the sret argument
2270 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2272 // All x86 ABIs require that for returning structs by value we copy
2273 // the sret argument into %rax/%eax (depending on ABI) for the return.
2274 // We saved the argument into a virtual register in the entry block,
2275 // so now we copy the value out and into %rax/%eax.
2277 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2278 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2279 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2280 // either case FuncInfo->setSRetReturnReg() will have been called.
2281 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2282 // When we have both sret and another return value, we should use the
2283 // original Chain stored in RetOps[0], instead of the current Chain updated
2284 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2286 // For the case of sret and another return value, we have
2287 // Chain_0 at the function entry
2288 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2289 // If we use Chain_1 in getCopyFromReg, we will have
2290 // Val = getCopyFromReg(Chain_1)
2291 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2293 // getCopyToReg(Chain_0) will be glued together with
2294 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2295 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2296 // Data dependency from Unit B to Unit A due to usage of Val in
2297 // getCopyToReg(Chain_1, Val)
2298 // Chain dependency from Unit A to Unit B
2300 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2301 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2302 getPointerTy(MF.getDataLayout()));
2305 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2306 X86::RAX : X86::EAX;
2307 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2308 Flag = Chain.getValue(1);
2310 // RAX/EAX now acts like a return value.
2312 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2315 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2316 const MCPhysReg *I =
2317 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2320 if (X86::GR64RegClass.contains(*I))
2321 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2327 RetOps[0] = Chain; // Update chain.
2329 // Add the flag if we have it.
2331 RetOps.push_back(Flag);
2333 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2334 if (CallConv == CallingConv::X86_INTR)
2335 opcode = X86ISD::IRET;
2336 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2339 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2340 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2343 SDValue TCChain = Chain;
2344 SDNode *Copy = *N->use_begin();
2345 if (Copy->getOpcode() == ISD::CopyToReg) {
2346 // If the copy has a glue operand, we conservatively assume it isn't safe to
2347 // perform a tail call.
2348 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2350 TCChain = Copy->getOperand(0);
2351 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2354 bool HasRet = false;
2355 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2357 if (UI->getOpcode() != X86ISD::RET_FLAG)
2359 // If we are returning more than one value, we can definitely
2360 // not make a tail call; see PR19530.
2361 if (UI->getNumOperands() > 4)
2363 if (UI->getNumOperands() == 4 &&
2364 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2376 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2377 ISD::NodeType ExtendKind) const {
2378 MVT ReturnMVT = MVT::i32;
2380 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2381 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2382 // The ABI does not require i1, i8 or i16 to be extended.
2384 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2385 // always extending i8/i16 return values, so keep doing that for now.
2387 ReturnMVT = MVT::i8;
2390 EVT MinVT = getRegisterType(Context, ReturnMVT);
2391 return VT.bitsLT(MinVT) ? MinVT : VT;
2394 /// Reads two 32 bit registers and creates a 64 bit mask value.
2395 /// \param VA The current 32 bit value that needs to be assigned.
2396 /// \param NextVA The next 32 bit value that needs to be assigned.
2397 /// \param Root The parent DAG node.
2398 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2399 ///                        glue purposes. If the DAG is already using a
2400 ///                        physical register instead of a virtual one, we
2401 ///                        should glue our new SDValue to the InFlag SDValue.
2402 /// \returns a new 64 bit SDValue.
2403 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2404 SDValue &Root, SelectionDAG &DAG,
2405 const SDLoc &Dl, const X86Subtarget &Subtarget,
2406 SDValue *InFlag = nullptr) {
2407 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2408 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2409 assert(VA.getValVT() == MVT::v64i1 &&
2410 "Expecting first location of 64 bit width type");
2411 assert(NextVA.getValVT() == VA.getValVT() &&
2412 "The locations should have the same type");
2413 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2414 "The values should reside in two registers");
2418 SDValue ArgValueLo, ArgValueHi;
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 const TargetRegisterClass *RC = &X86::GR32RegClass;
2423 // Read a 32 bit value from the registers
2424 if (nullptr == InFlag) {
2425 // When no physical register is present,
2426 // create an intermediate virtual register
2427 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2428 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2429 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2430 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2432 // When a physical register is available read the value from it and glue
2433 // the reads together.
2435 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2436 *InFlag = ArgValueLo.getValue(2);
2438 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2439 *InFlag = ArgValueHi.getValue(2);
2442 // Convert the i32 value to a v32i1 value.
2443 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2445 // Convert the i32 value to a v32i1 value.
2446 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2448 // Concatenate the two values together.
2449 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2452 /// This function lowers a register of various sizes (8/16/32/64)
2453 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1).
2454 /// \returns a DAG node containing the operand after lowering to the mask type.
2455 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2456 const EVT &ValLoc, const SDLoc &Dl,
2457 SelectionDAG &DAG) {
2458 SDValue ValReturned = ValArg;
2460 if (ValVT == MVT::v64i1) {
2461 // On a 32 bit machine, this case is handled by getv64i1Argument.
2462 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2463 // On a 64 bit machine there is no need to truncate the value, only to bitcast it.
2466 switch (ValVT.getSimpleVT().SimpleTy) {
2477 llvm_unreachable("Expecting a vector of i1 types");
2480 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2483 return DAG.getBitcast(ValVT, ValReturned);
2486 /// Lower the result values of a call into the
2487 /// appropriate copies out of appropriate physical registers.
2489 SDValue X86TargetLowering::LowerCallResult(
2490 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2491 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2492 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2494 // Assign locations to each value returned by this call.
2495 SmallVector<CCValAssign, 16> RVLocs;
2496 bool Is64Bit = Subtarget.is64Bit();
2497 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2499 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2501 // Copy all of the result registers out of their specified physreg.
2502 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2504 CCValAssign &VA = RVLocs[I];
2505 EVT CopyVT = VA.getLocVT();
2507 // If this is x86-64, and we disabled SSE, we can't return FP values
2508 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2509 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2510 report_fatal_error("SSE register return with SSE disabled");
2513 // If we prefer to use the value in xmm registers, copy it out as f80 and
2514 // use a truncate to move it from fp stack reg to xmm reg.
2515 bool RoundAfterCopy = false;
2516 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2517 isScalarFPTypeInSSEReg(VA.getValVT())) {
2518 if (!Subtarget.hasX87())
2519 report_fatal_error("X87 register return with X87 disabled");
2521 RoundAfterCopy = (CopyVT != VA.getLocVT());
2525 if (VA.needsCustom()) {
2526 assert(VA.getValVT() == MVT::v64i1 &&
2527 "Currently the only custom case is when we split v64i1 to 2 regs");
2529 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2531 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2533 Val = Chain.getValue(0);
2534 InFlag = Chain.getValue(2);
2538 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2539 // This truncation won't change the value.
2540 DAG.getIntPtrConstant(1, dl));
2542 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2543 if (VA.getValVT().isVector() &&
2544 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2545 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2546 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2547 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2549 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2552 InVals.push_back(Val);
2558 //===----------------------------------------------------------------------===//
2559 // C & StdCall & Fast Calling Convention implementation
2560 //===----------------------------------------------------------------------===//
2561 // The StdCall calling convention is the standard for many Windows API
2562 // routines. It differs from the C calling convention just a little: the
2563 // callee cleans up the stack, not the caller. Symbols are also
2564 // decorated in some fancy way :) It doesn't support any vector arguments.
2565 // For info on fast calling convention see Fast Calling Convention (tail call)
2566 // implementation LowerX86_32FastCCCallTo.
2568 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2570 enum StructReturnType {
2575 static StructReturnType
2576 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2578 return NotStructReturn;
2580 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2581 if (!Flags.isSRet())
2582 return NotStructReturn;
2583 if (Flags.isInReg() || IsMCU)
2584 return RegStructReturn;
2585 return StackStructReturn;
2588 /// Determines whether a function uses struct return semantics.
2589 static StructReturnType
2590 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2592 return NotStructReturn;
2594 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2595 if (!Flags.isSRet())
2596 return NotStructReturn;
2597 if (Flags.isInReg() || IsMCU)
2598 return RegStructReturn;
2599 return StackStructReturn;
2602 /// Make a copy of an aggregate at address specified by "Src" to address
2603 /// "Dst" with size and alignment information specified by the specific
2604 /// parameter attribute. The copy will be passed as a byval function parameter.
2605 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2606 SDValue Chain, ISD::ArgFlagsTy Flags,
2607 SelectionDAG &DAG, const SDLoc &dl) {
2608 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2610 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2611 /*isVolatile*/false, /*AlwaysInline=*/true,
2612 /*isTailCall*/false,
2613 MachinePointerInfo(), MachinePointerInfo());
2616 /// Return true if the calling convention is one that we can guarantee TCO for.
2617 static bool canGuaranteeTCO(CallingConv::ID CC) {
2618 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2619 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2620 CC == CallingConv::HHVM);
2623 /// Return true if we might ever do TCO for calls with this calling convention.
2624 static bool mayTailCallThisCC(CallingConv::ID CC) {
2626 // C calling conventions:
2627 case CallingConv::C:
2628 case CallingConv::X86_64_Win64:
2629 case CallingConv::X86_64_SysV:
2630 // Callee pop conventions:
2631 case CallingConv::X86_ThisCall:
2632 case CallingConv::X86_StdCall:
2633 case CallingConv::X86_VectorCall:
2634 case CallingConv::X86_FastCall:
2637 return canGuaranteeTCO(CC);
2641 /// Return true if the function is being made into a tailcall target by
2642 /// changing its ABI.
2643 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2644 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2647 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2649 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2650 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2654 CallingConv::ID CalleeCC = CS.getCallingConv();
2655 if (!mayTailCallThisCC(CalleeCC))
2662 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2663 const SmallVectorImpl<ISD::InputArg> &Ins,
2664 const SDLoc &dl, SelectionDAG &DAG,
2665 const CCValAssign &VA,
2666 MachineFrameInfo &MFI, unsigned i) const {
2667 // Create the nodes corresponding to a load from this parameter slot.
2668 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2669 bool AlwaysUseMutable = shouldGuaranteeTCO(
2670 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2671 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2674 // If the value is passed by pointer, we have the address passed instead of the
2675 // value itself. No need to extend if the mask value and location share the same bit width.
2677 bool ExtendedInMem =
2678 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2679 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2681 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2682 ValVT = VA.getLocVT();
2684 ValVT = VA.getValVT();
2686 // Calculate the SP offset of an interrupt parameter, re-arranging the slot
2687 // normally taken by a return address.
2689 if (CallConv == CallingConv::X86_INTR) {
2690 const X86Subtarget& Subtarget =
2691 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2692 // X86 interrupts may take one or two arguments.
2693 // On the stack there is no return address as there would be in a regular call.
2694 // The offset of the last argument needs to be set to -4/-8 bytes.
2695 // When there are two arguments, the offset of the first one should be set to 0 bytes.
2696 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
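// Worked example (added for clarity): a 64-bit handler with two incoming
// arguments gets offsets 0 and -8 for the first and second argument
// respectively (0 and -4 on 32-bit), while a single-argument handler gets -8
// (-4 on 32-bit).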
2699 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2700 // changed with more analysis.
2701 // In the case of tail call optimization, mark all arguments mutable, since
2702 // they could be overwritten by the lowering of arguments in case of a tail call.
2703 if (Flags.isByVal()) {
2704 unsigned Bytes = Flags.getByValSize();
2705 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2706 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2707 // Adjust SP offset of interrupt parameter.
2708 if (CallConv == CallingConv::X86_INTR) {
2709 MFI.setObjectOffset(FI, Offset);
2711 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2713 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2714 VA.getLocMemOffset(), isImmutable);
2716 // Set SExt or ZExt flag.
2717 if (VA.getLocInfo() == CCValAssign::ZExt) {
2718 MFI.setObjectZExt(FI, true);
2719 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2720 MFI.setObjectSExt(FI, true);
2723 // Adjust SP offset of interrupt parameter.
2724 if (CallConv == CallingConv::X86_INTR) {
2725 MFI.setObjectOffset(FI, Offset);
2728 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2729 SDValue Val = DAG.getLoad(
2730 ValVT, dl, Chain, FIN,
2731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2732 return ExtendedInMem ?
2733 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2737 // FIXME: Get this from tablegen.
2738 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2739 const X86Subtarget &Subtarget) {
2740 assert(Subtarget.is64Bit());
2742 if (Subtarget.isCallingConvWin64(CallConv)) {
2743 static const MCPhysReg GPR64ArgRegsWin64[] = {
2744 X86::RCX, X86::RDX, X86::R8, X86::R9
2746 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2749 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2750 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2752 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2755 // FIXME: Get this from tablegen.
2756 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2757 CallingConv::ID CallConv,
2758 const X86Subtarget &Subtarget) {
2759 assert(Subtarget.is64Bit());
2760 if (Subtarget.isCallingConvWin64(CallConv)) {
2761 // The XMM registers which might contain var arg parameters are shadowed
2762 // in their paired GPR, so we only need to save the GPRs to their home slots.
2764 // TODO: __vectorcall will change this.
2768 const Function *Fn = MF.getFunction();
2769 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2770 bool isSoftFloat = Subtarget.useSoftFloat();
2771 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2772 "SSE register cannot be used when SSE is disabled!");
2773 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2774 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2778 static const MCPhysReg XMMArgRegs64Bit[] = {
2779 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2780 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2782 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2785 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2786 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2787 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2788 return A.getValNo() < B.getValNo();
2792 SDValue X86TargetLowering::LowerFormalArguments(
2793 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2796 MachineFunction &MF = DAG.getMachineFunction();
2797 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2798 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2800 const Function *Fn = MF.getFunction();
2801 if (Fn->hasExternalLinkage() &&
2802 Subtarget.isTargetCygMing() &&
2803 Fn->getName() == "main")
2804 FuncInfo->setForceFramePointer(true);
2806 MachineFrameInfo &MFI = MF.getFrameInfo();
2807 bool Is64Bit = Subtarget.is64Bit();
2808 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2811 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2812 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2814 if (CallConv == CallingConv::X86_INTR) {
2815 bool isLegal = Ins.size() == 1 ||
2816 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2817 (!Is64Bit && Ins[1].VT == MVT::i32)));
2819 report_fatal_error("X86 interrupts may take one or two arguments");
2822 // Assign locations to all of the incoming arguments.
2823 SmallVector<CCValAssign, 16> ArgLocs;
2824 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2826 // Allocate shadow area for Win64.
2828 CCInfo.AllocateStack(32, 8);
2830 CCInfo.AnalyzeArguments(Ins, CC_X86);
2832 // In vectorcall calling convention a second pass is required for the HVA
2834 if (CallingConv::X86_VectorCall == CallConv) {
2835 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2838 // The next loop assumes that the locations are in the same order as the input arguments.
2840 if (!isSortedByValueNo(ArgLocs))
2841 llvm_unreachable("Argument Location list must be sorted before lowering");
2844 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2846 assert(InsIndex < Ins.size() && "Invalid Ins index");
2847 CCValAssign &VA = ArgLocs[I];
2849 if (VA.isRegLoc()) {
2850 EVT RegVT = VA.getLocVT();
2851 if (VA.needsCustom()) {
2853 VA.getValVT() == MVT::v64i1 &&
2854 "Currently the only custom case is when we split v64i1 to 2 regs");
2856 // v64i1 values, in the regcall calling convention, that are
2857 // compiled for a 32 bit arch, are split up into two registers.
2859 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
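// For illustration: under the 32-bit regcall convention a v64i1 mask argument
// arrives in two consecutive GR32 locations; getv64i1Argument consumes both
// CCValAssigns (hence the ++I) and concatenates the two 32-bit halves back
// into a single v64i1 value.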
2861 const TargetRegisterClass *RC;
2862 if (RegVT == MVT::i32)
2863 RC = &X86::GR32RegClass;
2864 else if (Is64Bit && RegVT == MVT::i64)
2865 RC = &X86::GR64RegClass;
2866 else if (RegVT == MVT::f32)
2867 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2868 else if (RegVT == MVT::f64)
2869 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2870 else if (RegVT == MVT::f80)
2871 RC = &X86::RFP80RegClass;
2872 else if (RegVT == MVT::f128)
2873 RC = &X86::FR128RegClass;
2874 else if (RegVT.is512BitVector())
2875 RC = &X86::VR512RegClass;
2876 else if (RegVT.is256BitVector())
2877 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2878 else if (RegVT.is128BitVector())
2879 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2880 else if (RegVT == MVT::x86mmx)
2881 RC = &X86::VR64RegClass;
2882 else if (RegVT == MVT::i1)
2883 RC = &X86::VK1RegClass;
2884 else if (RegVT == MVT::v8i1)
2885 RC = &X86::VK8RegClass;
2886 else if (RegVT == MVT::v16i1)
2887 RC = &X86::VK16RegClass;
2888 else if (RegVT == MVT::v32i1)
2889 RC = &X86::VK32RegClass;
2890 else if (RegVT == MVT::v64i1)
2891 RC = &X86::VK64RegClass;
2893 llvm_unreachable("Unknown argument type!");
2895 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2896 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2899 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2900 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
2902 if (VA.getLocInfo() == CCValAssign::SExt)
2903 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2904 DAG.getValueType(VA.getValVT()));
2905 else if (VA.getLocInfo() == CCValAssign::ZExt)
2906 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2907 DAG.getValueType(VA.getValVT()));
2908 else if (VA.getLocInfo() == CCValAssign::BCvt)
2909 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2911 if (VA.isExtInLoc()) {
2912 // Handle MMX values passed in XMM regs.
2913 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2914 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2915 else if (VA.getValVT().isVector() &&
2916 VA.getValVT().getScalarType() == MVT::i1 &&
2917 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2918 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2919 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2920 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
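// For illustration: a v16i1 argument promoted into an i32 register arrives as
// a 16-bit bitmask in the low bits; lowerRegToMasks truncates the register
// back down to 16 bits and reinterprets those bits as a v16i1 value.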
2922 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2925 assert(VA.isMemLoc());
2927 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2930 // If value is passed via pointer - do a load.
2931 if (VA.getLocInfo() == CCValAssign::Indirect)
2933 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2935 InVals.push_back(ArgValue);
2938 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
2939 // Swift calling convention does not require we copy the sret argument
2940 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2941 if (CallConv == CallingConv::Swift)
2944 // All x86 ABIs require that for returning structs by value we copy the
2945 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2946 // the argument into a virtual register so that we can access it from the
2948 if (Ins[I].Flags.isSRet()) {
2949 unsigned Reg = FuncInfo->getSRetReturnReg();
2951 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2952 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2953 FuncInfo->setSRetReturnReg(Reg);
2955 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
2956 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2961 unsigned StackSize = CCInfo.getNextStackOffset();
2962 // Align stack specially for tail calls.
2963 if (shouldGuaranteeTCO(CallConv,
2964 MF.getTarget().Options.GuaranteedTailCallOpt))
2965 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2967 // If the function takes variable number of arguments, make a frame index for
2968 // the start of the first vararg value... for expansion of llvm.va_start. We
2969 // can skip this if there are no va_start calls.
2970 if (MFI.hasVAStart() &&
2971 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2972 CallConv != CallingConv::X86_ThisCall))) {
2973 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
2976 // Figure out if XMM registers are in use.
2977 assert(!(Subtarget.useSoftFloat() &&
2978 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2979 "SSE register cannot be used when SSE is disabled!");
2981 // 64-bit calling conventions support varargs and register parameters, so we
2982 // have to do extra work to spill them in the prologue.
2983 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
2984 // Find the first unallocated argument registers.
2985 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2986 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2987 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2988 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2989 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2990 "SSE register cannot be used when SSE is disabled!");
2992 // Gather all the live in physical registers.
2993 SmallVector<SDValue, 6> LiveGPRs;
2994 SmallVector<SDValue, 8> LiveXMMRegs;
2996 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2997 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2999 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3001 if (!ArgXMMs.empty()) {
3002 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3003 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3004 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3005 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3006 LiveXMMRegs.push_back(
3007 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3012 // Get to the caller-allocated home save location. Add 8 to account
3013 // for the return address.
3014 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3015 FuncInfo->setRegSaveFrameIndex(
3016 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
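// For illustration: the Win64 caller always reserves a 32-byte shadow area
// (home slots for RCX, RDX, R8 and R9) just above the return address. If,
// say, two integer registers were consumed by fixed arguments
// (NumIntRegs == 2), the vararg register save area starts at the R8 home
// slot, 16 bytes into that shadow area.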
3017 // Fixup to set vararg frame on shadow area (4 x i64).
3019 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3021 // For X86-64, if there are vararg parameters that are passed via
3022 // registers, then we must store them to their spots on the stack so
3023 // they may be loaded by dereferencing the result of va_next.
3024 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3025 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3026 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3027 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3030 // Store the integer parameter registers.
3031 SmallVector<SDValue, 8> MemOps;
3032 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3033 getPointerTy(DAG.getDataLayout()));
3034 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3035 for (SDValue Val : LiveGPRs) {
3036 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3037 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3039 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3040 MachinePointerInfo::getFixedStack(
3041 DAG.getMachineFunction(),
3042 FuncInfo->getRegSaveFrameIndex(), Offset));
3043 MemOps.push_back(Store);
3047 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3048 // Now store the XMM (fp + vector) parameter registers.
3049 SmallVector<SDValue, 12> SaveXMMOps;
3050 SaveXMMOps.push_back(Chain);
3051 SaveXMMOps.push_back(ALVal);
3052 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3053 FuncInfo->getRegSaveFrameIndex(), dl));
3054 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3055 FuncInfo->getVarArgsFPOffset(), dl));
3056 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3058 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3059 MVT::Other, SaveXMMOps));
3062 if (!MemOps.empty())
3063 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3066 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3067 // Find the largest legal vector type.
3068 MVT VecVT = MVT::Other;
3069 // FIXME: Only some x86_32 calling conventions support AVX512.
3070 if (Subtarget.hasAVX512() &&
3071 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3072 CallConv == CallingConv::Intel_OCL_BI)))
3073 VecVT = MVT::v16f32;
3074 else if (Subtarget.hasAVX())
3076 else if (Subtarget.hasSSE2())
3079 // We forward some GPRs and some vector types.
3080 SmallVector<MVT, 2> RegParmTypes;
3081 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3082 RegParmTypes.push_back(IntVT);
3083 if (VecVT != MVT::Other)
3084 RegParmTypes.push_back(VecVT);
3086 // Compute the set of forwarded registers. The rest are scratch.
3087 SmallVectorImpl<ForwardedRegister> &Forwards =
3088 FuncInfo->getForwardedMustTailRegParms();
3089 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3091 // Conservatively forward AL on x86_64, since it might be used for varargs.
3092 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3093 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3094 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3097 // Copy all forwards from physical to virtual registers.
3098 for (ForwardedRegister &F : Forwards) {
3099 // FIXME: Can we use a less constrained schedule?
3100 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3101 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3102 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3106 // Some CCs need callee pop.
3107 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3108 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3109 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3110 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3111 // X86 interrupts must pop the error code if present
3112 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
3114 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3115 // If this is an sret function, the return should pop the hidden pointer.
3116 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3117 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3118 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3119 FuncInfo->setBytesToPopOnReturn(4);
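// For illustration: on such targets a 32-bit function returning a struct via
// a hidden sret pointer ends with "ret $4", so the callee, not the caller,
// pops that hidden pointer.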
3123 // RegSaveFrameIndex is X86-64 only.
3124 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3125 if (CallConv == CallingConv::X86_FastCall ||
3126 CallConv == CallingConv::X86_ThisCall)
3127 // fastcc functions can't have varargs.
3128 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3131 FuncInfo->setArgumentStackSize(StackSize);
3133 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3134 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3135 if (Personality == EHPersonality::CoreCLR) {
3137 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3138 // that we'd prefer this slot be allocated towards the bottom of the frame
3139 // (i.e. near the stack pointer after allocating the frame). Every
3140 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3141 // offset from the bottom of this and each funclet's frame must be the
3142 // same, so the size of funclets' (mostly empty) frames is dictated by
3143 // how far this slot is from the bottom (since they allocate just enough
3144 // space to accommodate holding this slot at the correct offset).
3145 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3146 EHInfo->PSPSymFrameIdx = PSPSymFI;
3153 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3154 SDValue Arg, const SDLoc &dl,
3156 const CCValAssign &VA,
3157 ISD::ArgFlagsTy Flags) const {
3158 unsigned LocMemOffset = VA.getLocMemOffset();
3159 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3160 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3162 if (Flags.isByVal())
3163 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3165 return DAG.getStore(
3166 Chain, dl, Arg, PtrOff,
3167 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3170 /// Emit a load of return address if tail call
3171 /// optimization is performed and it is required.
3172 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3173 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3174 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3175 // Adjust the Return address stack slot.
3176 EVT VT = getPointerTy(DAG.getDataLayout());
3177 OutRetAddr = getReturnAddressFrameIndex(DAG);
3179 // Load the "old" Return address.
3180 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3181 return SDValue(OutRetAddr.getNode(), 1);
3184 /// Emit a store of the return address if tail call
3185 /// optimization is performed and it is required (FPDiff!=0).
3186 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3187 SDValue Chain, SDValue RetAddrFrIdx,
3188 EVT PtrVT, unsigned SlotSize,
3189 int FPDiff, const SDLoc &dl) {
3190 // Store the return address to the appropriate stack slot.
3191 if (!FPDiff) return Chain;
3192 // Calculate the new stack slot for the return address.
3193 int NewReturnAddrFI =
3194 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3196 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3197 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3198 MachinePointerInfo::getFixedStack(
3199 DAG.getMachineFunction(), NewReturnAddrFI));
3203 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3204 /// operation of specified width.
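/// For illustration: for VT == v4f32 the mask produced below is {4, 1, 2, 3},
/// i.e. lane 0 comes from V2 and lanes 1-3 from V1, matching the MOVSS
/// behaviour of merging the low scalar into the destination vector.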
3205 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3207 unsigned NumElems = VT.getVectorNumElements();
3208 SmallVector<int, 8> Mask;
3209 Mask.push_back(NumElems);
3210 for (unsigned i = 1; i != NumElems; ++i)
3212 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3216 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3217 SmallVectorImpl<SDValue> &InVals) const {
3218 SelectionDAG &DAG = CLI.DAG;
3220 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3221 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3222 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3223 SDValue Chain = CLI.Chain;
3224 SDValue Callee = CLI.Callee;
3225 CallingConv::ID CallConv = CLI.CallConv;
3226 bool &isTailCall = CLI.IsTailCall;
3227 bool isVarArg = CLI.IsVarArg;
3229 MachineFunction &MF = DAG.getMachineFunction();
3230 bool Is64Bit = Subtarget.is64Bit();
3231 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3232 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3233 bool IsSibcall = false;
3234 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3235 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3237 if (CallConv == CallingConv::X86_INTR)
3238 report_fatal_error("X86 interrupts may not be called directly");
3240 if (Attr.getValueAsString() == "true")
3243 if (Subtarget.isPICStyleGOT() &&
3244 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3245 // If we are using a GOT, disable tail calls to external symbols with
3246 // default visibility. Tail calling such a symbol requires using a GOT
3247 // relocation, which forces early binding of the symbol. This breaks code
3248 // that requires lazy function symbol resolution. Using musttail or
3249 // GuaranteedTailCallOpt will override this.
3250 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3251 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3252 G->getGlobal()->hasDefaultVisibility()))
3256 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3258 // Force this to be a tail call. The verifier rules are enough to ensure
3259 // that we can lower this successfully without moving the return address
3262 } else if (isTailCall) {
3263 // Check if it's really possible to do a tail call.
3264 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3265 isVarArg, SR != NotStructReturn,
3266 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3267 Outs, OutVals, Ins, DAG);
3269 // Sibcalls are automatically detected tailcalls which do not require ABI changes.
3271 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3278 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3279 "Var args not supported with calling convention fastcc, ghc or hipe");
3281 // Analyze operands of the call, assigning locations to each operand.
3282 SmallVector<CCValAssign, 16> ArgLocs;
3283 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3285 // Allocate shadow area for Win64.
3287 CCInfo.AllocateStack(32, 8);
3289 CCInfo.AnalyzeArguments(Outs, CC_X86);
3291 // In vectorcall calling convention a second pass is required for the HVA
3293 if (CallingConv::X86_VectorCall == CallConv) {
3294 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3297 // Get a count of how many bytes are to be pushed on the stack.
3298 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3300 // This is a sibcall. The memory operands are already available in the
3301 // caller's own caller's stack.
3303 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3304 canGuaranteeTCO(CallConv))
3305 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3308 if (isTailCall && !IsSibcall && !IsMustTail) {
3309 // Lower arguments at fp - stackoffset + fpdiff.
3310 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3312 FPDiff = NumBytesCallerPushed - NumBytes;
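// For illustration: if this caller pops 16 bytes of its own incoming
// arguments on return but the tail-called function needs 32 bytes of
// outgoing arguments, FPDiff = 16 - 32 = -16, so the return address slot
// must be relocated to make room for the extra 16 bytes of arguments.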
3314 // Set the delta of movement of the returnaddr stackslot.
3315 // But only set if delta is greater than previous delta.
3316 if (FPDiff < X86Info->getTCReturnAddrDelta())
3317 X86Info->setTCReturnAddrDelta(FPDiff);
3320 unsigned NumBytesToPush = NumBytes;
3321 unsigned NumBytesToPop = NumBytes;
3323 // If we have an inalloca argument, all stack space has already been allocated
3324 // for us and will be right at the top of the stack. We don't support multiple
3325 // arguments passed in memory when using inalloca.
3326 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3328 if (!ArgLocs.back().isMemLoc())
3329 report_fatal_error("cannot use inalloca attribute on a register "
3331 if (ArgLocs.back().getLocMemOffset() != 0)
3332 report_fatal_error("any parameter with the inalloca attribute must be "
3333 "the only memory argument");
3337 Chain = DAG.getCALLSEQ_START(
3338 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3340 SDValue RetAddrFrIdx;
3341 // Load return address for tail calls.
3342 if (isTailCall && FPDiff)
3343 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3344 Is64Bit, FPDiff, dl);
3346 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3347 SmallVector<SDValue, 8> MemOpChains;
3350 // The next loop assumes that the locations are in the same order as the input arguments.
3352 if (!isSortedByValueNo(ArgLocs))
3353 llvm_unreachable("Argument Location list must be sorted before lowering");
3355 // Walk the register/memloc assignments, inserting copies/loads. In the case
3356 // of tail call optimization arguments are handled later.
3357 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3358 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3360 assert(OutIndex < Outs.size() && "Invalid Out index");
3361 // Skip inalloca arguments, they have already been written.
3362 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3363 if (Flags.isInAlloca())
3366 CCValAssign &VA = ArgLocs[I];
3367 EVT RegVT = VA.getLocVT();
3368 SDValue Arg = OutVals[OutIndex];
3369 bool isByVal = Flags.isByVal();
3371 // Promote the value if needed.
3372 switch (VA.getLocInfo()) {
3373 default: llvm_unreachable("Unknown loc info!");
3374 case CCValAssign::Full: break;
3375 case CCValAssign::SExt:
3376 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3378 case CCValAssign::ZExt:
3379 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3381 case CCValAssign::AExt:
3382 if (Arg.getValueType().isVector() &&
3383 Arg.getValueType().getVectorElementType() == MVT::i1)
3384 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3385 else if (RegVT.is128BitVector()) {
3386 // Special case: passing MMX values in XMM registers.
3387 Arg = DAG.getBitcast(MVT::i64, Arg);
3388 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3389 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3391 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3393 case CCValAssign::BCvt:
3394 Arg = DAG.getBitcast(RegVT, Arg);
3396 case CCValAssign::Indirect: {
3397 // Store the argument.
3398 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3399 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3400 Chain = DAG.getStore(
3401 Chain, dl, Arg, SpillSlot,
3402 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3408 if (VA.needsCustom()) {
3409 assert(VA.getValVT() == MVT::v64i1 &&
3410 "Currently the only custom case is when we split v64i1 to 2 regs");
3411 // Split v64i1 value into two registers
3412 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3414 } else if (VA.isRegLoc()) {
3415 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3416 if (isVarArg && IsWin64) {
3417 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3418 // shadow reg if callee is a varargs function.
3419 unsigned ShadowReg = 0;
3420 switch (VA.getLocReg()) {
3421 case X86::XMM0: ShadowReg = X86::RCX; break;
3422 case X86::XMM1: ShadowReg = X86::RDX; break;
3423 case X86::XMM2: ShadowReg = X86::R8; break;
3424 case X86::XMM3: ShadowReg = X86::R9; break;
3427 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3429 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3430 assert(VA.isMemLoc());
3431 if (!StackPtr.getNode())
3432 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3433 getPointerTy(DAG.getDataLayout()));
3434 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3435 dl, DAG, VA, Flags));
3439 if (!MemOpChains.empty())
3440 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3442 if (Subtarget.isPICStyleGOT()) {
3443 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3446 RegsToPass.push_back(std::make_pair(
3447 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3448 getPointerTy(DAG.getDataLayout()))));
3450 // If we are tail calling and generating PIC/GOT style code load the
3451 // address of the callee into ECX. The value in ecx is used as target of
3452 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3453 // for tail calls on PIC/GOT architectures. Normally we would just put the
3454 // address of GOT into ebx and then call target@PLT. But for tail calls
3455 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3458 // Note: The actual moving to ECX is done further down.
3459 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3460 if (G && !G->getGlobal()->hasLocalLinkage() &&
3461 G->getGlobal()->hasDefaultVisibility())
3462 Callee = LowerGlobalAddress(Callee, DAG);
3463 else if (isa<ExternalSymbolSDNode>(Callee))
3464 Callee = LowerExternalSymbol(Callee, DAG);
3468 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3469 // From AMD64 ABI document:
3470 // For calls that may call functions that use varargs or stdargs
3471 // (prototype-less calls or calls to functions containing ellipsis (...) in
3472 // the declaration) %al is used as hidden argument to specify the number
3473 // of SSE registers used. The contents of %al do not need to match exactly
3474 // the number of registers, but must be an upper bound on the number of SSE
3475 // registers used and is in the range 0 - 8 inclusive.
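// For illustration: a variadic call that passes two doubles in XMM0/XMM1 may
// set %al to 2; setting it to 8 would also be conforming, since only an
// upper bound is required.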
3477 // Count the number of XMM registers allocated.
3478 static const MCPhysReg XMMArgRegs[] = {
3479 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3480 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3482 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3483 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3484 && "SSE registers cannot be used when SSE is disabled");
3486 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3487 DAG.getConstant(NumXMMRegs, dl,
3491 if (isVarArg && IsMustTail) {
3492 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3493 for (const auto &F : Forwards) {
3494 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3495 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3499 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3500 // don't need this because the eligibility check rejects calls that require
3501 // shuffling arguments passed in memory.
3502 if (!IsSibcall && isTailCall) {
3503 // Force all the incoming stack arguments to be loaded from the stack
3504 // before any new outgoing arguments are stored to the stack, because the
3505 // outgoing stack slots may alias the incoming argument stack slots, and
3506 // the alias isn't otherwise explicit. This is slightly more conservative
3507 // than necessary, because it means that each store effectively depends
3508 // on every argument instead of just those arguments it would clobber.
3509 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3511 SmallVector<SDValue, 8> MemOpChains2;
3514 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3516 CCValAssign &VA = ArgLocs[I];
3518 if (VA.isRegLoc()) {
3519 if (VA.needsCustom()) {
3520 assert((CallConv == CallingConv::X86_RegCall) &&
3521 "Expecting custome case only in regcall calling convention");
3522 // This means that we are in a special case where one argument was
3523 // passed through two register locations - skip the next location.
3530 assert(VA.isMemLoc());
3531 SDValue Arg = OutVals[OutsIndex];
3532 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3533 // Skip inalloca arguments. They don't require any work.
3534 if (Flags.isInAlloca())
3536 // Create frame index.
3537 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3538 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3539 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3540 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3542 if (Flags.isByVal()) {
3543 // Copy relative to framepointer.
3544 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3545 if (!StackPtr.getNode())
3546 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3547 getPointerTy(DAG.getDataLayout()));
3548 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3551 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3555 // Store relative to framepointer.
3556 MemOpChains2.push_back(DAG.getStore(
3557 ArgChain, dl, Arg, FIN,
3558 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3562 if (!MemOpChains2.empty())
3563 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3565 // Store the return address to the appropriate stack slot.
3566 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3567 getPointerTy(DAG.getDataLayout()),
3568 RegInfo->getSlotSize(), FPDiff, dl);
3571 // Build a sequence of copy-to-reg nodes chained together with token chain
3572 // and flag operands which copy the outgoing args into registers.
3574 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3575 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3576 RegsToPass[i].second, InFlag);
3577 InFlag = Chain.getValue(1);
3580 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3581 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3582 // In the 64-bit large code model, we have to make all calls
3583 // through a register, since the call instruction's 32-bit
3584 // pc-relative offset may not be large enough to hold the whole
3586 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3587 // If the callee is a GlobalAddress node (quite common, every direct call
3588 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3590 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3592 // We should use extra load for direct calls to dllimported functions in
3594 const GlobalValue *GV = G->getGlobal();
3595 if (!GV->hasDLLImportStorageClass()) {
3596 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3598 Callee = DAG.getTargetGlobalAddress(
3599 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3601 if (OpFlags == X86II::MO_GOTPCREL) {
3603 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3604 getPointerTy(DAG.getDataLayout()), Callee);
3605 // Add extra indirection
3606 Callee = DAG.getLoad(
3607 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3608 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
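// For illustration: with MO_GOTPCREL the emitted sequence is roughly
// "movq callee@GOTPCREL(%rip), %reg" followed by an indirect call through
// %reg (the load may also fold into the call), i.e. the callee address is
// read from the GOT rather than encoded as a direct pc-relative call.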
3611 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3612 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3613 unsigned char OpFlags =
3614 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3616 Callee = DAG.getTargetExternalSymbol(
3617 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3618 } else if (Subtarget.isTarget64BitILP32() &&
3619 Callee->getValueType(0) == MVT::i32) {
3620 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI
3621 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3624 // Returns a chain & a flag for retval copy to use.
3625 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3626 SmallVector<SDValue, 8> Ops;
3628 if (!IsSibcall && isTailCall) {
3629 Chain = DAG.getCALLSEQ_END(Chain,
3630 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3631 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3632 InFlag = Chain.getValue(1);
3635 Ops.push_back(Chain);
3636 Ops.push_back(Callee);
3639 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3641 // Add argument registers to the end of the list so that they are known live
3643 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3644 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3645 RegsToPass[i].second.getValueType()));
3647 // Add a register mask operand representing the call-preserved registers.
3648 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3649 assert(Mask && "Missing call preserved mask for calling convention");
3651 // If this is an invoke in a 32-bit function using a funclet-based
3652 // personality, assume the function clobbers all registers. If an exception
3653 // is thrown, the runtime will not restore CSRs.
3654 // FIXME: Model this more precisely so that we can register allocate across
3655 // the normal edge and spill and fill across the exceptional edge.
3656 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3657 const Function *CallerFn = MF.getFunction();
3658 EHPersonality Pers =
3659 CallerFn->hasPersonalityFn()
3660 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3661 : EHPersonality::Unknown;
3662 if (isFuncletEHPersonality(Pers))
3663 Mask = RegInfo->getNoPreservedMask();
3666 Ops.push_back(DAG.getRegisterMask(Mask));
3668 if (InFlag.getNode())
3669 Ops.push_back(InFlag);
3673 //// If this is the first return lowered for this function, add the regs
3674 //// to the liveout set for the function.
3675 // This isn't right, although it's probably harmless on x86; liveouts
3676 // should be computed from returns not tail calls. Consider a void
3677 // function making a tail call to a function returning int.
3678 MF.getFrameInfo().setHasTailCall();
3679 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3682 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3683 InFlag = Chain.getValue(1);
3685 // Create the CALLSEQ_END node.
3686 unsigned NumBytesForCalleeToPop;
3687 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3688 DAG.getTarget().Options.GuaranteedTailCallOpt))
3689 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3690 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3691 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3692 SR == StackStructReturn)
3693 // If this is a call to a struct-return function, the callee
3694 // pops the hidden struct pointer, so we have to push it back.
3695 // This is common for Darwin/X86, Linux & Mingw32 targets.
3696 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3697 NumBytesForCalleeToPop = 4;
3699 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3701 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3702 // No need to reset the stack after the call if the call doesn't return. To
3703 // keep the MI verifier happy, we'll pretend the callee does it for us.
3704 NumBytesForCalleeToPop = NumBytes;
3707 // Returns a flag for retval copy to use.
3709 Chain = DAG.getCALLSEQ_END(Chain,
3710 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3711 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3714 InFlag = Chain.getValue(1);
3717 // Handle result values, copying them out of physregs into vregs that we
3719 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3720 Ins, dl, DAG, InVals);
3723 //===----------------------------------------------------------------------===//
3724 // Fast Calling Convention (tail call) implementation
3725 //===----------------------------------------------------------------------===//
3727 //  Like the stdcall convention, the callee cleans up the arguments, except that ECX
3728 //  is reserved for storing the address of the tail called function. Only 2 registers
3729 //  are free for argument passing (inreg). Tail call optimization is performed provided:
3731 // * tailcallopt is enabled
3732 // * caller/callee are fastcc
3733 // On X86_64 architecture with GOT-style position independent code only local
3734 // (within module) calls are supported at the moment.
3735 // To keep the stack aligned according to the platform ABI, the function
3736 // GetAlignedArgumentStackSize ensures that the argument delta is always a multiple
3737 // of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3738 // If a tail called function callee has more arguments than the caller the
3739 // caller needs to make sure that there is room to move the RETADDR to. This is
3740 // achieved by reserving an area the size of the argument delta right after the
3741 // original RETADDR, but before the saved framepointer or the spilled registers
3742 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4)
3754 /// Make the stack size aligned, e.g. 16n + 12 aligned for a 16-byte alignment requirement.
3757 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3758 SelectionDAG &DAG) const {
3759 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3760 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3761 unsigned StackAlignment = TFI.getStackAlignment();
3762 uint64_t AlignMask = StackAlignment - 1;
3763 int64_t Offset = StackSize;
3764 unsigned SlotSize = RegInfo->getSlotSize();
3765 if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
3766 // Number smaller than 12 so just add the difference.
3767 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3769 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3770 Offset = ((~AlignMask) & Offset) + StackAlignment +
3771 (StackAlignment-SlotSize);
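// For illustration, with StackAlignment == 16 and SlotSize == 4: a StackSize
// of 20 has (20 & 15) == 4 <= 12, so it becomes 20 + (12 - 4) == 28 (16n+12);
// a StackSize of 30 has (30 & 15) == 14 > 12, so it becomes
// (30 & ~15) + 16 + 12 == 44 (also 16n+12).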
3776 /// Return true if the given stack call argument is already available in the
3777 /// same position (relatively) of the caller's incoming argument stack.
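/// For illustration: if the caller received an i32 at incoming stack offset 4
/// and the tail call passes that same, unmodified value at outgoing offset 4,
/// no store is needed and the slot can simply be reused.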
3779 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3780 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3781 const X86InstrInfo *TII, const CCValAssign &VA) {
3782 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3785 // Look through nodes that don't alter the bits of the incoming value.
3786 unsigned Op = Arg.getOpcode();
3787 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3788 Arg = Arg.getOperand(0);
3791 if (Op == ISD::TRUNCATE) {
3792 const SDValue &TruncInput = Arg.getOperand(0);
3793 if (TruncInput.getOpcode() == ISD::AssertZext &&
3794 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3795 Arg.getValueType()) {
3796 Arg = TruncInput.getOperand(0);
3804 if (Arg.getOpcode() == ISD::CopyFromReg) {
3805 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3806 if (!TargetRegisterInfo::isVirtualRegister(VR))
3808 MachineInstr *Def = MRI->getVRegDef(VR);
3811 if (!Flags.isByVal()) {
3812 if (!TII->isLoadFromStackSlot(*Def, FI))
3815 unsigned Opcode = Def->getOpcode();
3816 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3817 Opcode == X86::LEA64_32r) &&
3818 Def->getOperand(1).isFI()) {
3819 FI = Def->getOperand(1).getIndex();
3820 Bytes = Flags.getByValSize();
3824 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3825 if (Flags.isByVal())
3826 // ByVal argument is passed in as a pointer but it's now being
3827 // dereferenced. e.g.
3828 // define @foo(%struct.X* %A) {
3829 // tail call @bar(%struct.X* byval %A)
3832 SDValue Ptr = Ld->getBasePtr();
3833 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3836 FI = FINode->getIndex();
3837 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3838 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3839 FI = FINode->getIndex();
3840 Bytes = Flags.getByValSize();
3844 assert(FI != INT_MAX);
3845 if (!MFI.isFixedObjectIndex(FI))
3848 if (Offset != MFI.getObjectOffset(FI))
3851 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3852 // If the argument location is wider than the argument type, check that any
3853 // extension flags match.
3854 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3855 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3860 return Bytes == MFI.getObjectSize(FI);
3863 /// Check whether the call is eligible for tail call optimization. Targets
3864 /// that want to do tail call optimization should implement this function.
3865 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3866 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3867 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3868 const SmallVectorImpl<ISD::OutputArg> &Outs,
3869 const SmallVectorImpl<SDValue> &OutVals,
3870 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3871 if (!mayTailCallThisCC(CalleeCC))
3874 // If -tailcallopt is specified, make fastcc functions tail-callable.
3875 MachineFunction &MF = DAG.getMachineFunction();
3876 const Function *CallerF = MF.getFunction();
3878 // If the function return type is x86_fp80 and the callee return type is not,
3879 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3880 // perform a tailcall optimization here.
3881 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3884 CallingConv::ID CallerCC = CallerF->getCallingConv();
3885 bool CCMatch = CallerCC == CalleeCC;
3886 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3887 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3889 // Win64 functions have extra shadow space for argument homing. Don't do the
3890 // sibcall if the caller and callee have mismatched expectations for this space.
3892 if (IsCalleeWin64 != IsCallerWin64)
3895 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3896 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3901 // Look for obvious safe cases to perform tail call optimization that do not
3902 // require ABI changes. This is what gcc calls sibcall.
3904 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3905 // emit a special epilogue.
3906 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3907 if (RegInfo->needsStackRealignment(MF))
3910 // Also avoid sibcall optimization if either caller or callee uses struct
3911 // return semantics.
3912 if (isCalleeStructRet || isCallerStructRet)
3915 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
3917 LLVMContext &C = *DAG.getContext();
3918 if (isVarArg && !Outs.empty()) {
3919 // Optimizing for varargs on Win64 is unlikely to be safe without
3920 // additional testing.
3921 if (IsCalleeWin64 || IsCallerWin64)
3924 SmallVector<CCValAssign, 16> ArgLocs;
3925 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3927 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3928 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3929 if (!ArgLocs[i].isRegLoc())
3933 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3934 // stack. Therefore, if it's not used by the call it is not safe to optimize
3935 // this into a sibcall.
3936 bool Unused = false;
3937 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3944 SmallVector<CCValAssign, 16> RVLocs;
3945 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3946 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3947 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3948 CCValAssign &VA = RVLocs[i];
3949 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3954 // Check that the call results are passed in the same way.
3955 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3956 RetCC_X86, RetCC_X86))
3958 // The callee has to preserve all registers the caller needs to preserve.
3959 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3960 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3962 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3963 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3967 unsigned StackArgsSize = 0;
3969 // If the callee takes no arguments then go on to check the results of the call.
3971 if (!Outs.empty()) {
3972 // Check if stack adjustment is needed. For now, do not do this if any
3973 // argument is passed on the stack.
3974 SmallVector<CCValAssign, 16> ArgLocs;
3975 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3977 // Allocate shadow area for Win64
3979 CCInfo.AllocateStack(32, 8);
3981 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3982 StackArgsSize = CCInfo.getNextStackOffset();
3984 if (CCInfo.getNextStackOffset()) {
3985 // Check if the arguments are already laid out in the right way as
3986 // the caller's fixed stack objects.
3987 MachineFrameInfo &MFI = MF.getFrameInfo();
3988 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3989 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3990 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3991 CCValAssign &VA = ArgLocs[i];
3992 SDValue Arg = OutVals[i];
3993 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3994 if (VA.getLocInfo() == CCValAssign::Indirect)
3996 if (!VA.isRegLoc()) {
3997 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4004 bool PositionIndependent = isPositionIndependent();
4005 // If the tailcall address may be in a register, then make sure it's
4006 // possible to register allocate for it. In 32-bit, the call address can
4007 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4008 // callee-saved registers are restored. These happen to be the same
4009 // registers used to pass 'inreg' arguments so watch out for those.
4010 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4011 !isa<ExternalSymbolSDNode>(Callee)) ||
4012 PositionIndependent)) {
4013 unsigned NumInRegs = 0;
4014 // In PIC we need an extra register to formulate the address computation for the callee.
4016 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4018 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4019 CCValAssign &VA = ArgLocs[i];
4022 unsigned Reg = VA.getLocReg();
4025 case X86::EAX: case X86::EDX: case X86::ECX:
4026 if (++NumInRegs == MaxInRegs)
4033 const MachineRegisterInfo &MRI = MF.getRegInfo();
4034 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4038 bool CalleeWillPop =
4039 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4040 MF.getTarget().Options.GuaranteedTailCallOpt);
4042 if (unsigned BytesToPop =
4043 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4044 // If we have bytes to pop, the callee must pop them.
4045 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4046 if (!CalleePopMatches)
4048 } else if (CalleeWillPop && StackArgsSize > 0) {
4049 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4057 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4058 const TargetLibraryInfo *libInfo) const {
4059 return X86::createFastISel(funcInfo, libInfo);
4062 //===----------------------------------------------------------------------===//
4063 // Other Lowering Hooks
4064 //===----------------------------------------------------------------------===//
4066 static bool MayFoldLoad(SDValue Op) {
4067 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4070 static bool MayFoldIntoStore(SDValue Op) {
4071 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4074 static bool MayFoldIntoZeroExtend(SDValue Op) {
4075 if (Op.hasOneUse()) {
4076 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4077 return (ISD::ZERO_EXTEND == Opcode);
4082 static bool isTargetShuffle(unsigned Opcode) {
4084 default: return false;
4085 case X86ISD::BLENDI:
4086 case X86ISD::PSHUFB:
4087 case X86ISD::PSHUFD:
4088 case X86ISD::PSHUFHW:
4089 case X86ISD::PSHUFLW:
4091 case X86ISD::INSERTPS:
4092 case X86ISD::PALIGNR:
4093 case X86ISD::VSHLDQ:
4094 case X86ISD::VSRLDQ:
4095 case X86ISD::MOVLHPS:
4096 case X86ISD::MOVLHPD:
4097 case X86ISD::MOVHLPS:
4098 case X86ISD::MOVLPS:
4099 case X86ISD::MOVLPD:
4100 case X86ISD::MOVSHDUP:
4101 case X86ISD::MOVSLDUP:
4102 case X86ISD::MOVDDUP:
4105 case X86ISD::UNPCKL:
4106 case X86ISD::UNPCKH:
4107 case X86ISD::VBROADCAST:
4108 case X86ISD::VPERMILPI:
4109 case X86ISD::VPERMILPV:
4110 case X86ISD::VPERM2X128:
4111 case X86ISD::VPERMIL2:
4112 case X86ISD::VPERMI:
4113 case X86ISD::VPPERM:
4114 case X86ISD::VPERMV:
4115 case X86ISD::VPERMV3:
4116 case X86ISD::VPERMIV3:
4117 case X86ISD::VZEXT_MOVL:
4122 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4124 default: return false;
4126 case X86ISD::PSHUFB:
4127 case X86ISD::VPERMILPV:
4128 case X86ISD::VPERMIL2:
4129 case X86ISD::VPPERM:
4130 case X86ISD::VPERMV:
4131 case X86ISD::VPERMV3:
4132 case X86ISD::VPERMIV3:
4134 // 'Faux' Target Shuffles.
4140 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4141 MachineFunction &MF = DAG.getMachineFunction();
4142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4143 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4144 int ReturnAddrIndex = FuncInfo->getRAIndex();
4146 if (ReturnAddrIndex == 0) {
4147 // Set up a frame object for the return address.
4148 unsigned SlotSize = RegInfo->getSlotSize();
4149 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4152 FuncInfo->setRAIndex(ReturnAddrIndex);
4155 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4158 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4159 bool hasSymbolicDisplacement) {
4160 // Offset should fit into 32 bit immediate field.
4161 if (!isInt<32>(Offset))
4164 // If we don't have a symbolic displacement - we don't have any extra restrictions.
4166 if (!hasSymbolicDisplacement)
4169 // FIXME: Some tweaks might be needed for medium code model.
4170 if (M != CodeModel::Small && M != CodeModel::Kernel)
4173 // For the small code model we assume that the latest object is 16MB before the end
4174 // of the 31-bit boundary. We may also accept fairly large negative constants, knowing
4175 // that all objects are in the positive half of the address space.
4176 if (M == CodeModel::Small && Offset < 16*1024*1024)
4179 // For the kernel code model we know that all objects reside in the negative half
4180 // of the 32-bit address space. We may not accept negative offsets, since they may
4181 // fall just outside that range, but we may accept fairly large positive ones.
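// For illustration: with a symbolic displacement, a small-code-model offset
// of -1 is still accepted (objects live in the positive half), while under
// the kernel code model only non-negative offsets such as 8MB are accepted.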
4182 if (M == CodeModel::Kernel && Offset >= 0)
4188 /// Determines whether the callee is required to pop its own arguments.
4189 /// Callee pop is necessary to support tail calls.
4190 bool X86::isCalleePop(CallingConv::ID CallingConv,
4191 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4192 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4193 // can guarantee TCO.
4194 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4197 switch (CallingConv) {
4200 case CallingConv::X86_StdCall:
4201 case CallingConv::X86_FastCall:
4202 case CallingConv::X86_ThisCall:
4203 case CallingConv::X86_VectorCall:
4208 /// \brief Return true if the condition is an unsigned comparison operation.
4209 static bool isX86CCUnsigned(unsigned X86CC) {
4212 llvm_unreachable("Invalid integer condition!");
4228 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4229 switch (SetCCOpcode) {
4230 default: llvm_unreachable("Invalid integer condition!");
4231 case ISD::SETEQ: return X86::COND_E;
4232 case ISD::SETGT: return X86::COND_G;
4233 case ISD::SETGE: return X86::COND_GE;
4234 case ISD::SETLT: return X86::COND_L;
4235 case ISD::SETLE: return X86::COND_LE;
4236 case ISD::SETNE: return X86::COND_NE;
4237 case ISD::SETULT: return X86::COND_B;
4238 case ISD::SETUGT: return X86::COND_A;
4239 case ISD::SETULE: return X86::COND_BE;
4240 case ISD::SETUGE: return X86::COND_AE;
4244 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4245 /// condition code, returning the condition code and the LHS/RHS of the
4246 /// comparison to make.
4247 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4248 bool isFP, SDValue &LHS, SDValue &RHS,
4249 SelectionDAG &DAG) {
4251 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4252 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4253 // X > -1 -> X == 0, jump !sign.
4254 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4255 return X86::COND_NS;
4257 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4258 // X < 0 -> X == 0, jump on sign.
4261 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4263 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4264 return X86::COND_LE;
4268 return TranslateIntegerX86CC(SetCCOpcode);
4271 // First determine if it is required or is profitable to flip the operands.
4273 // If LHS is a foldable load, but RHS is not, flip the condition.
4274 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4275 !ISD::isNON_EXTLoad(RHS.getNode())) {
4276 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4277 std::swap(LHS, RHS);
4280 switch (SetCCOpcode) {
4286 std::swap(LHS, RHS);
4290 // On a floating point condition, the flags are set as follows:
4292 // 0 | 0 | 0 | X > Y
4293 // 0 | 0 | 1 | X < Y
4294 // 1 | 0 | 0 | X == Y
4295 // 1 | 1 | 1 | unordered
4296 switch (SetCCOpcode) {
4297 default: llvm_unreachable("Condcode should be pre-legalized away");
4299 case ISD::SETEQ: return X86::COND_E;
4300 case ISD::SETOLT: // flipped
4302 case ISD::SETGT: return X86::COND_A;
4303 case ISD::SETOLE: // flipped
4305 case ISD::SETGE: return X86::COND_AE;
4306 case ISD::SETUGT: // flipped
4308 case ISD::SETLT: return X86::COND_B;
4309 case ISD::SETUGE: // flipped
4311 case ISD::SETLE: return X86::COND_BE;
4313 case ISD::SETNE: return X86::COND_NE;
4314 case ISD::SETUO: return X86::COND_P;
4315 case ISD::SETO: return X86::COND_NP;
4317 case ISD::SETUNE: return X86::COND_INVALID;
4321 /// Is there a floating point cmov for the specific X86 condition code?
4322 /// Current x86 isa includes the following FP cmov instructions:
4323 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4324 static bool hasFPCMov(unsigned X86CC) {
4341 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4343 unsigned Intrinsic) const {
4345 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4349 Info.opc = ISD::INTRINSIC_W_CHAIN;
4350 Info.readMem = false;
4351 Info.writeMem = false;
4355 switch (IntrData->Type) {
4356 case EXPAND_FROM_MEM: {
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.memVT = MVT::getVT(I.getType());
4360 Info.readMem = true;
4363 case COMPRESS_TO_MEM: {
4364 Info.ptrVal = I.getArgOperand(0);
4365 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4367 Info.writeMem = true;
4370 case TRUNCATE_TO_MEM_VI8:
4371 case TRUNCATE_TO_MEM_VI16:
4372 case TRUNCATE_TO_MEM_VI32: {
4373 Info.ptrVal = I.getArgOperand(0);
4374 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4375 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4376 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4378 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4379 ScalarVT = MVT::i16;
4380 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4381 ScalarVT = MVT::i32;
4383 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4385 Info.writeMem = true;
4395 /// Returns true if the target can instruction select the
4396 /// specified FP immediate natively. If false, the legalizer will
4397 /// materialize the FP immediate as a load from a constant pool.
4398 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4399 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4400 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4406 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4407 ISD::LoadExtType ExtTy,
4409 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4410 // relocations must target a movq or addq instruction: don't let the load shrink.
4411 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4412 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4413 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4414 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4418 /// \brief Returns true if it is beneficial to convert a load of a constant
4419 /// to just the constant itself.
4420 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4422 assert(Ty->isIntegerTy());
4424 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4425 if (BitSize == 0 || BitSize > 64)
4430 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4431 unsigned Index) const {
4432 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4435 return (Index == 0 || Index == ResVT.getVectorNumElements());
4438 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4439 // Speculate cttz only if we can directly use TZCNT.
4440 return Subtarget.hasBMI();
4443 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4444 // Speculate ctlz only if we can directly use LZCNT.
4445 return Subtarget.hasLZCNT();
4448 bool X86TargetLowering::isCtlzFast() const {
4449 return Subtarget.hasFastLZCNT();
4452 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4453 if (!Subtarget.hasBMI())
4456 // There are only 32-bit and 64-bit forms for 'andn'.
4457 EVT VT = Y.getValueType();
4458 if (VT != MVT::i32 && VT != MVT::i64)
4464 /// Return true if Val is the undef sentinel value or equal to the specified value.
4465 static bool isUndefOrEqual(int Val, int CmpVal) {
4466 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4469 /// Return true if Val is either the undef or zero sentinel value.
4470 static bool isUndefOrZero(int Val) {
4471 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4474 /// Return true if every element in Mask, beginning
4475 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4476 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4477 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4478 if (Mask[i] != SM_SentinelUndef)
4483 /// Return true if Val is undef or if its value falls within the
4484 /// specified range [Low, Hi).
4485 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4486 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4489 /// Return true if every element in Mask is undef or if its value
4490 /// falls within the specified range [Low, Hi).
4491 static bool isUndefOrInRange(ArrayRef<int> Mask,
4494 if (!isUndefOrInRange(M, Low, Hi))
4499 /// Return true if Val is undef, zero or if its value falls within the
4500 /// specified range [Low, Hi).
4501 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4502 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4505 /// Return true if every element in Mask is undef, zero or if its value
4506 /// falls within the specified range [Low, Hi).
4507 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4509 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4514 /// Return true if every element in Mask, beginning
4515 /// from position Pos and ending in Pos+Size, falls within the specified
4516 /// sequential range [Low, Low+Size), or is undef.
4517 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4518 unsigned Pos, unsigned Size, int Low) {
4519 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4520 if (!isUndefOrEqual(Mask[i], Low))
4525 /// Return true if every element in Mask, beginning
4526 /// from position Pos and ending in Pos+Size, falls within the specified
4527 /// sequential range [Low, Low+Size), or is undef or zero.
4528 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4529 unsigned Size, int Low) {
4530 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4531 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4536 /// Return true if every element in Mask, beginning
4537 /// from position Pos and ending in Pos+Size is undef or is zero.
4538 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4540 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4541 if (!isUndefOrZero(Mask[i]))
4546 /// \brief Helper function to test whether a shuffle mask could be
4547 /// simplified by widening the elements being shuffled.
4549 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4550 /// leaves it in an unspecified state.
4552 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4553 /// shuffle masks. The latter have the special property of a '-2' representing
4554 /// a zero-ed lane of a vector.
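/// A minimal illustrative example for a v4i32 -> v2i64 widening:
///   {0, 1, 6, 7}   --> {0, 3}
///   {-1, 1, 6, -1} --> {0, 3}   (undefs adopt their partner's pair index)
///   {0, 2, 4, 6}   --> fails    (elements do not form adjacent pairs)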
4555 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4556 SmallVectorImpl<int> &WidenedMask) {
4557 WidenedMask.assign(Mask.size() / 2, 0);
4558 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4559 // If both elements are undef, it's trivial.
4560 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
4561 WidenedMask[i / 2] = SM_SentinelUndef;
4565 // Check for an undef mask and a mask value properly aligned to fit with
4566 // a pair of values. If we find such a case, use the non-undef mask's value.
4567 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
4568 Mask[i + 1] % 2 == 1) {
4569 WidenedMask[i / 2] = Mask[i + 1] / 2;
4572 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
4573 WidenedMask[i / 2] = Mask[i] / 2;
4577 // When zeroing, we need to spread the zeroing across both lanes to widen.
4578 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
4579 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
4580 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
4581 WidenedMask[i / 2] = SM_SentinelZero;
4587 // Finally check if the two mask values are adjacent and aligned with
4589 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
4590 Mask[i] + 1 == Mask[i + 1]) {
4591 WidenedMask[i / 2] = Mask[i] / 2;
4595 // Otherwise we can't safely widen the elements used in this shuffle.
4598 assert(WidenedMask.size() == Mask.size() / 2 &&
4599 "Incorrect size of mask after widening the elements!");
4604 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4605 /// mask index with the scaled sequential indices for an equivalent narrowed
4606 /// mask. This is the reverse process to canWidenShuffleElements, but can always succeed.
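/// For example (illustrative), scaling the v2i64 mask {0, 3} by Scale == 2
/// produces the v4i32 mask {0, 1, 6, 7}; sentinel values such as -1 are
/// simply repeated Scale times.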
4608 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4609 SmallVectorImpl<int> &ScaledMask) {
4610 assert(0 < Scale && "Unexpected scaling factor");
4611 int NumElts = Mask.size();
4612 ScaledMask.assign(NumElts * Scale, -1);
4614 for (int i = 0; i != NumElts; ++i) {
4617 // Repeat sentinel values in every mask element.
4619 for (int s = 0; s != Scale; ++s)
4620 ScaledMask[(Scale * i) + s] = M;
4624 // Scale mask element and increment across each mask element.
4625 for (int s = 0; s != Scale; ++s)
4626 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4630 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4631 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4632 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4633 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4634 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4637 // The index should be aligned on a vecWidth-bit boundary.
4639 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4641 MVT VT = N->getSimpleValueType(0);
4642 unsigned ElSize = VT.getScalarSizeInBits();
4643 bool Result = (Index * ElSize) % vecWidth == 0;
4648 /// Return true if the specified INSERT_SUBVECTOR
4649 /// operand specifies a subvector insert that is suitable for the insertion
4650 /// of 128- or 256-bit subvectors.
4651 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4652 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4653 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4655 // The index should be aligned on a vecWidth-bit boundary.
4657 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4659 MVT VT = N->getSimpleValueType(0);
4660 unsigned ElSize = VT.getScalarSizeInBits();
4661 bool Result = (Index * ElSize) % vecWidth == 0;
4666 bool X86::isVINSERT128Index(SDNode *N) {
4667 return isVINSERTIndex(N, 128);
4670 bool X86::isVINSERT256Index(SDNode *N) {
4671 return isVINSERTIndex(N, 256);
4674 bool X86::isVEXTRACT128Index(SDNode *N) {
4675 return isVEXTRACTIndex(N, 128);
4678 bool X86::isVEXTRACT256Index(SDNode *N) {
4679 return isVEXTRACTIndex(N, 256);
4682 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4683 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4684 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4685 "Illegal extract subvector for VEXTRACT");
4688 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4690 MVT VecVT = N->getOperand(0).getSimpleValueType();
4691 MVT ElVT = VecVT.getVectorElementType();
4693 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4694 return Index / NumElemsPerChunk;
4697 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4698 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4699 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4700 "Illegal insert subvector for VINSERT");
4703 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4705 MVT VecVT = N->getSimpleValueType(0);
4706 MVT ElVT = VecVT.getVectorElementType();
4708 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4709 return Index / NumElemsPerChunk;
4712 /// Return the appropriate immediate to extract the specified
4713 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
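/// For example (illustrative), extracting the upper 128-bit half of a v8i32
/// (EXTRACT_SUBVECTOR index 4) gives NumElemsPerChunk = 128 / 32 = 4 and an
/// immediate of 4 / 4 = 1.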
4714 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4715 return getExtractVEXTRACTImmediate(N, 128);
4718 /// Return the appropriate immediate to extract the specified
4719 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4721 return getExtractVEXTRACTImmediate(N, 256);
4724 /// Return the appropriate immediate to insert at the specified
4725 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4726 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4727 return getInsertVINSERTImmediate(N, 128);
4730 /// Return the appropriate immediate to insert at the specified
4731 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4732 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4733 return getInsertVINSERTImmediate(N, 256);
4736 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4737 bool X86::isZeroNode(SDValue Elt) {
4738 return isNullConstant(Elt) || isNullFPConstant(Elt);
4741 // Build a vector of constants
4742 // Use an UNDEF node if MaskElt == -1.
4743 // Split 64-bit constants into 32-bit halves in 32-bit mode.
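// A minimal illustrative use (the values are hypothetical):
//   getConstVector({1, -1, 2, 3}, MVT::v4i32, DAG, dl, /*IsMask=*/true)
// builds the node <i32 1, undef, i32 2, i32 3>.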
4744 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4745 const SDLoc &dl, bool IsMask = false) {
4747 SmallVector<SDValue, 32> Ops;
4750 MVT ConstVecVT = VT;
4751 unsigned NumElts = VT.getVectorNumElements();
4752 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4753 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4754 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4758 MVT EltVT = ConstVecVT.getVectorElementType();
4759 for (unsigned i = 0; i < NumElts; ++i) {
4760 bool IsUndef = Values[i] < 0 && IsMask;
4761 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4762 DAG.getConstant(Values[i], dl, EltVT);
4763 Ops.push_back(OpNode);
4765 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4766 DAG.getConstant(0, dl, EltVT));
4768 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4770 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4774 static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
4775 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4776 assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
4777 SmallVector<SDValue, 32> Ops;
4780 MVT ConstVecVT = VT;
4781 unsigned NumElts = VT.getVectorNumElements();
4782 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4783 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4784 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4788 MVT EltVT = ConstVecVT.getVectorElementType();
4789 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4791 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4794 const APInt &V = Bits[i];
4795 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4797 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4798 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4799 } else if (EltVT == MVT::f32) {
4800 APFloat FV(APFloat::IEEEsingle(), V);
4801 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4802 } else if (EltVT == MVT::f64) {
4803 APFloat FV(APFloat::IEEEdouble(), V);
4804 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4806 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4810 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4811 return DAG.getBitcast(VT, ConstsNode);
4814 /// Returns a vector of specified type with all zero elements.
4815 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4816 SelectionDAG &DAG, const SDLoc &dl) {
4817 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4818 VT.getVectorElementType() == MVT::i1) &&
4819 "Unexpected vector type");
4821 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4822 // type. This ensures they get CSE'd. But if the integer type is not
4823 // available, use a floating-point +0.0 instead.
4825 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4826 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4827 } else if (VT.getVectorElementType() == MVT::i1) {
4828 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4829 "Unexpected vector type");
4830 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4831 "Unexpected vector type");
4832 Vec = DAG.getConstant(0, dl, VT);
4834 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4835 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4837 return DAG.getBitcast(VT, Vec);
4840 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4841 const SDLoc &dl, unsigned vectorWidth) {
4842 EVT VT = Vec.getValueType();
4843 EVT ElVT = VT.getVectorElementType();
4844 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4845 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4846 VT.getVectorNumElements()/Factor);
4848 // Extract from UNDEF is UNDEF.
4850 return DAG.getUNDEF(ResultVT);
4852 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4853 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4854 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4856 // This is the index of the first element of the vectorWidth-bit chunk
4857 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
4858 IdxVal &= ~(ElemsPerChunk - 1);
4860 // If the input is a buildvector just emit a smaller one.
4861 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4862 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4863 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4865 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4866 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4869 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4870 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4871 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4872 /// instructions or a simple subregister reference. Idx is an index in the
4873 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4874 /// lowering EXTRACT_VECTOR_ELT operations easier.
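/// For example (illustrative), asking for index 5 of a v8i32 extracts the
/// upper 128-bit half: extractSubVector rounds the index down to 4 before
/// creating the EXTRACT_SUBVECTOR node.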
4875 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4876 SelectionDAG &DAG, const SDLoc &dl) {
4877 assert((Vec.getValueType().is256BitVector() ||
4878 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4879 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4882 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4883 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4884 SelectionDAG &DAG, const SDLoc &dl) {
4885 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4886 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4889 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4890 SelectionDAG &DAG, const SDLoc &dl,
4891 unsigned vectorWidth) {
4892 assert((vectorWidth == 128 || vectorWidth == 256) &&
4893 "Unsupported vector width");
4894 // Inserting an UNDEF subvector just returns Result.
4897 EVT VT = Vec.getValueType();
4898 EVT ElVT = VT.getVectorElementType();
4899 EVT ResultVT = Result.getValueType();
4901 // Insert the relevant vectorWidth bits.
4902 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4903 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4905 // This is the index of the first element of the vectorWidth-bit chunk
4906 // we want. Since ElemsPerChunk is a power of 2, we just need to clear the low bits.
4907 IdxVal &= ~(ElemsPerChunk - 1);
4909 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4910 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4913 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4914 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4915 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4916 /// simple superregister reference. Idx is an index in the 128 bits
4917 /// we want. It need not be aligned to a 128-bit boundary. That makes
4918 /// lowering INSERT_VECTOR_ELT operations easier.
4919 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4920 SelectionDAG &DAG, const SDLoc &dl) {
4921 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4923 // For insertion into the zero index (low half) of a 256-bit vector, it is
4924 // more efficient to generate a blend with immediate instead of an insert*128.
4925 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4926 // extend the subvector to the size of the result vector. Make sure that
4927 // we are not recursing on that node by checking for undef here.
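// For a v8f32 result, for example, the blend immediate below ends up as
// 0x0f: lanes 0-3 are taken from the newly inserted 128-bit subvector and
// lanes 4-7 from the original value (illustrative).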
4928 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4929 !Result.isUndef()) {
4930 EVT ResultVT = Result.getValueType();
4931 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4932 SDValue Undef = DAG.getUNDEF(ResultVT);
4933 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4936 // The blend instruction, and therefore its mask, depend on the data type.
4937 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4938 if (ScalarType.isFloatingPoint()) {
4939 // Choose either vblendps (float) or vblendpd (double).
4940 unsigned ScalarSize = ScalarType.getSizeInBits();
4941 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4942 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4943 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4944 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4947 const X86Subtarget &Subtarget =
4948 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4950 // AVX2 is needed for 256-bit integer blend support.
4951 // Integers must be cast to 32-bit because there is only vpblendd;
4952 // vpblendw can't be used for this because it has a handicapped mask.
4954 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4955 // is still more efficient than using the wrong domain vinsertf128 that
4956 // will be created by InsertSubVector().
4957 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4959 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4960 Result = DAG.getBitcast(CastVT, Result);
4961 Vec256 = DAG.getBitcast(CastVT, Vec256);
4962 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4963 return DAG.getBitcast(ResultVT, Vec256);
4966 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4969 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4970 SelectionDAG &DAG, const SDLoc &dl) {
4971 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4972 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4975 /// Insert an i1-subvector into an i1-vector.
4976 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4977 const X86Subtarget &Subtarget) {
4980 SDValue Vec = Op.getOperand(0);
4981 SDValue SubVec = Op.getOperand(1);
4982 SDValue Idx = Op.getOperand(2);
4984 if (!isa<ConstantSDNode>(Idx))
4987 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4988 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4991 MVT OpVT = Op.getSimpleValueType();
4992 MVT SubVecVT = SubVec.getSimpleValueType();
4993 unsigned NumElems = OpVT.getVectorNumElements();
4994 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4996 assert(IdxVal + SubVecNumElems <= NumElems &&
4997 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4998 "Unexpected index value in INSERT_SUBVECTOR");
5000 // There are 3 possible cases:
5001 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5002 // 2. Subvector should be inserted in the upper part
5003 // (IdxVal + SubVecNumElems == NumElems)
5004 // 3. Subvector should be inserted in the middle (for example v2i1
5005 // to v16i1, index 2)
5007 // Widen to a vector type with a natively supported kshift.
5008 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5009 MVT WideOpVT = OpVT;
5010 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5013 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5014 SDValue Undef = DAG.getUNDEF(WideOpVT);
5015 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5016 Undef, SubVec, ZeroIdx);
5018 // Extract the sub-vector if required.
5019 auto ExtractSubVec = [&](SDValue V) {
5020 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5024 if (Vec.isUndef()) {
5026 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5027 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
5029 return ExtractSubVec(WideSubVec);
5032 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
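// Illustrative example: inserting a v4i1 subvector into a zero v16i1 at
// IdxVal == 4 gives ShiftLeft == 12 and ShiftRight == 8, so the subvector
// lands in lanes [4,7] and every other lane stays zero.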
5033 NumElems = WideOpVT.getVectorNumElements();
5034 unsigned ShiftLeft = NumElems - SubVecNumElems;
5035 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5036 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5037 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5038 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
5039 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5040 return ExtractSubVec(Vec);
5044 // Zero lower bits of the Vec
5045 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5046 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5047 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5048 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5049 // Merge them together, SubVec should be zero extended.
5050 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5051 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5053 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5054 return ExtractSubVec(Vec);
5057 // Simple case: the subvector is inserted into the upper part.
5058 if (IdxVal + SubVecNumElems == NumElems) {
5059 // Zero upper bits of the Vec
5060 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5061 DAG.getConstant(IdxVal, dl, MVT::i8));
5062 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5063 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5064 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5065 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5066 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5067 return ExtractSubVec(Vec);
5069 // The subvector should be inserted in the middle - use a shuffle.
5070 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5072 SmallVector<int, 64> Mask;
5073 for (unsigned i = 0; i < NumElems; ++i)
5074 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5076 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5079 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5080 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5081 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5082 /// large BUILD_VECTORS.
5083 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5084 unsigned NumElems, SelectionDAG &DAG,
5086 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5087 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5090 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5091 unsigned NumElems, SelectionDAG &DAG,
5093 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5094 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5097 /// Returns a vector of specified type with all bits set.
5098 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5099 /// no AVX2 support, use two <4 x i32> inserted into an <8 x i32> appropriately.
5100 /// Then bitcast to their original type, ensuring they get CSE'd.
5101 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
5102 SelectionDAG &DAG, const SDLoc &dl) {
5103 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5104 "Expected a 128/256/512-bit vector type");
5106 APInt Ones = APInt::getAllOnesValue(32);
5107 unsigned NumElts = VT.getSizeInBits() / 32;
5109 if (!Subtarget.hasInt256() && NumElts == 8) {
5110 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
5111 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5113 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5115 return DAG.getBitcast(VT, Vec);
5118 /// Generate unpacklo/unpackhi shuffle mask.
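/// For example (illustrative), for v4i32 with Unary == false this produces
/// {0, 4, 1, 5} for unpacklo and {2, 6, 3, 7} for unpackhi.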
5119 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 int NumElts = VT.getVectorNumElements();
5123 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5125 for (int i = 0; i < NumElts; ++i) {
5126 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5127 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5128 Pos += (Unary ? 0 : NumElts * (i % 2));
5129 Pos += (Lo ? 0 : NumEltsInLane / 2);
5130 Mask.push_back(Pos);
5134 /// Returns a vector_shuffle node for an unpackl operation.
5135 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5136 SDValue V1, SDValue V2) {
5137 SmallVector<int, 8> Mask;
5138 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5139 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5142 /// Returns a vector_shuffle node for an unpackh operation.
5143 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5144 SDValue V1, SDValue V2) {
5145 SmallVector<int, 8> Mask;
5146 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5147 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5150 /// Return a vector_shuffle of the specified vector with a zero or undef vector.
5151 /// This produces a shuffle where the low element of V2 is swizzled into the
5152 /// zero/undef vector, landing at element Idx.
5153 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5154 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5156 const X86Subtarget &Subtarget,
5157 SelectionDAG &DAG) {
5158 MVT VT = V2.getSimpleValueType();
5160 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5161 int NumElems = VT.getVectorNumElements();
5162 SmallVector<int, 16> MaskVec(NumElems);
5163 for (int i = 0; i != NumElems; ++i)
5164 // If this is the insertion idx, put the low elt of V2 here.
5165 MaskVec[i] = (i == Idx) ? NumElems : i;
5166 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5169 static SDValue peekThroughBitcasts(SDValue V) {
5170 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5171 V = V.getOperand(0);
5175 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5176 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5177 V.getOperand(0).hasOneUse())
5178 V = V.getOperand(0);
5182 static const Constant *getTargetConstantFromNode(SDValue Op) {
5183 Op = peekThroughBitcasts(Op);
5185 auto *Load = dyn_cast<LoadSDNode>(Op);
5189 SDValue Ptr = Load->getBasePtr();
5190 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5191 Ptr->getOpcode() == X86ISD::WrapperRIP)
5192 Ptr = Ptr->getOperand(0);
5194 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5195 if (!CNode || CNode->isMachineConstantPoolEntry())
5198 return dyn_cast<Constant>(CNode->getConstVal());
5201 // Extract raw constant bits from constant pools.
5202 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5203 SmallBitVector &UndefElts,
5204 SmallVectorImpl<APInt> &EltBits) {
5205 assert(UndefElts.empty() && "Expected an empty UndefElts vector");
5206 assert(EltBits.empty() && "Expected an empty EltBits vector");
5208 Op = peekThroughBitcasts(Op);
5210 EVT VT = Op.getValueType();
5211 unsigned SizeInBits = VT.getSizeInBits();
5212 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5213 unsigned NumElts = SizeInBits / EltSizeInBits;
5215 // Extract all the undef/constant element data and pack into single bitsets.
5216 APInt UndefBits(SizeInBits, 0);
5217 APInt MaskBits(SizeInBits, 0);
5219 // Split the undef/constant single bitset data into the target elements.
5220 auto SplitBitData = [&]() {
5221 UndefElts = SmallBitVector(NumElts, false);
5222 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5224 for (unsigned i = 0; i != NumElts; ++i) {
5225 APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
5226 UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
5228 // Only treat an element as UNDEF if all bits are UNDEF, otherwise
5229 // treat it as zero.
5230 if (UndefEltBits.isAllOnesValue()) {
5231 UndefElts[i] = true;
5235 APInt Bits = MaskBits.lshr(i * EltSizeInBits);
5236 Bits = Bits.zextOrTrunc(EltSizeInBits);
5237 EltBits[i] = Bits.getZExtValue();
5242 auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
5246 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5247 if (isa<UndefValue>(Cst)) {
5248 Mask = APInt::getNullValue(SizeInBits);
5249 Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
5252 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5253 Mask = CInt->getValue().zextOrTrunc(SizeInBits);
5254 Undefs = APInt::getNullValue(SizeInBits);
5257 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5258 Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
5259 Undefs = APInt::getNullValue(SizeInBits);
5265 // Extract constant bits from constant pool vector.
5266 if (auto *Cst = getTargetConstantFromNode(Op)) {
5267 Type *CstTy = Cst->getType();
5268 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5271 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5272 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
5274 if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
5276 MaskBits |= Bits.shl(i * CstEltSizeInBits);
5277 UndefBits |= Undefs.shl(i * CstEltSizeInBits);
5280 return SplitBitData();
5283 // Extract constant bits from a broadcasted constant pool scalar.
5284 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5285 EltSizeInBits <= Op.getScalarValueSizeInBits()) {
5286 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5288 if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
5289 unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
5290 unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
5291 for (unsigned i = 0; i != NumBroadcastElts; ++i) {
5292 MaskBits |= Bits.shl(i * NumBroadcastBits);
5293 UndefBits |= Undefs.shl(i * NumBroadcastBits);
5295 return SplitBitData();
5303 // TODO: Merge more of this with getTargetConstantBitsFromNode.
5304 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5305 unsigned MaskEltSizeInBits,
5306 SmallVectorImpl<uint64_t> &RawMask) {
5307 MaskNode = peekThroughBitcasts(MaskNode);
5309 MVT VT = MaskNode.getSimpleValueType();
5310 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
5311 unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
5313 // Split an APInt element into MaskEltSizeInBits sized pieces and
5314 // insert into the shuffle mask.
5315 auto SplitElementToMask = [&](APInt Element) {
5316 // Note that this is x86 and so always little endian: the low byte is
5317 // the first byte of the mask.
5318 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5319 for (int i = 0; i < Split; ++i) {
5320 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
5321 Element = Element.lshr(MaskEltSizeInBits);
5322 RawMask.push_back(RawElt.getZExtValue());
5326 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
5327 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5328 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
5329 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
5331 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
5332 const APInt &MaskElement = CN->getAPIntValue();
5333 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
5334 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
5335 RawMask.push_back(RawElt.getZExtValue());
5341 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
5342 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
5343 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
5344 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
5345 if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
5346 RawMask.push_back(CN->getZExtValue());
5347 RawMask.append(NumMaskElts - 1, 0);
5351 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
5352 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5353 SplitElementToMask(CN->getAPIntValue());
5354 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
5361 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
5364 // We can always decode if the buildvector is all zero constants,
5365 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
5366 if (all_of(MaskNode->ops(), X86::isZeroNode)) {
5367 RawMask.append(NumMaskElts, 0);
5371 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5372 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
5375 for (SDValue Op : MaskNode->ops()) {
5376 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
5377 SplitElementToMask(CN->getAPIntValue());
5378 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
5379 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
5387 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5388 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5389 /// operands in \p Ops, and returns true.
5390 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5391 /// IsUnary for shuffles which use a single input multiple times, and in those
5392 /// cases it will adjust the mask to only have indices within that single input.
5393 /// It is an error to call this with non-empty Mask/Ops vectors.
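/// A minimal illustrative use:
///   SmallVector<SDValue, 2> Ops;
///   SmallVector<int, 16> ShuffleMask;
///   bool IsUnary;
///   if (getTargetShuffleMask(N, VT, /*AllowSentinelZero*/ true, Ops,
///                            ShuffleMask, IsUnary))
///     ...; // ShuffleMask now holds element indices plus the
///          // SM_SentinelUndef / SM_SentinelZero sentinels.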
5394 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5395 SmallVectorImpl<SDValue> &Ops,
5396 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5397 unsigned NumElems = VT.getVectorNumElements();
5400 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5401 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5404 bool IsFakeUnary = false;
5405 switch(N->getOpcode()) {
5406 case X86ISD::BLENDI:
5407 ImmN = N->getOperand(N->getNumOperands()-1);
5408 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411 ImmN = N->getOperand(N->getNumOperands()-1);
5412 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5413 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5415 case X86ISD::INSERTPS:
5416 ImmN = N->getOperand(N->getNumOperands()-1);
5417 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5418 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5420 case X86ISD::UNPCKH:
5421 DecodeUNPCKHMask(VT, Mask);
5422 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5424 case X86ISD::UNPCKL:
5425 DecodeUNPCKLMask(VT, Mask);
5426 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5428 case X86ISD::MOVHLPS:
5429 DecodeMOVHLPSMask(NumElems, Mask);
5430 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5432 case X86ISD::MOVLHPS:
5433 DecodeMOVLHPSMask(NumElems, Mask);
5434 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5436 case X86ISD::PALIGNR:
5437 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5438 ImmN = N->getOperand(N->getNumOperands()-1);
5439 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5440 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5441 Ops.push_back(N->getOperand(1));
5442 Ops.push_back(N->getOperand(0));
5444 case X86ISD::VSHLDQ:
5445 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5446 ImmN = N->getOperand(N->getNumOperands() - 1);
5447 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5450 case X86ISD::VSRLDQ:
5451 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5452 ImmN = N->getOperand(N->getNumOperands() - 1);
5453 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5456 case X86ISD::PSHUFD:
5457 case X86ISD::VPERMILPI:
5458 ImmN = N->getOperand(N->getNumOperands()-1);
5459 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5462 case X86ISD::PSHUFHW:
5463 ImmN = N->getOperand(N->getNumOperands()-1);
5464 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467 case X86ISD::PSHUFLW:
5468 ImmN = N->getOperand(N->getNumOperands()-1);
5469 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472 case X86ISD::VZEXT_MOVL:
5473 DecodeZeroMoveLowMask(VT, Mask);
5476 case X86ISD::VBROADCAST: {
5477 // We only decode broadcasts of same-sized vectors at the moment.
5478 if (N->getOperand(0).getValueType() == VT) {
5479 DecodeVectorBroadcast(VT, Mask);
5485 case X86ISD::VPERMILPV: {
5487 SDValue MaskNode = N->getOperand(1);
5488 unsigned MaskEltSize = VT.getScalarSizeInBits();
5489 SmallVector<uint64_t, 32> RawMask;
5490 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5491 DecodeVPERMILPMask(VT, RawMask, Mask);
5494 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5495 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5500 case X86ISD::PSHUFB: {
5502 SDValue MaskNode = N->getOperand(1);
5503 SmallVector<uint64_t, 32> RawMask;
5504 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5505 DecodePSHUFBMask(RawMask, Mask);
5508 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5509 DecodePSHUFBMask(C, Mask);
5514 case X86ISD::VPERMI:
5515 ImmN = N->getOperand(N->getNumOperands()-1);
5516 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5521 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5523 case X86ISD::VPERM2X128:
5524 ImmN = N->getOperand(N->getNumOperands()-1);
5525 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5526 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5528 case X86ISD::MOVSLDUP:
5529 DecodeMOVSLDUPMask(VT, Mask);
5532 case X86ISD::MOVSHDUP:
5533 DecodeMOVSHDUPMask(VT, Mask);
5536 case X86ISD::MOVDDUP:
5537 DecodeMOVDDUPMask(VT, Mask);
5540 case X86ISD::MOVLHPD:
5541 case X86ISD::MOVLPD:
5542 case X86ISD::MOVLPS:
5543 // Not yet implemented
5545 case X86ISD::VPERMIL2: {
5546 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5547 unsigned MaskEltSize = VT.getScalarSizeInBits();
5548 SDValue MaskNode = N->getOperand(2);
5549 SDValue CtrlNode = N->getOperand(3);
5550 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5551 unsigned CtrlImm = CtrlOp->getZExtValue();
5552 SmallVector<uint64_t, 32> RawMask;
5553 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5554 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5557 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5558 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5564 case X86ISD::VPPERM: {
5565 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5566 SDValue MaskNode = N->getOperand(2);
5567 SmallVector<uint64_t, 32> RawMask;
5568 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5569 DecodeVPPERMMask(RawMask, Mask);
5572 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5573 DecodeVPPERMMask(C, Mask);
5578 case X86ISD::VPERMV: {
5580 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5581 Ops.push_back(N->getOperand(1));
5582 SDValue MaskNode = N->getOperand(0);
5583 SmallVector<uint64_t, 32> RawMask;
5584 unsigned MaskEltSize = VT.getScalarSizeInBits();
5585 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5586 DecodeVPERMVMask(RawMask, Mask);
5589 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5590 DecodeVPERMVMask(C, MaskEltSize, Mask);
5595 case X86ISD::VPERMV3: {
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5597 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5598 Ops.push_back(N->getOperand(0));
5599 Ops.push_back(N->getOperand(2));
5600 SDValue MaskNode = N->getOperand(1);
5601 unsigned MaskEltSize = VT.getScalarSizeInBits();
5602 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5603 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5608 case X86ISD::VPERMIV3: {
5609 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5610 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5611 Ops.push_back(N->getOperand(1));
5612 Ops.push_back(N->getOperand(2));
5613 SDValue MaskNode = N->getOperand(0);
5614 unsigned MaskEltSize = VT.getScalarSizeInBits();
5615 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5616 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5621 default: llvm_unreachable("unknown target shuffle node");
5624 // Empty mask indicates the decode failed.
5628 // Check if we're getting a shuffle mask with zero'd elements.
5629 if (!AllowSentinelZero)
5630 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5633 // If we have a fake unary shuffle, the shuffle mask is spread across two
5634 // inputs that are actually the same node. Re-map the mask to always point
5635 // into the first input.
5638 if (M >= (int)Mask.size())
5641 // If we didn't already add operands in the opcode-specific code, default to
5642 // adding 1 or 2 operands starting at 0.
5644 Ops.push_back(N->getOperand(0));
5645 if (!IsUnary || IsFakeUnary)
5646 Ops.push_back(N->getOperand(1));
5652 /// Check a target shuffle mask's inputs to see if we can set any values to
5653 /// SM_SentinelZero - this is for elements that are known to be zero
5654 /// (not just zeroable) from their inputs.
5655 /// Returns true if the target shuffle mask was decoded.
5656 static bool setTargetShuffleZeroElements(SDValue N,
5657 SmallVectorImpl<int> &Mask,
5658 SmallVectorImpl<SDValue> &Ops) {
5660 if (!isTargetShuffle(N.getOpcode()))
5663 MVT VT = N.getSimpleValueType();
5664 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5667 SDValue V1 = Ops[0];
5668 SDValue V2 = IsUnary ? V1 : Ops[1];
5670 V1 = peekThroughBitcasts(V1);
5671 V2 = peekThroughBitcasts(V2);
5673 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5676 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5680 // Determine shuffle input and normalize the mask.
5681 SDValue V = M < Size ? V1 : V2;
5684 // We are referencing an UNDEF input.
5686 Mask[i] = SM_SentinelUndef;
5690 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5691 if (V.getOpcode() != ISD::BUILD_VECTOR)
5694 // If the BUILD_VECTOR has fewer elements, then the (larger) source
5695 // element must be UNDEF/ZERO.
5696 // TODO: Is it worth testing the individual bits of a constant?
5697 if ((Size % V.getNumOperands()) == 0) {
5698 int Scale = Size / V->getNumOperands();
5699 SDValue Op = V.getOperand(M / Scale);
5701 Mask[i] = SM_SentinelUndef;
5702 else if (X86::isZeroNode(Op))
5703 Mask[i] = SM_SentinelZero;
5707 // If the BUILD_VECTOR has more elements, then all of the (smaller) source
5708 // elements must be all UNDEF or all ZERO.
5709 if ((V.getNumOperands() % Size) == 0) {
5710 int Scale = V->getNumOperands() / Size;
5711 bool AllUndef = true;
5712 bool AllZero = true;
5713 for (int j = 0; j < Scale; ++j) {
5714 SDValue Op = V.getOperand((M * Scale) + j);
5715 AllUndef &= Op.isUndef();
5716 AllZero &= X86::isZeroNode(Op);
5719 Mask[i] = SM_SentinelUndef;
5721 Mask[i] = SM_SentinelZero;
5726 assert(VT.getVectorNumElements() == Mask.size() &&
5727 "Different mask size from vector size!");
5731 // Attempt to decode ops that could be represented as a shuffle mask.
5732 // The decoded shuffle mask may contain a different number of elements than
5733 // the destination value type.
5734 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5735 SmallVectorImpl<SDValue> &Ops) {
5739 MVT VT = N.getSimpleValueType();
5740 unsigned NumElts = VT.getVectorNumElements();
5741 unsigned NumSizeInBits = VT.getSizeInBits();
5742 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5743 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5744 "Expected byte aligned value types");
5746 unsigned Opcode = N.getOpcode();
5749 // Attempt to decode as a per-byte mask.
5750 SmallBitVector UndefElts;
5751 SmallVector<APInt, 32> EltBits;
5752 if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
5754 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5756 Mask.push_back(SM_SentinelUndef);
5759 uint64_t ByteBits = EltBits[i].getZExtValue();
5760 if (ByteBits != 0 && ByteBits != 255)
5762 Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
5764 Ops.push_back(N.getOperand(0));
5768 case X86ISD::VSRLI: {
5769 uint64_t ShiftVal = N.getConstantOperandVal(1);
5770 // Out of range bit shifts are guaranteed to be zero.
5771 if (NumBitsPerElt <= ShiftVal) {
5772 Mask.append(NumElts, SM_SentinelZero);
5776 // We can only decode 'whole byte' bit shifts as shuffles.
5777 if ((ShiftVal % 8) != 0)
5780 uint64_t ByteShift = ShiftVal / 8;
5781 unsigned NumBytes = NumSizeInBits / 8;
5782 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5783 Ops.push_back(N.getOperand(0));
5785 // Clear mask to all zeros and insert the shifted byte indices.
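// Illustrative example: a v2i64 VSRLI by 16 bits (ByteShift == 2) yields
// the per-byte mask {2,3,4,5,6,7,Z,Z, 10,11,12,13,14,15,Z,Z}, where Z is
// SM_SentinelZero.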
5786 Mask.append(NumBytes, SM_SentinelZero);
5788 if (X86ISD::VSHLI == Opcode) {
5789 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5790 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5791 Mask[i + j] = i + j - ByteShift;
5793 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5794 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5795 Mask[i + j - ByteShift] = i + j;
5799 case X86ISD::VZEXT: {
5800 // TODO - add support for VPMOVZX with smaller input vector types.
5801 SDValue Src = N.getOperand(0);
5802 MVT SrcVT = Src.getSimpleValueType();
5803 if (NumSizeInBits != SrcVT.getSizeInBits())
5805 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5814 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5815 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5816 /// remaining input indices in case we now have a unary shuffle and adjust the
5817 /// Op0/Op1 inputs accordingly.
5818 /// Returns true if the target shuffle mask was decoded.
5819 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5820 SmallVectorImpl<int> &Mask) {
5821 SmallVector<SDValue, 2> Ops;
5822 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5823 if (!getFauxShuffleMask(Op, Mask, Ops))
5826 int NumElts = Mask.size();
5827 bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
5828 return 0 <= Idx && Idx < NumElts;
5830 bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
5832 Op0 = Op0InUse ? Ops[0] : SDValue();
5833 Op1 = Op1InUse ? Ops[1] : SDValue();
5835 // We're only using Op1 - commute the mask and inputs.
5836 if (!Op0InUse && Op1InUse) {
5847 /// Returns the scalar element that will make up the ith
5848 /// element of the result of the vector shuffle.
5849 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5852 return SDValue(); // Limit search depth.
5854 SDValue V = SDValue(N, 0);
5855 EVT VT = V.getValueType();
5856 unsigned Opcode = V.getOpcode();
5858 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5859 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5860 int Elt = SV->getMaskElt(Index);
5863 return DAG.getUNDEF(VT.getVectorElementType());
5865 unsigned NumElems = VT.getVectorNumElements();
5866 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5867 : SV->getOperand(1);
5868 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5871 // Recurse into target specific vector shuffles to find scalars.
5872 if (isTargetShuffle(Opcode)) {
5873 MVT ShufVT = V.getSimpleValueType();
5874 MVT ShufSVT = ShufVT.getVectorElementType();
5875 int NumElems = (int)ShufVT.getVectorNumElements();
5876 SmallVector<int, 16> ShuffleMask;
5877 SmallVector<SDValue, 16> ShuffleOps;
5880 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5883 int Elt = ShuffleMask[Index];
5884 if (Elt == SM_SentinelZero)
5885 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5886 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5887 if (Elt == SM_SentinelUndef)
5888 return DAG.getUNDEF(ShufSVT);
5890 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5891 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5892 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5896 // Actual nodes that may contain scalar elements
5897 if (Opcode == ISD::BITCAST) {
5898 V = V.getOperand(0);
5899 EVT SrcVT = V.getValueType();
5900 unsigned NumElems = VT.getVectorNumElements();
5902 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5907 return (Index == 0) ? V.getOperand(0)
5908 : DAG.getUNDEF(VT.getVectorElementType());
5910 if (V.getOpcode() == ISD::BUILD_VECTOR)
5911 return V.getOperand(Index);
5916 /// Custom lower build_vector of v16i8.
5917 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5918 unsigned NumNonZero, unsigned NumZero,
5920 const X86Subtarget &Subtarget,
5921 const TargetLowering &TLI) {
5929 // SSE4.1 - use PINSRB to insert each byte directly.
5930 if (Subtarget.hasSSE41()) {
5931 for (unsigned i = 0; i < 16; ++i) {
5932 bool isNonZero = (NonZeros & (1 << i)) != 0;
5936 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5938 V = DAG.getUNDEF(MVT::v16i8);
5941 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5942 MVT::v16i8, V, Op.getOperand(i),
5943 DAG.getIntPtrConstant(i, dl));
5950 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5951 for (unsigned i = 0; i < 16; ++i) {
5952 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5953 if (ThisIsNonZero && First) {
5955 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5957 V = DAG.getUNDEF(MVT::v8i16);
5962 SDValue ThisElt, LastElt;
5963 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5964 if (LastIsNonZero) {
5965 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5966 MVT::i16, Op.getOperand(i-1));
5968 if (ThisIsNonZero) {
5969 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5970 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5971 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5973 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5977 if (ThisElt.getNode())
5978 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5979 DAG.getIntPtrConstant(i/2, dl));
5983 return DAG.getBitcast(MVT::v16i8, V);
5986 /// Custom lower build_vector of v8i16.
5987 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5988 unsigned NumNonZero, unsigned NumZero,
5990 const X86Subtarget &Subtarget,
5991 const TargetLowering &TLI) {
5998 for (unsigned i = 0; i < 8; ++i) {
5999 bool isNonZero = (NonZeros & (1 << i)) != 0;
6003 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6005 V = DAG.getUNDEF(MVT::v8i16);
6008 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
6009 MVT::v8i16, V, Op.getOperand(i),
6010 DAG.getIntPtrConstant(i, dl));
6017 /// Custom lower build_vector of v4i32 or v4f32.
6018 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6019 const X86Subtarget &Subtarget,
6020 const TargetLowering &TLI) {
6021 // Find all zeroable elements.
6022 std::bitset<4> Zeroable;
6023 for (int i=0; i < 4; ++i) {
6024 SDValue Elt = Op->getOperand(i);
6025 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6027 assert(Zeroable.size() - Zeroable.count() > 1 &&
6028 "We expect at least two non-zero elements!");
6030 // We only know how to deal with build_vector nodes where elements are either
6031 // zeroable or extract_vector_elt with constant index.
6032 SDValue FirstNonZero;
6033 unsigned FirstNonZeroIdx;
6034 for (unsigned i=0; i < 4; ++i) {
6037 SDValue Elt = Op->getOperand(i);
6038 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6039 !isa<ConstantSDNode>(Elt.getOperand(1)))
6041 // Make sure that this node is extracting from a 128-bit vector.
6042 MVT VT = Elt.getOperand(0).getSimpleValueType();
6043 if (!VT.is128BitVector())
6045 if (!FirstNonZero.getNode()) {
6047 FirstNonZeroIdx = i;
6051 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6052 SDValue V1 = FirstNonZero.getOperand(0);
6053 MVT VT = V1.getSimpleValueType();
6055 // See if this build_vector can be lowered as a blend with zero.
6057 unsigned EltMaskIdx, EltIdx;
6059 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6060 if (Zeroable[EltIdx]) {
6061 // The zero vector will be on the right hand side.
6062 Mask[EltIdx] = EltIdx+4;
6066 Elt = Op->getOperand(EltIdx);
6067 // By construction, Elt is an EXTRACT_VECTOR_ELT with a constant index.
6068 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
6069 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6071 Mask[EltIdx] = EltIdx;
6075 // Let the shuffle legalizer deal with blend operations.
6076 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6077 if (V1.getSimpleValueType() != VT)
6078 V1 = DAG.getBitcast(VT, V1);
6079 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6082 // See if we can lower this build_vector to an INSERTPS.
6083 if (!Subtarget.hasSSE41())
6086 SDValue V2 = Elt.getOperand(0);
6087 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6090 bool CanFold = true;
6091 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6095 SDValue Current = Op->getOperand(i);
6096 SDValue SrcVector = Current->getOperand(0);
6099 CanFold = SrcVector == V1 &&
6100 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6106 assert(V1.getNode() && "Expected at least two non-zero elements!");
6107 if (V1.getSimpleValueType() != MVT::v4f32)
6108 V1 = DAG.getBitcast(MVT::v4f32, V1);
6109 if (V2.getSimpleValueType() != MVT::v4f32)
6110 V2 = DAG.getBitcast(MVT::v4f32, V2);
6112 // Ok, we can emit an INSERTPS instruction.
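// The immediate encodes CountS (bits 7:6), CountD (bits 5:4) and the zero
// mask (bits 3:0); e.g. (illustrative) copying source lane 2 into destination
// lane 1 while zeroing lane 3 gives (2 << 6) | (1 << 4) | 0x8.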
6113 unsigned ZMask = Zeroable.to_ulong();
6115 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6118 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6119 DAG.getIntPtrConstant(InsertPSMask, DL));
6120 return DAG.getBitcast(VT, Result);
6123 /// Return a vector logical shift node.
6124 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6125 SelectionDAG &DAG, const TargetLowering &TLI,
6127 assert(VT.is128BitVector() && "Unknown type for VShift");
6128 MVT ShVT = MVT::v16i8;
6129 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6130 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6131 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6132 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6133 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6134 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6137 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6138 SelectionDAG &DAG) {
  // Check if the scalar load can be widened into a vector load, and if the
  // address is "base + cst", see if the cst can be "absorbed" into the
  // shuffle mask.
6143 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6144 SDValue Ptr = LD->getBasePtr();
6145 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6147 EVT PVT = LD->getValueType(0);
6148 if (PVT != MVT::i32 && PVT != MVT::f32)
6153 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6154 FI = FINode->getIndex();
6156 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6157 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6158 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6159 Offset = Ptr.getConstantOperandVal(1);
6160 Ptr = Ptr.getOperand(0);
  // FIXME: 256-bit vector instructions don't require such strict alignment;
  // improve this code to support them better.
6167 unsigned RequiredAlign = VT.getSizeInBits()/8;
6168 SDValue Chain = LD->getChain();
6169 // Make sure the stack object alignment is at least 16 or 32.
6170 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6171 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6172 if (MFI.isFixedObjectIndex(FI)) {
      // Can't change the alignment. FIXME: It's possible to compute the exact
      // stack offset and reference FI + adjusted offset instead, if someone
      // *really* cares about this.
6178 MFI.setObjectAlignment(FI, RequiredAlign);
    // (Offset % 16 or 32) must be a multiple of 4. The address is then
    // Ptr + (Offset & ~(RequiredAlign - 1)).
6186 if ((Offset % RequiredAlign) & 3)
6188 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6191 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6192 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6195 int EltNo = (Offset - StartOffset) >> 2;
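    // For example (illustrative values only): widening a 4-byte scalar load at
    // "base + 20" to a 16-byte vector load gives StartOffset = 16 and
    // EltNo = (20 - 16) >> 2 = 1, so the scalar ends up in element 1 of the
    // wide load and the splat mask below selects that element.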
6196 unsigned NumElems = VT.getVectorNumElements();
6198 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6199 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6200 LD->getPointerInfo().getWithOffset(StartOffset));
6202 SmallVector<int, 8> Mask(NumElems, EltNo);
6204 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6210 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6211 /// elements can be replaced by a single large load which has the same value as
6212 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6214 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6215 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6216 SDLoc &DL, SelectionDAG &DAG,
6217 bool isAfterLegalize) {
6218 unsigned NumElems = Elts.size();
6220 int LastLoadedElt = -1;
6221 SmallBitVector LoadMask(NumElems, false);
6222 SmallBitVector ZeroMask(NumElems, false);
6223 SmallBitVector UndefMask(NumElems, false);
  // For each element in the initializer, see if we've found a load, zero or an
  // undef.
6227 for (unsigned i = 0; i < NumElems; ++i) {
6228 SDValue Elt = peekThroughBitcasts(Elts[i]);
6233 UndefMask[i] = true;
6234 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6236 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6239 // Each loaded element must be the correct fractional portion of the
6240 // requested vector load.
6241 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6246 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6247 "Incomplete element masks");
6249 // Handle Special Cases - all undef or undef/zero.
6250 if (UndefMask.count() == NumElems)
6251 return DAG.getUNDEF(VT);
6253 // FIXME: Should we return this as a BUILD_VECTOR instead?
6254 if ((ZeroMask | UndefMask).count() == NumElems)
6255 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6256 : DAG.getConstantFP(0.0, DL, VT);
6258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6259 int FirstLoadedElt = LoadMask.find_first();
6260 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6261 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6262 EVT LDBaseVT = EltBase.getValueType();
  // Consecutive loads can contain UNDEFs but not ZERO elements.
  // Consecutive loads with UNDEFs and ZEROs require an additional shuffle
  // stage to clear the ZERO elements.
6267 bool IsConsecutiveLoad = true;
6268 bool IsConsecutiveLoadWithZeros = true;
6269 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6271 SDValue Elt = peekThroughBitcasts(Elts[i]);
6272 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6273 if (!DAG.areNonVolatileConsecutiveLoads(
6274 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6275 i - FirstLoadedElt)) {
6276 IsConsecutiveLoad = false;
6277 IsConsecutiveLoadWithZeros = false;
6280 } else if (ZeroMask[i]) {
6281 IsConsecutiveLoad = false;
6285 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6286 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6287 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6288 "Cannot merge volatile loads.");
    SDValue NewLd =
        DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
                    LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
    if (LDBase->hasAnyUseOfValue(1)) {
      SDValue NewChain =
          DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6296 SDValue(NewLd.getNode(), 1));
6297 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6298 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6299 SDValue(NewLd.getNode(), 1));
6305 // LOAD - all consecutive load/undefs (must start/end with a load).
6306 // If we have found an entire vector of loads and undefs, then return a large
6307 // load of the entire vector width starting at the base pointer.
6308 // If the vector contains zeros, then attempt to shuffle those elements.
6309 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6310 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6311 assert(LDBase && "Did not find base load for merging consecutive loads");
6312 EVT EltVT = LDBase->getValueType(0);
6313 // Ensure that the input vector size for the merged loads matches the
6314 // cumulative size of the input elements.
6315 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6318 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6321 if (IsConsecutiveLoad)
6322 return CreateLoad(VT, LDBase);
6324 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6325 // vector and a zero vector to clear out the zero elements.
6326 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6327 SmallVector<int, 4> ClearMask(NumElems, -1);
6328 for (unsigned i = 0; i < NumElems; ++i) {
6330 ClearMask[i] = i + NumElems;
6331 else if (LoadMask[i])
6334 SDValue V = CreateLoad(VT, LDBase);
6335 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6336 : DAG.getConstantFP(0.0, DL, VT);
6337 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
  int LoadSize =
      (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6344 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
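  // For example, a v4i32 build_vector <(load i32 *p), zero, zero, zero> can be
  // lowered to (v4i32 (X86ISD::VZEXT_LOAD p)), i.e. a 32-bit load that
  // implicitly zeroes the upper elements.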
6345 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6346 (LoadSize == 32 || LoadSize == 64) &&
6347 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6348 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6349 : MVT::getIntegerVT(LoadSize);
6350 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6351 if (TLI.isTypeLegal(VecVT)) {
6352 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6353 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6355 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6356 LDBase->getPointerInfo(),
6357 LDBase->getAlignment(),
6358 false/*isVolatile*/, true/*ReadMem*/,
6361 // Make sure the newly-created LOAD is in the same position as LDBase in
6362 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6363 // and update uses of LDBase's output chain to use the TokenFactor.
6364 if (LDBase->hasAnyUseOfValue(1)) {
6366 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6367 SDValue(ResNode.getNode(), 1));
6368 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6369 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6370 SDValue(ResNode.getNode(), 1));
6373 return DAG.getBitcast(VT, ResNode);
6380 static Constant *getConstantVector(MVT VT, APInt SplatValue,
6381 unsigned SplatBitSize, LLVMContext &C) {
6382 unsigned ScalarSize = VT.getScalarSizeInBits();
6383 unsigned NumElm = SplatBitSize / ScalarSize;
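  // For example, with a v8i32 vector type and a 64-bit repeated pattern,
  // NumElm is 2 and the resulting ConstantVector holds the two 32-bit halves
  // of SplatValue, element 0 coming from the low 32 bits.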
6385 SmallVector<Constant *, 32> ConstantVec;
6386 for (unsigned i = 0; i < NumElm; i++) {
6387 APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
6389 if (VT.isFloatingPoint()) {
6390 assert((ScalarSize == 32 || ScalarSize == 64) &&
6391 "Unsupported floating point scalar size");
6392 if (ScalarSize == 32)
6393 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6395 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6397 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6398 ConstantVec.push_back(Const);
6400 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6403 static bool isUseOfShuffle(SDNode *N) {
6404 for (auto *U : N->uses()) {
6405 if (isTargetShuffle(U->getOpcode()))
6407 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6408 return isUseOfShuffle(U);
6413 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6414 /// following cases:
6415 /// 1. A splat BUILD_VECTOR which uses:
6416 /// a. A single scalar load, or a constant.
6417 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6418 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6419 /// a scalar load, or a constant.
6421 /// The VBROADCAST node is returned when a pattern is found,
6422 /// or SDValue() otherwise.
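/// For example, a v8f32 BUILD_VECTOR that splats a single loaded float is
/// lowered to (v8f32 (VBROADCAST (load f32))), which is selected as
/// vbroadcastss.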
6423 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6424 SelectionDAG &DAG) {
6425 // VBROADCAST requires AVX.
6426 // TODO: Splats could be generated for non-AVX CPUs using SSE
6427 // instructions, but there's less potential gain for only 128-bit vectors.
6428 if (!Subtarget.hasAVX())
6431 MVT VT = BVOp->getSimpleValueType(0);
6434 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6435 "Unsupported vector type for broadcast.");
6437 BitVector UndefElements;
6438 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6440 // We need a splat of a single value to use broadcast, and it doesn't
6441 // make any sense if the value is only in one element of the vector.
6442 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6443 APInt SplatValue, Undef;
    unsigned SplatBitSize;
    bool HasUndef;
6446 // Check if this is a repeated constant pattern suitable for broadcasting.
6447 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6448 SplatBitSize > VT.getScalarSizeInBits() &&
6449 SplatBitSize < VT.getSizeInBits()) {
6450 // Avoid replacing with broadcast when it's a use of a shuffle
6451 // instruction to preserve the present custom lowering of shuffles.
6452 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6454 // replace BUILD_VECTOR with broadcast of the repeated constants.
6455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6456 LLVMContext *Ctx = DAG.getContext();
6457 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6458 if (Subtarget.hasAVX()) {
6459 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6460 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6461 // Splatted value can fit in one INTEGER constant in constant pool.
6462 // Load the constant and broadcast it.
6463 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6464 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6465 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6466 SDValue CP = DAG.getConstantPool(C, PVT);
6467 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6469 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6471 CVT, dl, DAG.getEntryNode(), CP,
6472 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6474 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6475 MVT::getVectorVT(CVT, Repeat), Ld);
6476 return DAG.getBitcast(VT, Brdcst);
6477 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6478 // Splatted value can fit in one FLOAT constant in constant pool.
6479 // Load the constant and broadcast it.
        // AVX has support for 32 and 64-bit broadcasts for floats only.
        // There is no 64-bit integer broadcast on a 32-bit subtarget.
6482 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6483 Constant *C = SplatBitSize == 32
6484 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6485 SplatValue.bitsToFloat())
6486 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6487 SplatValue.bitsToDouble());
6488 SDValue CP = DAG.getConstantPool(C, PVT);
6489 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6491 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6493 CVT, dl, DAG.getEntryNode(), CP,
6494 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6496 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6497 MVT::getVectorVT(CVT, Repeat), Ld);
6498 return DAG.getBitcast(VT, Brdcst);
6499 } else if (SplatBitSize > 64) {
6500 // Load the vector of constants and broadcast it.
6501 MVT CVT = VT.getScalarType();
6502 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6504 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6505 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6506 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6508 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6509 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6511 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6512 return DAG.getBitcast(VT, Brdcst);
6519 bool ConstSplatVal =
6520 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6522 // Make sure that all of the users of a non-constant load are from the
6523 // BUILD_VECTOR node.
6524 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6527 unsigned ScalarSize = Ld.getValueSizeInBits();
6528 bool IsGE256 = (VT.getSizeInBits() >= 256);
6530 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6531 // instruction to save 8 or more bytes of constant pool data.
6532 // TODO: If multiple splats are generated to load the same constant,
6533 // it may be detrimental to overall size. There needs to be a way to detect
6534 // that condition to know if this is truly a size win.
6535 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
  // Handle broadcasting a single constant scalar from the constant pool
  // into a vector.
6539 // On Sandybridge (no AVX2), it is still better to load a constant vector
6540 // from the constant pool and not to broadcast it from a scalar.
6541 // But override that restriction when optimizing for size.
6542 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6543 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6544 EVT CVT = Ld.getValueType();
6545 assert(!CVT.isVector() && "Must not broadcast a vector type");
6547 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6548 // For size optimization, also splat v2f64 and v2i64, and for size opt
6549 // with AVX2, also splat i8 and i16.
6550 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6551 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6552 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6553 const Constant *C = nullptr;
6554 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6555 C = CI->getConstantIntValue();
6556 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6557 C = CF->getConstantFPValue();
6559 assert(C && "Invalid constant type");
6561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6563 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6564 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6566 CVT, dl, DAG.getEntryNode(), CP,
6567 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6570 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6574 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6576 // Handle AVX2 in-register broadcasts.
6577 if (!IsLoad && Subtarget.hasInt256() &&
6578 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6579 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  // The scalar source must be a normal load.
  if (!IsLoad)
    return SDValue();

  if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6586 (Subtarget.hasVLX() && ScalarSize == 64))
6587 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
  // The integer check is needed for the 64-bit into 128-bit case, so that it
  // doesn't match double, since there is no vbroadcastsd xmm.
6591 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6592 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6593 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6596 // Unsupported broadcast.
6600 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6601 /// underlying vector and index.
/// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
6605 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6607 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6608 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
  // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
  // lowered this:
  //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
  // to:
  //   (extract_vector_elt (vector_shuffle<2,u,u,u>
  //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
  //                            undef)
  //                       Constant<2>)
  // In this case the vector is the extract_subvector expression and the index
  // is 2, as specified by the shuffle.
6621 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6622 SDValue ShuffleVec = SVOp->getOperand(0);
6623 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6624 assert(ShuffleVecVT.getVectorElementType() ==
6625 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6627 int ShuffleIdx = SVOp->getMaskElt(Idx);
6628 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6629 ExtractedFromVec = ShuffleVec;
6635 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6636 MVT VT = Op.getSimpleValueType();
6638 // Skip if insert_vec_elt is not supported.
6639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6640 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6644 unsigned NumElems = Op.getNumOperands();
6648 SmallVector<unsigned, 4> InsertIndices;
6649 SmallVector<int, 8> Mask(NumElems, -1);
6651 for (unsigned i = 0; i != NumElems; ++i) {
6652 unsigned Opc = Op.getOperand(i).getOpcode();
6654 if (Opc == ISD::UNDEF)
6657 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
      // Quit if more than 1 element needs inserting.
6659 if (InsertIndices.size() > 1)
6662 InsertIndices.push_back(i);
6666 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6667 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6668 // Quit if non-constant index.
6669 if (!isa<ConstantSDNode>(ExtIdx))
6671 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6673 // Quit if extracted from vector of different type.
6674 if (ExtractedFromVec.getValueType() != VT)
6677 if (!VecIn1.getNode())
6678 VecIn1 = ExtractedFromVec;
6679 else if (VecIn1 != ExtractedFromVec) {
6680 if (!VecIn2.getNode())
6681 VecIn2 = ExtractedFromVec;
6682 else if (VecIn2 != ExtractedFromVec)
6683 // Quit if more than 2 vectors to shuffle
6687 if (ExtractedFromVec == VecIn1)
6689 else if (ExtractedFromVec == VecIn2)
6690 Mask[i] = Idx + NumElems;
6693 if (!VecIn1.getNode())
6696 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6697 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6698 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6699 unsigned Idx = InsertIndices[i];
6700 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6701 DAG.getIntPtrConstant(Idx, DL));
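// ConvertI1VectorToInteger packs a constant vXi1 build_vector into a scalar
// integer, with element 0 ending up in bit 0. For example,
// <i1 1, i1 0, i1 1, i1 1> becomes the i8 constant 0b1101 (13).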
6707 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6708 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6709 Op.getScalarValueSizeInBits() == 1 &&
6710 "Can not convert non-constant vector");
6711 uint64_t Immediate = 0;
6712 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6713 SDValue In = Op.getOperand(idx);
6715 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6718 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6719 return DAG.getConstant(Immediate, dl, VT);
6721 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
6723 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6725 MVT VT = Op.getSimpleValueType();
6726 assert((VT.getVectorElementType() == MVT::i1) &&
6727 "Unexpected type in LowerBUILD_VECTORvXi1!");
6730 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6731 return DAG.getTargetConstant(0, dl, VT);
6733 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6734 return DAG.getTargetConstant(1, dl, VT);
6736 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6737 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6738 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6739 return DAG.getBitcast(VT, Imm);
6740 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6741 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6742 DAG.getIntPtrConstant(0, dl));
6745 // Vector has one or more non-const elements
6746 uint64_t Immediate = 0;
6747 SmallVector<unsigned, 16> NonConstIdx;
6748 bool IsSplat = true;
6749 bool HasConstElts = false;
6751 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6752 SDValue In = Op.getOperand(idx);
6755 if (!isa<ConstantSDNode>(In))
6756 NonConstIdx.push_back(idx);
6758 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6759 HasConstElts = true;
6763 else if (In != Op.getOperand(SplatIdx))
6767 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6769 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6770 DAG.getConstant(1, dl, VT),
6771 DAG.getConstant(0, dl, VT));
6773 // insert elements one by one
6777 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6778 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6780 else if (HasConstElts)
6781 Imm = DAG.getConstant(0, dl, VT);
6783 Imm = DAG.getUNDEF(VT);
6784 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6785 DstVec = DAG.getBitcast(VT, Imm);
6787 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6788 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6789 DAG.getIntPtrConstant(0, dl));
6792 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6793 unsigned InsertIdx = NonConstIdx[i];
6794 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6795 Op.getOperand(InsertIdx),
6796 DAG.getIntPtrConstant(InsertIdx, dl));
6801 /// \brief Return true if \p N implements a horizontal binop and return the
6802 /// operands for the horizontal binop into V0 and V1.
6804 /// This is a helper function of LowerToHorizontalOp().
6805 /// This function checks that the build_vector \p N in input implements a
6806 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6807 /// operation to match.
6808 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6809 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
/// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
6813 /// This function only analyzes elements of \p N whose indices are
6814 /// in range [BaseIdx, LastIdx).
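/// For example, for a v4f32 build_vector whose operands are
///   (fadd (extract_elt A, 0), (extract_elt A, 1)),
///   (fadd (extract_elt A, 2), (extract_elt A, 3)),
///   (fadd (extract_elt B, 0), (extract_elt B, 1)),
///   (fadd (extract_elt B, 2), (extract_elt B, 3))
/// this returns true with V0 = A and V1 = B, which matches the semantics of
/// HADDPS A, B.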
6815 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6817 unsigned BaseIdx, unsigned LastIdx,
6818 SDValue &V0, SDValue &V1) {
6819 EVT VT = N->getValueType(0);
6821 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6822 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6823 "Invalid Vector in input!");
6825 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6826 bool CanFold = true;
6827 unsigned ExpectedVExtractIdx = BaseIdx;
6828 unsigned NumElts = LastIdx - BaseIdx;
6829 V0 = DAG.getUNDEF(VT);
6830 V1 = DAG.getUNDEF(VT);
6832 // Check if N implements a horizontal binop.
6833 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6834 SDValue Op = N->getOperand(i + BaseIdx);
6837 if (Op->isUndef()) {
6838 // Update the expected vector extract index.
6839 if (i * 2 == NumElts)
6840 ExpectedVExtractIdx = BaseIdx;
6841 ExpectedVExtractIdx += 2;
6845 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6850 SDValue Op0 = Op.getOperand(0);
6851 SDValue Op1 = Op.getOperand(1);
6853 // Try to match the following pattern:
6854 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6855 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6856 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6857 Op0.getOperand(0) == Op1.getOperand(0) &&
6858 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6859 isa<ConstantSDNode>(Op1.getOperand(1)));
6863 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6864 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6866 if (i * 2 < NumElts) {
6868 V0 = Op0.getOperand(0);
6869 if (V0.getValueType() != VT)
6874 V1 = Op0.getOperand(0);
6875 if (V1.getValueType() != VT)
6878 if (i * 2 == NumElts)
6879 ExpectedVExtractIdx = BaseIdx;
6882 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6883 if (I0 == ExpectedVExtractIdx)
6884 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6885 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6886 // Try to match the following dag sequence:
6887 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6888 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6892 ExpectedVExtractIdx += 2;
6898 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6899 /// a concat_vector.
6901 /// This is a helper function of LowerToHorizontalOp().
6902 /// This function expects two 256-bit vectors called V0 and V1.
6903 /// At first, each vector is split into two separate 128-bit vectors.
6904 /// Then, the resulting 128-bit vectors are used to implement two
6905 /// horizontal binary operations.
6907 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6909 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6910 /// the two new horizontal binop.
6911 /// When Mode is set, the first horizontal binop dag node would take as input
6912 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6913 /// horizontal binop dag node would take as input the lower 128-bit of V1
6914 /// and the upper 128-bit of V1.
6916 /// HADD V0_LO, V0_HI
6917 /// HADD V1_LO, V1_HI
6919 /// Otherwise, the first horizontal binop dag node takes as input the lower
6920 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6921 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6923 /// HADD V0_LO, V1_LO
6924 /// HADD V0_HI, V1_HI
6926 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6927 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6928 /// the upper 128-bits of the result.
6929 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6930 const SDLoc &DL, SelectionDAG &DAG,
6931 unsigned X86Opcode, bool Mode,
6932 bool isUndefLO, bool isUndefHI) {
6933 MVT VT = V0.getSimpleValueType();
6934 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6935 "Invalid nodes in input!");
6937 unsigned NumElts = VT.getVectorNumElements();
6938 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6939 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6940 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6941 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6942 MVT NewVT = V0_LO.getSimpleValueType();
6944 SDValue LO = DAG.getUNDEF(NewVT);
6945 SDValue HI = DAG.getUNDEF(NewVT);
  if (Mode) {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && !V0->isUndef())
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
    if (!isUndefHI && !V1->isUndef())
      HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
  } else {
    // Don't emit a horizontal binop if the result is expected to be UNDEF.
    if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
      LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);

    if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
      HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
  }
6962 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
/// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
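/// For example, a v4f32 build_vector of
///   (fsub (extract_elt A, 0), (extract_elt B, 0)),
///   (fadd (extract_elt A, 1), (extract_elt B, 1)),
///   (fsub (extract_elt A, 2), (extract_elt B, 2)),
///   (fadd (extract_elt A, 3), (extract_elt B, 3))
/// becomes (X86ISD::ADDSUB A, B), which selects to addsubps.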
6967 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6968 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6969 MVT VT = BV->getSimpleValueType(0);
6970 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6971 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
6975 unsigned NumElts = VT.getVectorNumElements();
6976 SDValue InVec0 = DAG.getUNDEF(VT);
6977 SDValue InVec1 = DAG.getUNDEF(VT);
6979 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6980 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6982 // Odd-numbered elements in the input build vector are obtained from
6983 // adding two integer/float elements.
6984 // Even-numbered elements in the input build vector are obtained from
6985 // subtracting two integer/float elements.
6986 unsigned ExpectedOpcode = ISD::FSUB;
6987 unsigned NextExpectedOpcode = ISD::FADD;
6988 bool AddFound = false;
6989 bool SubFound = false;
6991 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6992 SDValue Op = BV->getOperand(i);
6994 // Skip 'undef' values.
6995 unsigned Opcode = Op.getOpcode();
6996 if (Opcode == ISD::UNDEF) {
6997 std::swap(ExpectedOpcode, NextExpectedOpcode);
7001 // Early exit if we found an unexpected opcode.
7002 if (Opcode != ExpectedOpcode)
7005 SDValue Op0 = Op.getOperand(0);
7006 SDValue Op1 = Op.getOperand(1);
7008 // Try to match the following pattern:
7009 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7010 // Early exit if we cannot match that sequence.
7011 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7012 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7013 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7014 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7015 Op0.getOperand(1) != Op1.getOperand(1))
7018 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7022 // We found a valid add/sub node. Update the information accordingly.
7028 // Update InVec0 and InVec1.
7029 if (InVec0.isUndef()) {
7030 InVec0 = Op0.getOperand(0);
7031 if (InVec0.getSimpleValueType() != VT)
7034 if (InVec1.isUndef()) {
7035 InVec1 = Op1.getOperand(0);
7036 if (InVec1.getSimpleValueType() != VT)
7040 // Make sure that operands in input to each add/sub node always
7041 // come from a same pair of vectors.
7042 if (InVec0 != Op0.getOperand(0)) {
7043 if (ExpectedOpcode == ISD::FSUB)
7046 // FADD is commutable. Try to commute the operands
7047 // and then test again.
7048 std::swap(Op0, Op1);
7049 if (InVec0 != Op0.getOperand(0))
7053 if (InVec1 != Op1.getOperand(0))
7056 // Update the pair of expected opcodes.
7057 std::swap(ExpectedOpcode, NextExpectedOpcode);
7060 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7061 if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
7062 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
7067 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7068 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7069 const X86Subtarget &Subtarget,
7070 SelectionDAG &DAG) {
7071 MVT VT = BV->getSimpleValueType(0);
7072 unsigned NumElts = VT.getVectorNumElements();
7073 unsigned NumUndefsLO = 0;
7074 unsigned NumUndefsHI = 0;
7075 unsigned Half = NumElts/2;
  // Count the number of UNDEF operands in the input build_vector.
7078 for (unsigned i = 0, e = Half; i != e; ++i)
7079 if (BV->getOperand(i)->isUndef())
7082 for (unsigned i = Half, e = NumElts; i != e; ++i)
7083 if (BV->getOperand(i)->isUndef())
7086 // Early exit if this is either a build_vector of all UNDEFs or all the
7087 // operands but one are UNDEF.
7088 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7092 SDValue InVec0, InVec1;
7093 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7094 // Try to match an SSE3 float HADD/HSUB.
7095 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7096 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7098 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7099 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7100 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7101 // Try to match an SSSE3 integer HADD/HSUB.
7102 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7103 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7105 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7106 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7109 if (!Subtarget.hasAVX())
7112 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7113 // Try to match an AVX horizontal add/sub of packed single/double
7114 // precision floating point values from 256-bit vectors.
7115 SDValue InVec2, InVec3;
7116 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7117 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7118 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7119 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7120 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7122 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7123 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7124 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7125 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7126 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7127 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7128 // Try to match an AVX2 horizontal add/sub of signed integers.
7129 SDValue InVec2, InVec3;
7131 bool CanFold = true;
7133 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7134 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7135 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7136 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7137 X86Opcode = X86ISD::HADD;
7138 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7139 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7140 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7141 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7142 X86Opcode = X86ISD::HSUB;
7147 // Fold this build_vector into a single horizontal add/sub.
7148 // Do this only if the target has AVX2.
7149 if (Subtarget.hasAVX2())
7150 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7152 // Do not try to expand this build_vector into a pair of horizontal
7153 // add/sub if we can emit a pair of scalar add/sub.
7154 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
    // Convert this build_vector into a pair of horizontal binops followed by
    // a concat vector.
7159 bool isUndefLO = NumUndefsLO == Half;
7160 bool isUndefHI = NumUndefsHI == Half;
7161 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7162 isUndefLO, isUndefHI);
7166 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7167 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7169 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7170 X86Opcode = X86ISD::HADD;
7171 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7172 X86Opcode = X86ISD::HSUB;
7173 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7174 X86Opcode = X86ISD::FHADD;
7175 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7176 X86Opcode = X86ISD::FHSUB;
7180 // Don't try to expand this build_vector into a pair of horizontal add/sub
7181 // if we can simply emit a pair of scalar add/sub.
7182 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
  // Convert this build_vector into two horizontal add/sub followed by
  // a concat vector.
7187 bool isUndefLO = NumUndefsLO == Half;
7188 bool isUndefHI = NumUndefsHI == Half;
7189 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7190 isUndefLO, isUndefHI);
7196 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7197 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7198 /// just apply the bit to the vectors.
/// NOTE: It's not in our interest to start making a general purpose vectorizer
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
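/// For example, (build_vector (and a, 1), (and b, 2), (and c, 4), (and d, 8))
/// becomes (and (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).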
7202 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7203 SelectionDAG &DAG) {
7205 MVT VT = Op->getSimpleValueType(0);
7206 unsigned NumElems = VT.getVectorNumElements();
7207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7209 // Check that all elements have the same opcode.
7210 // TODO: Should we allow UNDEFS and if so how many?
7211 unsigned Opcode = Op->getOperand(0).getOpcode();
7212 for (unsigned i = 1; i < NumElems; ++i)
7213 if (Opcode != Op->getOperand(i).getOpcode())
7216 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7223 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7228 SmallVector<SDValue, 4> LHSElts, RHSElts;
7229 for (SDValue Elt : Op->ops()) {
7230 SDValue LHS = Elt.getOperand(0);
7231 SDValue RHS = Elt.getOperand(1);
7233 // We expect the canonicalized RHS operand to be the constant.
7234 if (!isa<ConstantSDNode>(RHS))
7236 LHSElts.push_back(LHS);
7237 RHSElts.push_back(RHS);
7240 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7241 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7242 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7245 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7246 /// functionality to do this, so it's all zeros, all ones, or some derivation
7247 /// that is cheap to calculate.
7248 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7249 const X86Subtarget &Subtarget) {
7251 MVT VT = Op.getSimpleValueType();
7253 // Vectors containing all zeros can be matched by pxor and xorps.
7254 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7255 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7256 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7257 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7260 return getZeroVector(VT, Subtarget, DAG, DL);
7263 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7264 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7265 // vpcmpeqd on 256-bit vectors.
7266 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7267 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7268 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7271 return getOnesVector(VT, Subtarget, DAG, DL);
7278 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7281 MVT VT = Op.getSimpleValueType();
7282 MVT ExtVT = VT.getVectorElementType();
7283 unsigned NumElems = Op.getNumOperands();
7285 // Generate vectors for predicate vectors.
7286 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7287 return LowerBUILD_VECTORvXi1(Op, DAG);
7289 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7290 return VectorConstant;
7292 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7293 if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
7295 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7296 return HorizontalOp;
7297 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7299 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7302 unsigned EVTBits = ExtVT.getSizeInBits();
7304 unsigned NumZero = 0;
7305 unsigned NumNonZero = 0;
7306 uint64_t NonZeros = 0;
7307 bool IsAllConstants = true;
7308 SmallSet<SDValue, 8> Values;
7309 for (unsigned i = 0; i < NumElems; ++i) {
7310 SDValue Elt = Op.getOperand(i);
7314 if (Elt.getOpcode() != ISD::Constant &&
7315 Elt.getOpcode() != ISD::ConstantFP)
7316 IsAllConstants = false;
7317 if (X86::isZeroNode(Elt))
7320 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7321 NonZeros |= ((uint64_t)1 << i);
7326 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7327 if (NumNonZero == 0)
7328 return DAG.getUNDEF(VT);
7330 // Special case for single non-zero, non-undef, element.
7331 if (NumNonZero == 1) {
7332 unsigned Idx = countTrailingZeros(NonZeros);
7333 SDValue Item = Op.getOperand(Idx);
7335 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7336 // the value are obviously zero, truncate the value to i32 and do the
7337 // insertion that way. Only do this if the value is non-constant or if the
7338 // value is a constant being inserted into element 0. It is cheaper to do
7339 // a constant pool load than it is to do a movd + shuffle.
7340 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7341 (!IsAllConstants || Idx == 0)) {
7342 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
7344 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7345 MVT VecVT = MVT::v4i32;
7347 // Truncate the value (which may itself be a constant) to i32, and
7348 // convert it to a vector with movd (S2V+shuffle to zero extend).
7349 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7350 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7351 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7352 Item, Idx * 2, true, Subtarget, DAG));
7356 // If we have a constant or non-constant insertion into the low element of
7357 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7358 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7359 // depending on what the source datatype is.
7362 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7364 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7365 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7366 assert((VT.is128BitVector() || VT.is256BitVector() ||
7367 VT.is512BitVector()) &&
7368 "Expected an SSE value type!");
7369 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7370 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7371 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
7376 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7377 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7378 if (VT.getSizeInBits() >= 256) {
7379 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7380 if (Subtarget.hasAVX()) {
7381 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7382 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7384 // Without AVX, we need to extend to a 128-bit vector and then
7385 // insert into the 256-bit vector.
7386 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7387 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7388 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7391 assert(VT.is128BitVector() && "Expected an SSE value type!");
7392 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7393 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7395 return DAG.getBitcast(VT, Item);
7399 // Is it a vector logical left shift?
7400 if (NumElems == 2 && Idx == 1 &&
7401 X86::isZeroNode(Op.getOperand(0)) &&
7402 !X86::isZeroNode(Op.getOperand(1))) {
7403 unsigned NumBits = VT.getSizeInBits();
7404 return getVShift(true, VT,
7405 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7406 VT, Op.getOperand(1)),
7407 NumBits/2, DAG, *this, dl);
7410 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7413 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7414 // is a non-constant being inserted into an element other than the low one,
7415 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
  // movd/movss) to move this into the low element, then shuffle it into
  // place.
7418 if (EVTBits == 32) {
7419 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7420 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7424 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7425 if (Values.size() == 1) {
7426 if (EVTBits == 32) {
7427 // Instead of a shuffle like this:
7428 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7429 // Check if it's possible to issue this instead.
      // shuffle (vload ptr), undef, <1, 1, 1, 1>
7431 unsigned Idx = countTrailingZeros(NonZeros);
7432 SDValue Item = Op.getOperand(Idx);
7433 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7434 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7439 // A vector full of immediates; various special cases are already
7440 // handled, so this is best done with a single constant-pool load.
7444 // See if we can use a vector load to get all of the elements.
7445 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7446 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7447 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7451 // For AVX-length vectors, build the individual 128-bit pieces and use
7452 // shuffles to put them in place.
7453 if (VT.is256BitVector() || VT.is512BitVector()) {
7454 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7456 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7458 // Build both the lower and upper subvector.
7460 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7461 SDValue Upper = DAG.getBuildVector(
7462 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7464 // Recreate the wider vector with the lower and upper part.
7465 if (VT.is256BitVector())
7466 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7467 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7470 // Let legalizer expand 2-wide build_vectors.
7471 if (EVTBits == 64) {
7472 if (NumNonZero == 1) {
7473 // One half is zero or undef.
7474 unsigned Idx = countTrailingZeros(NonZeros);
7475 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7476 Op.getOperand(Idx));
7477 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7482 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7483 if (EVTBits == 8 && NumElems == 16)
7484 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7485 DAG, Subtarget, *this))
7488 if (EVTBits == 16 && NumElems == 8)
7489 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7490 DAG, Subtarget, *this))
7493 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7494 if (EVTBits == 32 && NumElems == 4)
7495 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
7498 // If element VT is == 32 bits, turn it into a number of shuffles.
7499 if (NumElems == 4 && NumZero > 0) {
7500 SmallVector<SDValue, 8> Ops(NumElems);
7501 for (unsigned i = 0; i < 4; ++i) {
7502 bool isZero = !(NonZeros & (1ULL << i));
7504 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7506 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7509 for (unsigned i = 0; i < 2; ++i) {
7510 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7513 Ops[i] = Ops[i*2]; // Must be a zero vector.
7516 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7519 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7522 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7527 bool Reverse1 = (NonZeros & 0x3) == 2;
7528 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7532 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7533 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7535 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7538 if (Values.size() > 1 && VT.is128BitVector()) {
7539 // Check for a build vector from mostly shuffle plus few inserting.
7540 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7543 // For SSE 4.1, use insertps to put the high elements into the low element.
7544 if (Subtarget.hasSSE41()) {
7546 if (!Op.getOperand(0).isUndef())
7547 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7549 Result = DAG.getUNDEF(VT);
7551 for (unsigned i = 1; i < NumElems; ++i) {
7552 if (Op.getOperand(i).isUndef()) continue;
7553 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7554 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7559 // Otherwise, expand into a number of unpckl*, start by extending each of
7560 // our (non-undef) elements to the full vector width with the element in the
7561 // bottom slot of the vector (which generates no code for SSE).
7562 SmallVector<SDValue, 8> Ops(NumElems);
7563 for (unsigned i = 0; i < NumElems; ++i) {
7564 if (!Op.getOperand(i).isUndef())
7565 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7567 Ops[i] = DAG.getUNDEF(VT);
7570 // Next, we iteratively mix elements, e.g. for v4f32:
7571 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7572 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7573 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7574 unsigned EltStride = NumElems >> 1;
7575 while (EltStride != 0) {
7576 for (unsigned i = 0; i < EltStride; ++i) {
7577 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7578 // then it is safe to just drop this shuffle: V[i] is already in the
7579 // right place, the one element (since it's the first round) being
7580 // inserted as undef can be dropped. This isn't safe for successive
7581 // rounds because they will permute elements within both vectors.
7582 if (Ops[i+EltStride].isUndef() &&
7583 EltStride == NumElems/2)
7586 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7595 // 256-bit AVX can use the vinsertf128 instruction
7596 // to create 256-bit vectors from two other 128-bit ones.
7597 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7599 MVT ResVT = Op.getSimpleValueType();
7601 assert((ResVT.is256BitVector() ||
7602 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7604 SDValue V1 = Op.getOperand(0);
7605 SDValue V2 = Op.getOperand(1);
7606 unsigned NumElems = ResVT.getVectorNumElements();
7607 if (ResVT.is256BitVector())
7608 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7610 if (Op.getNumOperands() == 4) {
7611 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7612 ResVT.getVectorNumElements()/2);
7613 SDValue V3 = Op.getOperand(2);
7614 SDValue V4 = Op.getOperand(3);
7615 return concat256BitVectors(
7616 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7617 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7620 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7623 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7624 const X86Subtarget &Subtarget,
7625 SelectionDAG & DAG) {
7627 MVT ResVT = Op.getSimpleValueType();
7628 unsigned NumOfOperands = Op.getNumOperands();
7630 assert(isPowerOf2_32(NumOfOperands) &&
7631 "Unexpected number of operands in CONCAT_VECTORS");
7633 SDValue Undef = DAG.getUNDEF(ResVT);
7634 if (NumOfOperands > 2) {
7635 // Specialize the cases when all, or all but one, of the operands are undef.
7636 unsigned NumOfDefinedOps = 0;
7638 for (unsigned i = 0; i < NumOfOperands; i++)
7639 if (!Op.getOperand(i).isUndef()) {
7643 if (NumOfDefinedOps == 0)
7645 if (NumOfDefinedOps == 1) {
7646 unsigned SubVecNumElts =
7647 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7648 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7649 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7650 Op.getOperand(OpIdx), IdxVal);
7653 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7654 ResVT.getVectorNumElements()/2);
7655 SmallVector<SDValue, 2> Ops;
7656 for (unsigned i = 0; i < NumOfOperands/2; i++)
7657 Ops.push_back(Op.getOperand(i));
7658 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7660 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7661 Ops.push_back(Op.getOperand(i));
7662 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7663 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7667 SDValue V1 = Op.getOperand(0);
7668 SDValue V2 = Op.getOperand(1);
7669 unsigned NumElems = ResVT.getVectorNumElements();
7670 assert(V1.getValueType() == V2.getValueType() &&
7671 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7672 "Unexpected operands in CONCAT_VECTORS");
7674 if (ResVT.getSizeInBits() >= 16)
7675 return Op; // The operation is legal with KUNPCK
7677 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7678 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7679 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7680 if (IsZeroV1 && IsZeroV2)
7683 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7685 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7687 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7689 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7691 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7694 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7696 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7697 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
7700 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7701 const X86Subtarget &Subtarget,
7702 SelectionDAG &DAG) {
7703 MVT VT = Op.getSimpleValueType();
7704 if (VT.getVectorElementType() == MVT::i1)
7705 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7707 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7708 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7709 Op.getNumOperands() == 4)));
7711 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7712 // from two other 128-bit ones.
7714 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7715 return LowerAVXCONCAT_VECTORS(Op, DAG);
7718 //===----------------------------------------------------------------------===//
7719 // Vector shuffle lowering
7721 // This is an experimental code path for lowering vector shuffles on x86. It is
7722 // designed to handle arbitrary vector shuffles and blends, gracefully
7723 // degrading performance as necessary. It works hard to recognize idiomatic
7724 // shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
7727 //===----------------------------------------------------------------------===//
7729 /// \brief Tiny helper function to identify a no-op mask.
7731 /// This is a somewhat boring predicate function. It checks whether the mask
7732 /// array input, which is assumed to be a single-input shuffle mask of the kind
7733 /// used by the X86 shuffle instructions (not a fully general
7734 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7735 /// in-place shuffle are 'no-op's.
7736 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7737 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7738 assert(Mask[i] >= -1 && "Out of bound mask element!");
7739 if (Mask[i] >= 0 && Mask[i] != i)
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
7748 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7749 /// and we routinely test for these.
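/// For example, in a v8f32 shuffle the mask element 6 appearing in position 1
/// reads across the 128-bit lane boundary and therefore makes the mask
/// lane-crossing.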
7750 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7751 int LaneSize = 128 / VT.getScalarSizeInBits();
7752 int Size = Mask.size();
7753 for (int i = 0; i < Size; ++i)
7754 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7759 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7761 /// This checks a shuffle mask to see if it is performing the same
7762 /// lane-relative shuffle in each sub-lane. This trivially implies
7763 /// that it is also not lane-crossing. It may however involve a blend from the
7764 /// same lane of a second vector.
7766 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7767 /// non-trivial to compute in the face of undef lanes. The representation is
7768 /// suitable for use with existing 128-bit shuffles as entries from the second
7769 /// vector have been remapped to [LaneSize, 2*LaneSize).
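///
/// As a worked example (illustrative only): for a v8i32 mask
/// [0, 9, 2, 11, 4, 13, 6, 15] the same lane-relative pattern repeats in both
/// 128-bit lanes, so \p RepeatedMask becomes [0, 5, 2, 7], where entries >= 4
/// refer to the second vector.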
7770 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7771 ArrayRef<int> Mask,
7772 SmallVectorImpl<int> &RepeatedMask) {
7773 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7774 RepeatedMask.assign(LaneSize, -1);
7775 int Size = Mask.size();
7776 for (int i = 0; i < Size; ++i) {
7777 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
7778 if (Mask[i] < 0)
7779 continue;
7780 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7781 // This entry crosses lanes, so there is no way to model this shuffle.
7782 return false;
7784 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7785 // Adjust second vector indices to start at LaneSize instead of Size.
7786 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7787 : Mask[i] % LaneSize + LaneSize;
7788 if (RepeatedMask[i % LaneSize] < 0)
7789 // This is the first non-undef entry in this slot of a 128-bit lane.
7790 RepeatedMask[i % LaneSize] = LocalM;
7791 else if (RepeatedMask[i % LaneSize] != LocalM)
7792 // Found a mismatch with the repeated mask.
7793 return false;
7794 }
7795 return true;
7796 }
7798 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7799 static bool
7800 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7801 SmallVectorImpl<int> &RepeatedMask) {
7802 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7803 }
7805 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7806 static bool
7807 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7808 SmallVectorImpl<int> &RepeatedMask) {
7809 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7810 }
7812 /// Test whether a target shuffle mask is equivalent within each sub-lane.
7813 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
7814 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
7815 ArrayRef<int> Mask,
7816 SmallVectorImpl<int> &RepeatedMask) {
7817 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7818 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
7819 int Size = Mask.size();
7820 for (int i = 0; i < Size; ++i) {
7821 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
7822 if (Mask[i] == SM_SentinelUndef)
7823 continue;
7824 if (Mask[i] == SM_SentinelZero) {
7825 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
7826 return false;
7827 RepeatedMask[i % LaneSize] = SM_SentinelZero;
7828 continue;
7829 }
7830 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7831 // This entry crosses lanes, so there is no way to model this shuffle.
7832 return false;
7834 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7835 // Adjust second vector indices to start at LaneSize instead of Size.
7836 int LocalM =
7837 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
7838 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
7839 // This is the first non-undef entry in this slot of a 128-bit lane.
7840 RepeatedMask[i % LaneSize] = LocalM;
7841 else if (RepeatedMask[i % LaneSize] != LocalM)
7842 // Found a mismatch with the repeated mask.
7843 return false;
7844 }
7845 return true;
7846 }
7848 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7849 /// arguments.
7851 /// This is a fast way to test a shuffle mask against a fixed pattern:
7853 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
7855 /// It returns true if the mask is exactly as wide as the argument list, and
7856 /// each element of the mask is either -1 (signifying undef) or the value given
7857 /// in the argument.
7858 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7859 ArrayRef<int> ExpectedMask) {
7860 if (Mask.size() != ExpectedMask.size())
7861 return false;
7863 int Size = Mask.size();
7865 // If the values are build vectors, we can look through them to find
7866 // equivalent inputs that make the shuffles equivalent.
7867 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7868 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7870 for (int i = 0; i < Size; ++i) {
7871 assert(Mask[i] >= -1 && "Out of bound mask element!");
7872 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7873 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7874 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7875 if (!MaskBV || !ExpectedBV ||
7876 MaskBV->getOperand(Mask[i] % Size) !=
7877 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7878 return false;
7879 }
7880 }
7882 return true;
7883 }
7885 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7887 /// The masks must be exactly the same width.
7889 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7890 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7892 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7893 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7894 ArrayRef<int> ExpectedMask) {
7895 int Size = Mask.size();
7896 if (Size != (int)ExpectedMask.size())
7897 return false;
7899 for (int i = 0; i < Size; ++i)
7900 if (Mask[i] == SM_SentinelUndef)
7901 continue;
7902 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7903 return false;
7904 else if (Mask[i] != ExpectedMask[i])
7905 return false;
7907 return true;
7908 }
7910 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7912 /// This helper function produces an 8-bit shuffle immediate corresponding to
7913 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7914 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7915 /// example.
7917 /// NB: We rely heavily on "undef" masks preserving the input lane.
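///
/// As a worked example (illustrative only): the mask [3, 2, 1, 0] encodes as
/// 3 | (2 << 2) | (1 << 4) | (0 << 6) == 0x1B, the familiar "reverse" PSHUFD
/// immediate.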
7918 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7919 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7920 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7921 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7922 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7923 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7925 unsigned Imm = 0;
7926 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7927 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7928 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7929 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7931 return Imm;
7932 }
7933 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7934 SelectionDAG &DAG) {
7935 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7936 }
7938 /// \brief Compute whether each element of a shuffle is zeroable.
7940 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7941 /// Either it is an undef element in the shuffle mask, the element of the input
7942 /// referenced is undef, or the element of the input referenced is known to be
7943 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7944 /// as many lanes with this technique as possible to simplify the remaining
7945 /// shuffle.
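///
/// For illustration (example added here): with V2 a build_vector of zeros and
/// a v4i32 mask [0, 1, 4, -1], elements 2 and 3 are zeroable (element 2 reads
/// a known-zero input and element 3 is undef), while elements 0 and 1 are not.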
7946 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7947 SDValue V1, SDValue V2) {
7948 SmallBitVector Zeroable(Mask.size(), false);
7949 V1 = peekThroughBitcasts(V1);
7950 V2 = peekThroughBitcasts(V2);
7952 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7953 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
7955 int VectorSizeInBits = V1.getValueSizeInBits();
7956 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7957 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7959 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7960 int M = Mask[i];
7961 // Handle the easy cases.
7962 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7963 Zeroable[i] = true;
7964 continue;
7965 }
7967 // Determine shuffle input and normalize the mask.
7968 SDValue V = M < Size ? V1 : V2;
7969 M %= Size;
7971 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7972 if (V.getOpcode() != ISD::BUILD_VECTOR)
7973 continue;
7975 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
7976 // the (larger) source element must be UNDEF/ZERO.
7977 if ((Size % V.getNumOperands()) == 0) {
7978 int Scale = Size / V->getNumOperands();
7979 SDValue Op = V.getOperand(M / Scale);
7980 if (Op.isUndef() || X86::isZeroNode(Op))
7981 Zeroable[i] = true;
7982 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
7983 APInt Val = Cst->getAPIntValue();
7984 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7985 Val = Val.getLoBits(ScalarSizeInBits);
7986 Zeroable[i] = (Val == 0);
7987 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
7988 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7989 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7990 Val = Val.getLoBits(ScalarSizeInBits);
7991 Zeroable[i] = (Val == 0);
7992 }
7993 continue;
7994 }
7996 // If the BUILD_VECTOR has more elements then all the (smaller) source
7997 // elements must be UNDEF or ZERO.
7998 if ((V.getNumOperands() % Size) == 0) {
7999 int Scale = V->getNumOperands() / Size;
8000 bool AllZeroable = true;
8001 for (int j = 0; j < Scale; ++j) {
8002 SDValue Op = V.getOperand((M * Scale) + j);
8003 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8004 }
8005 Zeroable[i] = AllZeroable;
8006 continue;
8007 }
8008 }
8010 return Zeroable;
8011 }
8013 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8014 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8015 ArrayRef<int> Mask, SDValue V1,
8016 SDValue V2,
8017 const SmallBitVector &Zeroable,
8018 const X86Subtarget &Subtarget,
8019 SelectionDAG &DAG) {
8020 int Size = Mask.size();
8021 int LaneSize = 128 / VT.getScalarSizeInBits();
8022 const int NumBytes = VT.getSizeInBits() / 8;
8023 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8025 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8026 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8027 (Subtarget.hasBWI() && VT.is512BitVector()));
8029 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8030 // Sign bit set in i8 mask means zero element.
8031 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8033 SDValue V;
8034 for (int i = 0; i < NumBytes; ++i) {
8035 int M = Mask[i / NumEltBytes];
8036 if (M < 0) {
8037 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8038 continue;
8039 }
8040 if (Zeroable[i / NumEltBytes]) {
8041 PSHUFBMask[i] = ZeroMask;
8042 continue;
8043 }
8045 // We can only use a single input of V1 or V2.
8046 SDValue SrcV = (M >= Size ? V2 : V1);
8047 if (V && V != SrcV)
8048 return SDValue();
8049 V = SrcV;
8050 M %= Size;
8052 // PSHUFB can't cross lanes, ensure this doesn't happen.
8053 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8054 return SDValue();
8056 M = M % LaneSize;
8057 M = M * NumEltBytes + (i % NumEltBytes);
8058 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8059 }
8060 assert(V && "Failed to find a source input");
8062 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8063 return DAG.getBitcast(
8064 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8065 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8066 }
8068 // X86 has dedicated unpack instructions that can handle specific blend
8069 // operations: UNPCKH and UNPCKL.
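// For example (illustrative): on v4i32, UNPCKL corresponds to the mask
// [0, 4, 1, 5] and UNPCKH to [2, 6, 3, 7]; createUnpackShuffleMask builds
// these patterns and isShuffleEquivalent matches the incoming mask against
// them below.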
8070 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8071 ArrayRef<int> Mask, SDValue V1,
8072 SDValue V2, SelectionDAG &DAG) {
8073 SmallVector<int, 8> Unpckl;
8074 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8075 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8076 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8078 SmallVector<int, 8> Unpckh;
8079 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8080 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8081 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8083 // Commute and try again.
8084 ShuffleVectorSDNode::commuteMask(Unpckl);
8085 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8086 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8088 ShuffleVectorSDNode::commuteMask(Unpckh);
8089 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8090 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8092 return SDValue();
8093 }
8095 /// \brief Try to emit a bitmask instruction for a shuffle.
8097 /// This handles cases where we can model a blend exactly as a bitmask due to
8098 /// one of the inputs being zeroable.
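///
/// For illustration (example added here): a v4i32 shuffle whose mask is
/// [0, 1, zeroable, zeroable] taken from V1 can be emitted as
/// V1 & <-1, -1, 0, 0>, i.e. a single AND with a constant mask instead of a
/// blend.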
8099 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8100 SDValue V2, ArrayRef<int> Mask,
8101 const SmallBitVector &Zeroable,
8102 SelectionDAG &DAG) {
8103 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8104 MVT EltVT = VT.getVectorElementType();
8105 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8106 SDValue AllOnes =
8107 DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
8108 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8109 SDValue V;
8110 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8111 if (Zeroable[i])
8112 continue;
8113 if (Mask[i] % Size != i)
8114 return SDValue(); // Not a blend.
8115 if (!V)
8116 V = Mask[i] < Size ? V1 : V2;
8117 else if (V != (Mask[i] < Size ? V1 : V2))
8118 return SDValue(); // Can only let one input through the mask.
8120 VMaskOps[i] = AllOnes;
8121 }
8122 if (!V)
8123 return SDValue(); // No non-zeroable elements!
8125 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8126 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8127 }
8129 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8131 /// This is used as a fallback approach when first class blend instructions are
8132 /// unavailable. Currently it is only suitable for integer vectors, but could
8133 /// be generalized for floating point vectors if desirable.
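///
/// A minimal sketch of the idea (illustrative): for a v8i16 blend mask
/// [0, 9, 2, 11, 4, 13, 6, 15] we build M = <-1, 0, -1, 0, -1, 0, -1, 0> and
/// compute (V1 & M) | (~M & V2) with an AND/ANDNP/OR sequence.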
8134 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8135 SDValue V2, ArrayRef<int> Mask,
8136 SelectionDAG &DAG) {
8137 assert(VT.isInteger() && "Only supports integer vector types!");
8138 MVT EltVT = VT.getVectorElementType();
8139 int NumEltBits = EltVT.getSizeInBits();
8140 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8141 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
8142 EltVT);
8143 SmallVector<SDValue, 16> MaskOps;
8144 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8145 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8146 return SDValue(); // Shuffled input!
8147 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8148 }
8150 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8151 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8152 // We have to cast V2 around.
8153 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8154 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8155 DAG.getBitcast(MaskVT, V1Mask),
8156 DAG.getBitcast(MaskVT, V2)));
8157 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8158 }
8160 /// \brief Try to emit a blend instruction for a shuffle.
8162 /// This doesn't do any checks for the availability of instructions for blending
8163 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8164 /// be matched in the backend with the type given. What it does check for is
8165 /// that the shuffle mask is a blend, or convertible into a blend with zero.
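///
/// As a worked example (illustrative only): for v8i16 with mask
/// [0, 9, 2, 11, 4, 5, 6, 7], elements 1 and 3 come from V2, so the PBLENDW
/// immediate computed below is 0b00001010.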
8166 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8167 SDValue V2, ArrayRef<int> Original,
8168 const SmallBitVector &Zeroable,
8169 const X86Subtarget &Subtarget,
8170 SelectionDAG &DAG) {
8171 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8172 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8173 SmallVector<int, 8> Mask(Original.begin(), Original.end());
8174 bool ForceV1Zero = false, ForceV2Zero = false;
8176 // Attempt to generate the binary blend mask. If an input is zero then
8177 // we can use any lane.
8178 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8179 unsigned BlendMask = 0;
8180 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8181 int M = Mask[i];
8182 if (M < 0)
8183 continue;
8184 if (M == i)
8185 continue;
8186 if (M == i + Size) {
8187 BlendMask |= 1u << i;
8188 continue;
8189 }
8190 if (Zeroable[i]) {
8191 if (V1IsZero) {
8192 ForceV1Zero = true;
8193 Mask[i] = i;
8194 continue;
8195 }
8196 if (V2IsZero) {
8197 ForceV2Zero = true;
8198 BlendMask |= 1u << i;
8199 Mask[i] = i + Size;
8200 continue;
8201 }
8202 }
8203 return SDValue(); // Shuffled input!
8204 }
8206 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8207 if (ForceV1Zero)
8208 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8209 if (ForceV2Zero)
8210 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8212 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
8213 unsigned ScaledMask = 0;
8214 for (int i = 0; i != Size; ++i)
8215 if (BlendMask & (1u << i))
8216 for (int j = 0; j != Scale; ++j)
8217 ScaledMask |= 1u << (i * Scale + j);
8221 switch (VT.SimpleTy) {
8226 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8227 DAG.getConstant(BlendMask, DL, MVT::i8));
8231 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8235 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8236 // that instruction.
8237 if (Subtarget.hasAVX2()) {
8238 // Scale the blend by the number of 32-bit dwords per element.
8239 int Scale = VT.getScalarSizeInBits() / 32;
8240 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8241 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8242 V1 = DAG.getBitcast(BlendVT, V1);
8243 V2 = DAG.getBitcast(BlendVT, V2);
8244 return DAG.getBitcast(
8245 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8246 DAG.getConstant(BlendMask, DL, MVT::i8)));
8250 // For integer shuffles we need to expand the mask and cast the inputs to
8251 // v8i16s prior to blending.
8252 int Scale = 8 / VT.getVectorNumElements();
8253 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8254 V1 = DAG.getBitcast(MVT::v8i16, V1);
8255 V2 = DAG.getBitcast(MVT::v8i16, V2);
8256 return DAG.getBitcast(VT,
8257 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8258 DAG.getConstant(BlendMask, DL, MVT::i8)));
8262 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8263 SmallVector<int, 8> RepeatedMask;
8264 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8265 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8266 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8267 BlendMask = 0;
8268 for (int i = 0; i < 8; ++i)
8269 if (RepeatedMask[i] >= 8)
8270 BlendMask |= 1u << i;
8271 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8272 DAG.getConstant(BlendMask, DL, MVT::i8));
8278 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8279 "256-bit byte-blends require AVX2 support!");
8281 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8282 if (SDValue Masked =
8283 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8284 return Masked;
8286 // Scale the blend by the number of bytes per element.
8287 int Scale = VT.getScalarSizeInBits() / 8;
8289 // This form of blend is always done on bytes. Compute the byte vector
8290 // type.
8291 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8293 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8294 // mix of LLVM's code generator and the x86 backend. We tell the code
8295 // generator that boolean values in the elements of an x86 vector register
8296 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8297 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8298 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8299 // of the element (the remaining are ignored) and 0 in that high bit would
8300 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8301 // the LLVM model for boolean values in vector elements gets the relevant
8302 // bit set, it is set backwards and over constrained relative to x86's
8303 // actual model.
8304 SmallVector<SDValue, 32> VSELECTMask;
8305 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8306 for (int j = 0; j < Scale; ++j)
8307 VSELECTMask.push_back(
8308 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8309 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8310 MVT::i8));
8312 V1 = DAG.getBitcast(BlendVT, V1);
8313 V2 = DAG.getBitcast(BlendVT, V2);
8314 return DAG.getBitcast(
8315 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8316 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8317 }
8319 default:
8320 llvm_unreachable("Not a supported integer vector type!");
8321 }
8322 }
8324 /// \brief Try to lower as a blend of elements from two inputs followed by
8325 /// a single-input permutation.
8327 /// This matches the pattern where we can blend elements from two inputs and
8328 /// then reduce the shuffle to a single-input permutation.
8329 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8330 SDValue V1, SDValue V2,
8331 ArrayRef<int> Mask,
8332 SelectionDAG &DAG) {
8333 // We build up the blend mask while checking whether a blend is a viable way
8334 // to reduce the shuffle.
8335 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8336 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8338 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8339 if (Mask[i] < 0)
8340 continue;
8342 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8344 if (BlendMask[Mask[i] % Size] < 0)
8345 BlendMask[Mask[i] % Size] = Mask[i];
8346 else if (BlendMask[Mask[i] % Size] != Mask[i])
8347 return SDValue(); // Can't blend in the needed input!
8349 PermuteMask[i] = Mask[i] % Size;
8352 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8353 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8356 /// \brief Generic routine to decompose a shuffle and blend into independent
8357 /// blends and permutes.
8359 /// This matches the extremely common pattern for handling combined
8360 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8361 /// operations. It will try to pick the best arrangement of shuffles and
8362 /// blends.
8363 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8364 MVT VT, SDValue V1,
8365 SDValue V2,
8366 ArrayRef<int> Mask,
8367 SelectionDAG &DAG) {
8368 // Shuffle the input elements into the desired positions in V1 and V2 and
8369 // blend them together.
8370 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8371 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8372 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8373 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8374 if (Mask[i] >= 0 && Mask[i] < Size) {
8375 V1Mask[i] = Mask[i];
8376 BlendMask[i] = i;
8377 } else if (Mask[i] >= Size) {
8378 V2Mask[i] = Mask[i] - Size;
8379 BlendMask[i] = i + Size;
8380 }
8382 // Try to lower with the simpler initial blend strategy unless one of the
8383 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8384 // shuffle may be able to fold with a load or other benefit. However, when
8385 // we'll have to do 2x as many shuffles in order to achieve this, blending
8386 // first is a better strategy.
8387 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8388 if (SDValue BlendPerm =
8389 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8392 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8393 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8394 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8397 /// \brief Try to lower a vector shuffle as a rotation.
8399 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8400 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8401 ArrayRef<int> Mask) {
8402 int NumElts = Mask.size();
8404 // We need to detect various ways of spelling a rotation:
8405 // [11, 12, 13, 14, 15, 0, 1, 2]
8406 // [-1, 12, 13, 14, -1, -1, 1, -1]
8407 // [-1, -1, -1, -1, -1, -1, 1, 2]
8408 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8409 // [-1, 4, 5, 6, -1, -1, 9, -1]
8410 // [-1, 4, 5, 6, -1, -1, -1, -1]
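// (Illustrative note, not in the original comment: all six masks above
// describe a rotation of 3; e.g. in the first one element 0 reads input
// element 11, so StartIdx = 0 - (11 % 8) = -3 and the rotation is 3.)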
8411 int Rotation = 0;
8412 SDValue Lo, Hi;
8413 for (int i = 0; i < NumElts; ++i) {
8414 int M = Mask[i];
8415 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8416 "Unexpected mask index.");
8417 if (M < 0)
8418 continue;
8420 // Determine where a rotated vector would have started.
8421 int StartIdx = i - (M % NumElts);
8422 if (StartIdx == 0)
8423 // The identity rotation isn't interesting, stop.
8424 return -1;
8426 // If we found the tail of a vector the rotation must be the missing
8427 // front. If we found the head of a vector, it must be how much of the
8428 // vector was rotated off the top.
8429 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8431 if (Rotation == 0)
8432 Rotation = CandidateRotation;
8433 else if (Rotation != CandidateRotation)
8434 // The rotations don't match, so we can't match this mask.
8437 // Compute which value this mask is pointing at.
8438 SDValue MaskV = M < NumElts ? V1 : V2;
8440 // Compute which of the two target values this index should be assigned
8441 // to. This reflects whether the high elements are remaining or the low
8442 // elements are remaining.
8443 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8445 // Either set up this value if we've not encountered it before, or check
8446 // that it remains consistent.
8447 if (!TargetV)
8448 TargetV = MaskV;
8449 else if (TargetV != MaskV)
8450 // This may be a rotation, but it pulls from the inputs in some
8451 // unsupported interleaving.
8452 return -1;
8453 }
8455 // Check that we successfully analyzed the mask, and normalize the results.
8456 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8457 assert((Lo || Hi) && "Failed to find a rotated input vector!");
8458 if (!Lo)
8459 Lo = Hi;
8460 else if (!Hi)
8461 Hi = Lo;
8463 V1 = Lo;
8464 V2 = Hi;
8466 return Rotation;
8467 }
8469 /// \brief Try to lower a vector shuffle as a byte rotation.
8471 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8472 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8473 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8474 /// try to generically lower a vector shuffle through such a pattern. It
8475 /// does not check for the profitability of lowering either as PALIGNR or
8476 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8477 /// This matches shuffle vectors that look like:
8479 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8481 /// Essentially it concatenates V1 and V2, shifts right by some number of
8482 /// elements, and takes the low elements as the result. Note that while this is
8483 /// specified as a *right shift* because x86 is little-endian, it is a *left
8484 /// rotate* of the vector lanes.
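///
/// As a worked example (illustrative only): the v8i16 mask above matches a
/// rotation of 3 elements, so with SSSE3 it lowers to PALIGNR with a byte
/// immediate of 3 * 2 == 6.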
8485 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8486 ArrayRef<int> Mask) {
8487 // Don't accept any shuffles with zero elements.
8488 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8491 // PALIGNR works on 128-bit lanes.
8492 SmallVector<int, 16> RepeatedMask;
8493 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8494 return -1;
8496 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8497 if (Rotation <= 0)
8498 return -1;
8500 // PALIGNR rotates bytes, so we need to scale the
8501 // rotation based on how many bytes are in the vector lane.
8502 int NumElts = RepeatedMask.size();
8503 int Scale = 16 / NumElts;
8504 return Rotation * Scale;
8505 }
8507 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8508 SDValue V1, SDValue V2,
8509 ArrayRef<int> Mask,
8510 const X86Subtarget &Subtarget,
8511 SelectionDAG &DAG) {
8512 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
8514 SDValue Lo = V1, Hi = V2;
8515 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8516 if (ByteRotation <= 0)
8517 return SDValue();
8519 // Cast the inputs to i8 vector of correct length to match PALIGNR or
8520 // PSLLDQ/PSRLDQ.
8521 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8522 Lo = DAG.getBitcast(ByteVT, Lo);
8523 Hi = DAG.getBitcast(ByteVT, Hi);
8525 // SSSE3 targets can use the palignr instruction.
8526 if (Subtarget.hasSSSE3()) {
8527 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8528 "512-bit PALIGNR requires BWI instructions");
8529 return DAG.getBitcast(
8530 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
8531 DAG.getConstant(ByteRotation, DL, MVT::i8)));
8532 }
8534 assert(VT.is128BitVector() &&
8535 "Rotate-based lowering only supports 128-bit lowering!");
8536 assert(Mask.size() <= 16 &&
8537 "Can shuffle at most 16 bytes in a 128-bit vector!");
8538 assert(ByteVT == MVT::v16i8 &&
8539 "SSE2 rotate lowering only needed for v16i8!");
8541 // Default SSE2 implementation
8542 int LoByteShift = 16 - ByteRotation;
8543 int HiByteShift = ByteRotation;
8545 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
8546 DAG.getConstant(LoByteShift, DL, MVT::i8));
8547 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
8548 DAG.getConstant(HiByteShift, DL, MVT::i8));
8549 return DAG.getBitcast(VT,
8550 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
8551 }
8553 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
8555 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
8556 /// rotation of the concatenation of two vectors; this routine will
8557 /// try to generically lower a vector shuffle through such a pattern.
8559 /// Essentially it concatenates V1 and V2, shifts right by some number of
8560 /// elements, and takes the low elements as the result. Note that while this is
8561 /// specified as a *right shift* because x86 is little-endian, it is a *left
8562 /// rotate* of the vector lanes.
8563 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
8564 SDValue V1, SDValue V2,
8565 ArrayRef<int> Mask,
8566 const X86Subtarget &Subtarget,
8567 SelectionDAG &DAG) {
8568 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
8569 "Only 32-bit and 64-bit elements are supported!");
8571 // 128/256-bit vectors are only supported with VLX.
8572 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
8573 && "VLX required for 128/256-bit vectors");
8575 SDValue Lo = V1, Hi = V2;
8576 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
8577 if (Rotation <= 0)
8578 return SDValue();
8580 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
8581 DAG.getConstant(Rotation, DL, MVT::i8));
8582 }
8584 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
8586 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
8587 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
8588 /// matches elements from one of the input vectors shuffled to the left or
8589 /// right with zeroable elements 'shifted in'. It handles both the strictly
8590 /// bit-wise element shifts and the byte shift across an entire 128-bit double
8591 /// quad word lane.
8593 /// PSHL : (little-endian) left bit shift.
8594 /// [ zz, 0, zz, 2 ]
8595 /// [ -1, 4, zz, -1 ]
8596 /// PSRL : (little-endian) right bit shift.
8598 /// [ -1, -1, 7, zz]
8599 /// PSLLDQ : (little-endian) left byte shift
8600 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
8601 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
8602 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
8603 /// PSRLDQ : (little-endian) right byte shift
8604 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
8605 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
8606 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
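///
/// As a worked example (illustrative only): the first PSRLDQ mask above,
/// [ 5, 6, 7, zz, zz, zz, zz, zz] on v8i16, matches a right byte shift of
/// 5 * 16 / 8 == 10 bytes.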
8607 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
8608 unsigned ScalarSizeInBits,
8609 ArrayRef<int> Mask, int MaskOffset,
8610 const SmallBitVector &Zeroable,
8611 const X86Subtarget &Subtarget) {
8612 int Size = Mask.size();
8613 unsigned SizeInBits = Size * ScalarSizeInBits;
8615 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
8616 for (int i = 0; i < Size; i += Scale)
8617 for (int j = 0; j < Shift; ++j)
8618 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
8619 return false;
8621 return true;
8622 };
8624 auto MatchShift = [&](int Shift, int Scale, bool Left) {
8625 for (int i = 0; i != Size; i += Scale) {
8626 unsigned Pos = Left ? i + Shift : i;
8627 unsigned Low = Left ? i : i + Shift;
8628 unsigned Len = Scale - Shift;
8629 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
8630 return -1;
8631 }
8633 int ShiftEltBits = ScalarSizeInBits * Scale;
8634 bool ByteShift = ShiftEltBits > 64;
8635 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
8636 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
8637 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
8639 // Normalize the scale for byte shifts to still produce an i64 element
8640 // type.
8641 Scale = ByteShift ? Scale / 2 : Scale;
8643 // We need to round trip through the appropriate type for the shift.
8644 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
8645 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
8646 : MVT::getVectorVT(ShiftSVT, Size / Scale);
8647 return (int)ShiftAmt;
8648 };
8650 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
8651 // keep doubling the size of the integer elements up to that. We can
8652 // then shift the elements of the integer vector by whole multiples of
8653 // their width within the elements of the larger integer vector. Test each
8654 // multiple to see if we can find a match with the moved element indices
8655 // and that the shifted in elements are all zeroable.
8656 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
8657 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
8658 for (int Shift = 1; Shift != Scale; ++Shift)
8659 for (bool Left : {true, false})
8660 if (CheckZeros(Shift, Scale, Left)) {
8661 int ShiftAmt = MatchShift(Shift, Scale, Left);
8662 if (0 < ShiftAmt)
8663 return ShiftAmt;
8664 }
8666 // no match
8667 return -1;
8668 }
8670 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
8671 SDValue V2, ArrayRef<int> Mask,
8672 const SmallBitVector &Zeroable,
8673 const X86Subtarget &Subtarget,
8674 SelectionDAG &DAG) {
8675 int Size = Mask.size();
8676 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8682 // Try to match shuffle against V1 shift.
8683 int ShiftAmt = matchVectorShuffleAsShift(
8684 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
8686 // If V1 failed, try to match shuffle against V2 shift.
8689 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
8690 Mask, Size, Zeroable, Subtarget);
8697 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
8698 "Illegal integer vector type");
8699 V = DAG.getBitcast(ShiftVT, V);
8700 V = DAG.getNode(Opcode, DL, ShiftVT, V,
8701 DAG.getConstant(ShiftAmt, DL, MVT::i8));
8702 return DAG.getBitcast(VT, V);
8705 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
8706 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
8707 SDValue V2, ArrayRef<int> Mask,
8708 const SmallBitVector &Zeroable,
8709 SelectionDAG &DAG) {
8710 int Size = Mask.size();
8711 int HalfSize = Size / 2;
8712 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8713 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
8715 // Upper half must be undefined.
8716 if (!isUndefInRange(Mask, HalfSize, HalfSize))
8719 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
8720 // Remainder of lower half result is zero and upper half is all undef.
8721 auto LowerAsEXTRQ = [&]() {
8722 // Determine the extraction length from the part of the
8723 // lower half that isn't zeroable.
8724 int Len = HalfSize;
8725 for (; Len > 0; --Len)
8726 if (!Zeroable[Len - 1])
8727 break;
8728 assert(Len > 0 && "Zeroable shuffle mask");
8730 // Attempt to match first Len sequential elements from the lower half.
8733 for (int i = 0; i != Len; ++i) {
8737 SDValue &V = (M < Size ? V1 : V2);
8740 // The extracted elements must start at a valid index and all mask
8741 // elements must be in the lower half.
8742 if (i > M || M >= HalfSize)
8745 if (Idx < 0 || (Src == V && Idx == (M - i))) {
8756 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
8757 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8758 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8759 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
8760 DAG.getConstant(BitLen, DL, MVT::i8),
8761 DAG.getConstant(BitIdx, DL, MVT::i8));
8764 if (SDValue ExtrQ = LowerAsEXTRQ())
8765 return ExtrQ;
8767 // INSERTQ: Extract lowest Len elements from lower half of second source and
8768 // insert over first source, starting at Idx.
8769 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
8770 auto LowerAsInsertQ = [&]() {
8771 for (int Idx = 0; Idx != HalfSize; ++Idx) {
8774 // Attempt to match first source from mask before insertion point.
8775 if (isUndefInRange(Mask, 0, Idx)) {
8777 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
8779 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
8785 // Extend the extraction length looking to match both the insertion of
8786 // the second source and the remaining elements of the first.
8787 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8792 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8794 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8800 // Match the remaining elements of the lower half.
8801 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8803 } else if ((!Base || (Base == V1)) &&
8804 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8806 } else if ((!Base || (Base == V2)) &&
8807 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8814 // We may not have a base (first source) - this can safely be undefined.
8816 Base = DAG.getUNDEF(VT);
8818 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8819 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8820 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8821 DAG.getConstant(BitLen, DL, MVT::i8),
8822 DAG.getConstant(BitIdx, DL, MVT::i8));
8829 if (SDValue InsertQ = LowerAsInsertQ())
8830 return InsertQ;
8832 return SDValue();
8833 }
8835 /// \brief Lower a vector shuffle as a zero or any extension.
8837 /// Given a specific number of elements, element bit width, and extension
8838 /// stride, produce either a zero or any extension based on the available
8839 /// features of the subtarget. The extended elements are consecutive and
8840 /// can start from an offset element index in the input; to
8841 /// avoid excess shuffling the offset must either be in the bottom lane
8842 /// or at the start of a higher lane. All extended elements must be from
8843 /// the same lane.
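///
/// For illustration (example added here): a v16i8 mask
/// [0, zz, 1, zz, 2, zz, ..., 7, zz] is such an extension with Scale == 2 and
/// Offset == 0; on SSE4.1 the path below emits a single X86ISD::VZEXT
/// (PMOVZXBW) of the low half.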
8844 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8845 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8846 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8847 assert(Scale > 1 && "Need a scale to extend.");
8848 int EltBits = VT.getScalarSizeInBits();
8849 int NumElements = VT.getVectorNumElements();
8850 int NumEltsPerLane = 128 / EltBits;
8851 int OffsetLane = Offset / NumEltsPerLane;
8852 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8853 "Only 8, 16, and 32 bit elements can be extended.");
8854 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8855 assert(0 <= Offset && "Extension offset must be positive.");
8856 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8857 "Extension offset must be in the first lane or start an upper lane.");
8859 // Check that an index is in same lane as the base offset.
8860 auto SafeOffset = [&](int Idx) {
8861 return OffsetLane == (Idx / NumEltsPerLane);
8864 // Shift along an input so that the offset base moves to the first element.
8865 auto ShuffleOffset = [&](SDValue V) {
8869 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8870 for (int i = 0; i * Scale < NumElements; ++i) {
8871 int SrcIdx = i + Offset;
8872 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8874 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8877 // Found a valid zext mask! Try various lowering strategies based on the
8878 // input type and available ISA extensions.
8879 if (Subtarget.hasSSE41()) {
8880 // Not worth offseting 128-bit vectors if scale == 2, a pattern using
8881 // PUNPCK will catch this in a later shuffle match.
8882 if (Offset && Scale == 2 && VT.is128BitVector())
8884 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8885 NumElements / Scale);
8886 InputV = ShuffleOffset(InputV);
8888 // For 256-bit vectors, we only need the lower (128-bit) input half.
8889 // For 512-bit vectors, we only need the lower input half or quarter.
8890 if (VT.getSizeInBits() > 128)
8891 InputV = extractSubVector(InputV, 0, DAG, DL,
8892 std::max(128, (int)VT.getSizeInBits() / Scale));
8894 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8895 return DAG.getBitcast(VT, InputV);
8898 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8900 // For any extends we can cheat for larger element sizes and use shuffle
8901 // instructions that can fold with a load and/or copy.
8902 if (AnyExt && EltBits == 32) {
8903 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8905 return DAG.getBitcast(
8906 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8907 DAG.getBitcast(MVT::v4i32, InputV),
8908 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
8910 if (AnyExt && EltBits == 16 && Scale > 2) {
8911 int PSHUFDMask[4] = {Offset / 2, -1,
8912 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8913 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8914 DAG.getBitcast(MVT::v4i32, InputV),
8915 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8916 int PSHUFWMask[4] = {1, -1, -1, -1};
8917 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8918 return DAG.getBitcast(
8919 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8920 DAG.getBitcast(MVT::v8i16, InputV),
8921 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8924 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
8925 // to 64-bits.
8926 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8927 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8928 assert(VT.is128BitVector() && "Unexpected vector width!");
8930 int LoIdx = Offset * EltBits;
8931 SDValue Lo = DAG.getBitcast(
8932 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8933 DAG.getConstant(EltBits, DL, MVT::i8),
8934 DAG.getConstant(LoIdx, DL, MVT::i8)));
8936 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8937 !SafeOffset(Offset + 1))
8938 return DAG.getBitcast(VT, Lo);
8940 int HiIdx = (Offset + 1) * EltBits;
8941 SDValue Hi = DAG.getBitcast(
8942 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8943 DAG.getConstant(EltBits, DL, MVT::i8),
8944 DAG.getConstant(HiIdx, DL, MVT::i8)));
8945 return DAG.getBitcast(VT,
8946 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8949 // If this would require more than 2 unpack instructions to expand, use
8950 // pshufb when available. We can only use more than 2 unpack instructions
8951 // when zero extending i8 elements which also makes it easier to use pshufb.
8952 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8953 assert(NumElements == 16 && "Unexpected byte vector width!");
8954 SDValue PSHUFBMask[16];
8955 for (int i = 0; i < 16; ++i) {
8956 int Idx = Offset + (i / Scale);
8957 PSHUFBMask[i] = DAG.getConstant(
8958 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8960 InputV = DAG.getBitcast(MVT::v16i8, InputV);
8961 return DAG.getBitcast(
8962 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8963 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8966 // If we are extending from an offset, ensure we start on a boundary that
8967 // we can unpack from.
8968 int AlignToUnpack = Offset % (NumElements / Scale);
8969 if (AlignToUnpack) {
8970 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8971 for (int i = AlignToUnpack; i < NumElements; ++i)
8972 ShMask[i - AlignToUnpack] = i;
8973 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8974 Offset -= AlignToUnpack;
8977 // Otherwise emit a sequence of unpacks.
8978 do {
8979 unsigned UnpackLoHi = X86ISD::UNPCKL;
8980 if (Offset >= (NumElements / 2)) {
8981 UnpackLoHi = X86ISD::UNPCKH;
8982 Offset -= (NumElements / 2);
8985 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8986 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8987 : getZeroVector(InputVT, Subtarget, DAG, DL);
8988 InputV = DAG.getBitcast(InputVT, InputV);
8989 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8990 Scale /= 2;
8991 EltBits *= 2;
8992 NumElements /= 2;
8993 } while (Scale > 1);
8994 return DAG.getBitcast(VT, InputV);
8995 }
8997 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
8999 /// This routine will try to do everything in its power to cleverly lower
9000 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9001 /// check for the profitability of this lowering, it tries to aggressively
9002 /// match this pattern. It will use all of the micro-architectural details it
9003 /// can to emit an efficient lowering. It handles both blends with all-zero
9004 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9005 /// masking out later).
9007 /// The reason we have dedicated lowering for zext-style shuffles is that they
9008 /// are both incredibly common and often quite performance sensitive.
9009 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9010 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9011 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9012 SelectionDAG &DAG) {
9013 int Bits = VT.getSizeInBits();
9014 int NumLanes = Bits / 128;
9015 int NumElements = VT.getVectorNumElements();
9016 int NumEltsPerLane = NumElements / NumLanes;
9017 assert(VT.getScalarSizeInBits() <= 32 &&
9018 "Exceeds 32-bit integer zero extension limit");
9019 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9021 // Define a helper function to check a particular ext-scale and lower to it if
9023 auto Lower = [&](int Scale) -> SDValue {
9028 for (int i = 0; i < NumElements; ++i) {
9031 continue; // Valid anywhere but doesn't tell us anything.
9032 if (i % Scale != 0) {
9033 // Each of the extended elements need to be zeroable.
9037 // We no longer are in the anyext case.
9042 // Each of the base elements needs to be consecutive indices into the
9043 // same input vector.
9044 SDValue V = M < NumElements ? V1 : V2;
9045 M = M % NumElements;
9048 Offset = M - (i / Scale);
9049 } else if (InputV != V)
9050 return SDValue(); // Flip-flopping inputs.
9052 // Offset must start in the lowest 128-bit lane or at the start of an
9053 // upper lane.
9054 // FIXME: Is it ever worth allowing a negative base offset?
9055 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9056 (Offset % NumEltsPerLane) == 0))
9059 // If we are offsetting, all referenced entries must come from the same
9060 // lane.
9061 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9064 if ((M % NumElements) != (Offset + (i / Scale)))
9065 return SDValue(); // Non-consecutive strided elements.
9069 // If we fail to find an input, we have a zero-shuffle which should always
9070 // have already been handled.
9071 // FIXME: Maybe handle this here in case during blending we end up with one?
9075 // If we are offsetting, don't extend if we only match a single input, we
9076 // can always do better by using a basic PSHUF or PUNPCK.
9077 if (Offset != 0 && Matches < 2)
9080 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9081 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9084 // The widest scale possible for extending is to a 64-bit integer.
9085 assert(Bits % 64 == 0 &&
9086 "The number of bits in a vector must be divisible by 64 on x86!");
9087 int NumExtElements = Bits / 64;
9089 // Each iteration, try extending the elements half as much, but into twice as
9090 // many elements.
9091 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9092 assert(NumElements % NumExtElements == 0 &&
9093 "The input vector size must be divisible by the extended size.");
9094 if (SDValue V = Lower(NumElements / NumExtElements))
9098 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9102 // Returns one of the source operands if the shuffle can be reduced to a
9103 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9104 auto CanZExtLowHalf = [&]() {
9105 for (int i = NumElements / 2; i != NumElements; ++i)
9106 if (!Zeroable[i])
9107 return SDValue();
9108 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9109 return V1;
9110 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9111 return V2;
9112 return SDValue();
9113 };
9115 if (SDValue V = CanZExtLowHalf()) {
9116 V = DAG.getBitcast(MVT::v2i64, V);
9117 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9118 return DAG.getBitcast(VT, V);
9121 // No viable ext lowering found.
9122 return SDValue();
9123 }
9125 /// \brief Try to get a scalar value for a specific element of a vector.
9127 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9128 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9129 SelectionDAG &DAG) {
9130 MVT VT = V.getSimpleValueType();
9131 MVT EltVT = VT.getVectorElementType();
9132 V = peekThroughBitcasts(V);
9134 // If the bitcasts shift the element size, we can't extract an equivalent
9135 // element from it.
9136 MVT NewVT = V.getSimpleValueType();
9137 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9140 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9141 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9142 // Ensure the scalar operand is the same size as the destination.
9143 // FIXME: Add support for scalar truncation where possible.
9144 SDValue S = V.getOperand(Idx);
9145 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9146 return DAG.getBitcast(EltVT, S);
9147 }
9149 return SDValue();
9150 }
9152 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9154 /// This is particularly important because the set of instructions varies
9155 /// significantly based on whether the operand is a load or not.
9156 static bool isShuffleFoldableLoad(SDValue V) {
9157 V = peekThroughBitcasts(V);
9158 return ISD::isNON_EXTLoad(V.getNode());
9159 }
9161 /// \brief Try to lower insertion of a single element into a zero vector.
9163 /// This is a common pattern for which we have especially efficient patterns to lower
9164 /// across all subtarget feature sets.
9165 static SDValue lowerVectorShuffleAsElementInsertion(
9166 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9167 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9168 SelectionDAG &DAG) {
9169 MVT ExtVT = VT;
9170 MVT EltVT = VT.getVectorElementType();
9172 int V2Index =
9173 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9174 Mask.begin();
9175 bool IsV1Zeroable = true;
9176 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9177 if (i != V2Index && !Zeroable[i]) {
9178 IsV1Zeroable = false;
9182 // Check for a single input from a SCALAR_TO_VECTOR node.
9183 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9184 // all the smarts here sunk into that routine. However, the current
9185 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9186 // vector shuffle lowering is dead.
9187 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9189 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9190 // We need to zext the scalar if it is smaller than an i32.
9191 V2S = DAG.getBitcast(EltVT, V2S);
9192 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9193 // Using zext to expand a narrow element won't work for non-zero
9198 // Zero-extend directly to i32.
9200 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9202 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9203 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9204 EltVT == MVT::i16) {
9205 // Either not inserting from the low element of the input or the input
9206 // element size is too small to use VZEXT_MOVL to clear the high bits.
9210 if (!IsV1Zeroable) {
9211 // If V1 can't be treated as a zero vector we have fewer options to lower
9212 // this. We can't support integer vectors or non-zero targets cheaply, and
9213 // the V1 elements can't be permuted in any way.
9214 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9215 if (!VT.isFloatingPoint() || V2Index != 0)
9217 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9218 V1Mask[V2Index] = -1;
9219 if (!isNoopShuffleMask(V1Mask))
9221 // This is essentially a special case blend operation, but if we have
9222 // general purpose blend operations, they are always faster. Bail and let
9223 // the rest of the lowering handle these as blends.
9224 if (Subtarget.hasSSE41())
9227 // Otherwise, use MOVSD or MOVSS.
9228 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9229 "Only two types of floating point element types to handle!");
9230 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9234 // This lowering only works for the low element with floating point vectors.
9235 if (VT.isFloatingPoint() && V2Index != 0)
9238 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9240 V2 = DAG.getBitcast(VT, V2);
9243 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9244 // the desired position. Otherwise it is more efficient to do a vector
9245 // shift left. We know that we can do a vector shift left because all
9246 // the inputs are zero.
9247 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9248 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9249 V2Shuffle[V2Index] = 0;
9250 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9252 V2 = DAG.getBitcast(MVT::v16i8, V2);
9254 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9255 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9256 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9257 DAG.getDataLayout(), VT)));
9258 V2 = DAG.getBitcast(VT, V2);
9264 /// Try to lower broadcast of a single - truncated - integer element,
9265 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9267 /// This assumes we have AVX2.
9268 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9269 SDValue V0, int BroadcastIdx,
9270 const X86Subtarget &Subtarget,
9271 SelectionDAG &DAG) {
9272 assert(Subtarget.hasAVX2() &&
9273 "We can only lower integer broadcasts with AVX2!");
9275 EVT EltVT = VT.getVectorElementType();
9276 EVT V0VT = V0.getValueType();
9278 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9279 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9281 EVT V0EltVT = V0VT.getVectorElementType();
9282 if (!V0EltVT.isInteger())
9285 const unsigned EltSize = EltVT.getSizeInBits();
9286 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9288 // This is only a truncation if the original element type is larger.
9289 if (V0EltSize <= EltSize)
9292 assert(((V0EltSize % EltSize) == 0) &&
9293 "Scalar type sizes must all be powers of 2 on x86!");
9295 const unsigned V0Opc = V0.getOpcode();
9296 const unsigned Scale = V0EltSize / EltSize;
9297 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9299 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9300 V0Opc != ISD::BUILD_VECTOR)
9303 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9305 // If we're extracting non-least-significant bits, shift so we can truncate.
9306 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9307 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9308 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9309 if (const int OffsetIdx = BroadcastIdx % Scale)
9310 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9311 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9313 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9314 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9315 }
9317 /// \brief Try to lower broadcast of a single element.
9319 /// For convenience, this code also bundles all of the subtarget feature set
9320 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9321 /// a convenient way to factor it out.
9322 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
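///
/// For illustration (example added here): a v4f32 shuffle with the splat mask
/// [0, 0, 0, 0] is matched with BroadcastIdx == 0 and, given AVX2 or a
/// foldable load on AVX, becomes a single X86ISD::VBROADCAST (VBROADCASTSS).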
9323 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9324 SDValue V1, SDValue V2,
9325 ArrayRef<int> Mask,
9326 const X86Subtarget &Subtarget,
9327 SelectionDAG &DAG) {
9328 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9329 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9330 (Subtarget.hasAVX2() && VT.isInteger())))
9333 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9334 // we can only broadcast from a register with AVX2.
9335 unsigned NumElts = Mask.size();
9336 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9337 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9339 // Check that the mask is a broadcast.
9340 int BroadcastIdx = -1;
9341 for (int i = 0; i != (int)NumElts; ++i) {
9342 SmallVector<int, 8> BroadcastMask(NumElts, i);
9343 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9344 BroadcastIdx = i;
9345 break;
9346 }
9347 }
9349 if (BroadcastIdx < 0)
9350 return SDValue();
9351 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9352 "a sorted mask where the broadcast "
9353 "comes from V1.");
9355 // Go up the chain of (vector) values to find a scalar load that we can
9356 // combine with the broadcast.
9359 switch (V.getOpcode()) {
9360 case ISD::BITCAST: {
9361 SDValue VSrc = V.getOperand(0);
9362 MVT SrcVT = VSrc.getSimpleValueType();
9363 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9368 case ISD::CONCAT_VECTORS: {
9369 int OperandSize = Mask.size() / V.getNumOperands();
9370 V = V.getOperand(BroadcastIdx / OperandSize);
9371 BroadcastIdx %= OperandSize;
9374 case ISD::INSERT_SUBVECTOR: {
9375 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9376 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9380 int BeginIdx = (int)ConstantIdx->getZExtValue();
9382 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9383 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9384 BroadcastIdx -= BeginIdx;
9395 // Check if this is a broadcast of a scalar. We special case lowering
9396 // for scalars so that we can more effectively fold with loads.
9397 // First, look through bitcast: if the original value has a larger element
9398 // type than the shuffle, the broadcast element is in essence truncated.
9399 // Make that explicit to ease folding.
9400 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9401 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9402 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9403 return TruncBroadcast;
9405 MVT BroadcastVT = VT;
9407 // Peek through any bitcast (only useful for loads).
9408 SDValue BC = peekThroughBitcasts(V);
9410 // Also check the simpler case, where we can directly reuse the scalar.
9411 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9412 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9413 V = V.getOperand(BroadcastIdx);
9415 // If we can't broadcast from a register, check that the input is a load.
9416 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9418 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9419 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9420 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9421 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9422 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9425 // If we are broadcasting a load that is only used by the shuffle
9426 // then we can reduce the vector load to the broadcasted scalar load.
9427 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9428 SDValue BaseAddr = Ld->getOperand(1);
9429 EVT SVT = BroadcastVT.getScalarType();
9430 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9431 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9432 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9433 DAG.getMachineFunction().getMachineMemOperand(
9434 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9436 // Make sure the newly-created LOAD is in the same position as Ld in
9437 // terms of dependency. We create a TokenFactor for Ld and V,
9438 // and update uses of Ld's output chain to use the TokenFactor.
9439 if (Ld->hasAnyUseOfValue(1)) {
9440 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9441 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9442 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9443 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9444 SDValue(V.getNode(), 1));
9446 } else if (!BroadcastFromReg) {
9447 // We can't broadcast from a vector register.
9449 } else if (BroadcastIdx != 0) {
9450 // We can only broadcast from the zero-element of a vector register,
9451 // but it can be advantageous to broadcast from the zero-element of a subvector.
9453 if (!VT.is256BitVector() && !VT.is512BitVector())
9456 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9457 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9460 // Only broadcast the zero-element of a 128-bit subvector.
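// For example, in a v8f32 the 128-bit subvectors start at elements 0 and 4, so
// only those broadcast indices can be handled by extracting a subvector here.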
9461 unsigned EltSize = VT.getScalarSizeInBits();
9462 if (((BroadcastIdx * EltSize) % 128) != 0)
9465 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
9466 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
9467 DAG.getIntPtrConstant(BroadcastIdx, DL));
9470 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9471 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9472 DAG.getBitcast(MVT::f64, V));
9474 // Bitcast back to the same scalar type as BroadcastVT.
9475 MVT SrcVT = V.getSimpleValueType();
9476 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9477 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9478 "Unexpected vector element size");
9479 if (SrcVT.isVector()) {
9480 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9481 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
9483 SrcVT = BroadcastVT.getScalarType();
9485 V = DAG.getBitcast(SrcVT, V);
9488 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9489 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9490 V = DAG.getBitcast(MVT::f64, V);
9491 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
9492 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
9495 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
9498 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9499 // INSERTPS when the V1 elements are already in the correct locations
9500 // because otherwise we can just always use two SHUFPS instructions which
9501 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9502 // perform INSERTPS if a single V1 element is out of place and all V2
9503 // elements are zeroable.
9504 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
9505 unsigned &InsertPSMask,
9506 const SmallBitVector &Zeroable,
9508 SelectionDAG &DAG) {
9509 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9510 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9511 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9513 // Attempt to match INSERTPS with one element from VA or VB being
9514 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask are updated accordingly.
9516 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
9517 ArrayRef<int> CandidateMask) {
9519 int VADstIndex = -1;
9520 int VBDstIndex = -1;
9521 bool VAUsedInPlace = false;
9523 for (int i = 0; i < 4; ++i) {
9524 // Synthesize a zero mask from the zeroable elements (includes undefs).
9530 // Flag if we use any VA inputs in place.
9531 if (i == CandidateMask[i]) {
9532 VAUsedInPlace = true;
9536 // We can only insert a single non-zeroable element.
9537 if (VADstIndex >= 0 || VBDstIndex >= 0)
9540 if (CandidateMask[i] < 4) {
9541 // VA input out of place for insertion.
9544 // VB input for insertion.
9549 // Don't bother if we have no (non-zeroable) element for insertion.
9550 if (VADstIndex < 0 && VBDstIndex < 0)
9553 // Determine element insertion src/dst indices. The src index is from the
9554 // start of the inserted vector, not the start of the concatenated vector.
9555 unsigned VBSrcIndex = 0;
9556 if (VADstIndex >= 0) {
9557 // If we have a VA input out of place, we use VA as the V2 element
9558 // insertion and don't use the original V2 at all.
9559 VBSrcIndex = CandidateMask[VADstIndex];
9560 VBDstIndex = VADstIndex;
9563 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
9566 // If no V1 inputs are used in place, then the result is created only from
9567 // the zero mask and the V2 insertion - so remove V1 dependency.
9569 VA = DAG.getUNDEF(MVT::v4f32);
9571 // Update V1, V2 and InsertPSMask accordingly.
9575 // Insert the V2 element into the desired position.
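// The INSERTPS immediate encodes the source element in bits [7:6], the
// destination element in bits [5:4], and the zero mask in bits [3:0].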
9576 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
9577 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9581 if (matchAsInsertPS(V1, V2, Mask))
9584 // Commute and try again.
9585 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
9586 ShuffleVectorSDNode::commuteMask(CommutedMask);
9587 if (matchAsInsertPS(V2, V1, CommutedMask))
9593 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
9594 SDValue V2, ArrayRef<int> Mask,
9595 const SmallBitVector &Zeroable,
9596 SelectionDAG &DAG) {
9597 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9598 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9600 // Attempt to match the insertps pattern.
9601 unsigned InsertPSMask;
9602 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
9605 // Insert the V2 element into the desired position.
9606 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9607 DAG.getConstant(InsertPSMask, DL, MVT::i8));
9610 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
9611 /// UNPCK instruction.
9613 /// This specifically targets cases where we end up alternating between
9614 /// the two inputs, and so can permute them into something that feeds a single
9615 /// UNPCK instruction. Note that this routine only targets integer vectors
9616 /// because for floating point vectors we have a generalized SHUFPS lowering
9617 /// strategy that handles everything that doesn't *exactly* match an unpack,
9618 /// making this clever lowering unnecessary.
9619 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
9620 SDValue V1, SDValue V2,
9622 SelectionDAG &DAG) {
9623 assert(!VT.isFloatingPoint() &&
9624 "This routine only supports integer vectors.");
9625 assert(VT.is128BitVector() &&
9626 "This routine only works on 128-bit vectors.");
9627 assert(!V2.isUndef() &&
9628 "This routine should only be used when blending two inputs.");
9629 assert(Mask.size() >= 2 && "Single element masks are invalid.");
9631 int Size = Mask.size();
9634 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
9636 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
9638 bool UnpackLo = NumLoInputs >= NumHiInputs;
9640 auto TryUnpack = [&](int ScalarSize, int Scale) {
9641 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
9642 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
9644 for (int i = 0; i < Size; ++i) {
9648 // Each element of the unpack contains Scale elements from this mask.
9649 int UnpackIdx = i / Scale;
9651 // We only handle the case where V1 feeds the first slots of the unpack.
9652 // We rely on canonicalization to ensure this is the case.
9653 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
9656 // Setup the mask for this input. The indexing is tricky as we have to
9657 // handle the unpack stride.
9658 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
9659 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
9663 // If we will have to shuffle both inputs to use the unpack, check whether
9664 // we can just unpack first and shuffle the result. If so, skip this unpack.
9665 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
9666 !isNoopShuffleMask(V2Mask))
9669 // Shuffle the inputs into place.
9670 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9671 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9673 // Cast the inputs to the type we will use to unpack them.
9674 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
9675 V1 = DAG.getBitcast(UnpackVT, V1);
9676 V2 = DAG.getBitcast(UnpackVT, V2);
9678 // Unpack the inputs and cast the result back to the desired type.
9679 return DAG.getBitcast(
9680 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9684 // We try each unpack width from the largest to the smallest, looking for one
9685 // that fits this mask.
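// For a v8i16 shuffle, for instance, this tries v2i64, then v4i32, then v8i16
// unpacks.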
9686 int OrigScalarSize = VT.getScalarSizeInBits();
9687 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
9688 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
9691 // If none of the unpack-rooted lowerings worked (or were profitable) try an initial unpack.
9693 if (NumLoInputs == 0 || NumHiInputs == 0) {
9694 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
9695 "We have to have *some* inputs!");
9696 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
9698 // FIXME: We could consider the total complexity of the permute of each
9699 // possible unpacking. Or at the least we should consider how many
9700 // half-crossings are created.
9701 // FIXME: We could consider commuting the unpacks.
9703 SmallVector<int, 32> PermMask((unsigned)Size, -1);
9704 for (int i = 0; i < Size; ++i) {
9708 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
9711 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
9713 return DAG.getVectorShuffle(
9714 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
9716 DAG.getUNDEF(VT), PermMask);
9722 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
9724 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
9725 /// support for floating point shuffles but not integer shuffles. These
9726 /// instructions will incur a domain crossing penalty on some chips though so
9727 // it is better to avoid lowering through this for integer vectors where possible.
9729 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9730 const SmallBitVector &Zeroable,
9731 SDValue V1, SDValue V2,
9732 const X86Subtarget &Subtarget,
9733 SelectionDAG &DAG) {
9734 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9735 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9736 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9739 // Check for being able to broadcast a single element.
9740 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9741 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
9744 // Straight shuffle of a single input vector. Simulate this by using the
9745 // single input as both of the "inputs" to this instruction.
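// The SHUFPD immediate uses one bit per destination lane: bit 0 selects
// element 0 or 1 of the first source for the low lane, and bit 1 does the
// same with the second source for the high lane. With a single input, V1 is
// used for both sources.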
9746 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
9748 if (Subtarget.hasAVX()) {
9749 // If we have AVX, we can use VPERMILPS which will allow folding a load
9750 // into the shuffle.
9751 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
9752 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9756 X86ISD::SHUFP, DL, MVT::v2f64,
9757 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9758 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9759 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9761 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
9762 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
9764 // If we have a single input, insert that into V1 if we can do so cheaply.
9765 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
9766 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9767 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9769 // Try inverting the insertion since for v2 masks it is easy to do and we
9770 // can't reliably sort the mask one way or the other.
9771 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
9772 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
9773 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9774 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9778 // Try to use one of the special instruction patterns to handle two common
9779 // blend patterns if a zero-blend above didn't work.
9780 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
9781 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
9782 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
9783 // We can either use a special instruction to load over the low double or
9784 // to move just the low double.
9786 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
9788 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
9790 if (Subtarget.hasSSE41())
9791 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
9792 Zeroable, Subtarget, DAG))
9795 // Use dedicated unpack instructions for masks that match their pattern.
9797 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
9800 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
9801 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
9802 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9805 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
9807 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
9808 /// the integer unit to minimize domain crossing penalties. However, for blends
9809 // it falls back to the floating point shuffle operation with appropriate bit math.
9811 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9812 const SmallBitVector &Zeroable,
9813 SDValue V1, SDValue V2,
9814 const X86Subtarget &Subtarget,
9815 SelectionDAG &DAG) {
9816 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9817 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9818 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9821 // Check for being able to broadcast a single element.
9822 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9823 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9826 // Straight shuffle of a single input vector. For everything from SSE2
9827 // onward this has a single fast instruction with no scary immediates.
9828 // We have to map the mask as it is actually a v4i32 shuffle instruction.
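// For example, a v2i64 mask of <1, 0> widens to the v4i32 PSHUFD mask <2, 3, 0, 1>.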
9829 V1 = DAG.getBitcast(MVT::v4i32, V1);
9830 int WidenedMask[4] = {
9831 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9832 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9833 return DAG.getBitcast(
9835 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9836 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9838 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9839 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9840 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9841 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9843 // If we have a blend of two same-type PACKUS operations and the blend aligns
9844 // with the low and high halves, we can just merge the PACKUS operations.
9845 // This is particularly important as it lets us merge shuffles that this
9846 // routine itself creates.
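// For example, with a <1, 2> mask this folds to
// PACKUS(V1Pack.getOperand(1), V2Pack.getOperand(0)).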
9847 auto GetPackNode = [](SDValue V) {
9848 V = peekThroughBitcasts(V);
9849 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9851 if (SDValue V1Pack = GetPackNode(V1))
9852 if (SDValue V2Pack = GetPackNode(V2)) {
9853 EVT PackVT = V1Pack.getValueType();
9854 if (PackVT == V2Pack.getValueType())
9855 return DAG.getBitcast(MVT::v2i64,
9856 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9857 Mask[0] == 0 ? V1Pack.getOperand(0)
9858 : V1Pack.getOperand(1),
9859 Mask[1] == 2 ? V2Pack.getOperand(0)
9860 : V2Pack.getOperand(1)));
9863 // Try to use shift instructions.
9864 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9865 Zeroable, Subtarget, DAG))
9868 // When loading a scalar and then shuffling it into a vector we can often do
9869 // the insertion cheaply.
9870 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9871 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9873 // Try inverting the insertion since for v2 masks it is easy to do and we
9874 // can't reliably sort the mask one way or the other.
9875 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9876 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9877 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9880 // We have different paths for blend lowering, but they all must use the
9881 // *exact* same predicate.
9882 bool IsBlendSupported = Subtarget.hasSSE41();
9883 if (IsBlendSupported)
9884 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9885 Zeroable, Subtarget, DAG))
9888 // Use dedicated unpack instructions for masks that match their pattern.
9890 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9893 // Try to use byte rotation instructions.
9894 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9895 if (Subtarget.hasSSSE3())
9896 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9897 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9900 // If we have direct support for blends, we should lower by decomposing into
9901 // a permute. That will be faster than the domain cross.
9902 if (IsBlendSupported)
9903 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9906 // We implement this with SHUFPD which is pretty lame because it will likely
9907 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9908 // However, all the alternatives are still more cycles and newer chips don't
9909 // have this problem. It would be really nice if x86 had better shuffles here.
9910 V1 = DAG.getBitcast(MVT::v2f64, V1);
9911 V2 = DAG.getBitcast(MVT::v2f64, V2);
9912 return DAG.getBitcast(MVT::v2i64,
9913 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9916 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
9918 /// This is used to disable more specialized lowerings when the shufps lowering
9919 /// will happen to be efficient.
9920 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9921 // This routine only handles 128-bit shufps.
9922 assert(Mask.size() == 4 && "Unsupported mask size!");
9923 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9924 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9925 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9926 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9928 // To lower with a single SHUFPS we need to have the low half and high half
9929 // each requiring a single input.
9930 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
9932 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9938 /// \brief Lower a vector shuffle using the SHUFPS instruction.
9940 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9941 /// It makes no assumptions about whether this is the *best* lowering; it simply uses it.
9943 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9944 ArrayRef<int> Mask, SDValue V1,
9945 SDValue V2, SelectionDAG &DAG) {
9946 SDValue LowV = V1, HighV = V2;
9947 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9949 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9951 if (NumV2Elements == 1) {
9952 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
9954 // Compute the index adjacent to V2Index and in the same half by toggling the low bit.
9956 int V2AdjIndex = V2Index ^ 1;
9958 if (Mask[V2AdjIndex] < 0) {
9959 // Handles all the cases where we have a single V2 element and an undef.
9960 // This will only ever happen in the high lanes because we commute the
9961 // vector otherwise.
9963 std::swap(LowV, HighV);
9964 NewMask[V2Index] -= 4;
9966 // Handle the case where the V2 element ends up adjacent to a V1 element.
9967 // To make this work, blend them together as the first step.
9968 int V1Index = V2AdjIndex;
9969 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9970 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9971 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9973 // Now proceed to reconstruct the final blend as we have the necessary
9974 // high or low half formed.
9981 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9982 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9984 } else if (NumV2Elements == 2) {
9985 if (Mask[0] < 4 && Mask[1] < 4) {
9986 // Handle the easy case where we have V1 in the low lanes and V2 in the high lanes.
9990 } else if (Mask[2] < 4 && Mask[3] < 4) {
9991 // We also handle the reversed case because this utility may get called
9992 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9993 // arrange things in the right direction.
9999 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10000 // trying to place elements directly, just blend them and set up the final
10001 // shuffle to place them.
10003 // The first two blend mask elements are for V1, the second two are for V2.
10005 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10006 Mask[2] < 4 ? Mask[2] : Mask[3],
10007 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10008 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10009 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10010 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10012 // Now we do a normal shuffle of V1 by giving V1 as both operands to the shuffle.
10015 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10016 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10017 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10018 NewMask[3] = Mask[2] < 4 ? 3 : 1;
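// For example, for Mask <0, 5, 2, 7> the blend produces
// <V1[0], V1[2], V2[1], V2[3]> and the final SHUFP with NewMask <0, 2, 1, 3>
// yields exactly <0, 5, 2, 7>.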
10021 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10022 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10025 /// \brief Lower 4-lane 32-bit floating point shuffles.
10027 /// Uses instructions exclusively from the floating point unit to minimize
10028 /// domain crossing penalties, as these are sufficient to implement all v4f32 shuffles.
10030 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10031 const SmallBitVector &Zeroable,
10032 SDValue V1, SDValue V2,
10033 const X86Subtarget &Subtarget,
10034 SelectionDAG &DAG) {
10035 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10036 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10037 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10039 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10041 if (NumV2Elements == 0) {
10042 // Check for being able to broadcast a single element.
10043 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10044 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10047 // Use even/odd duplicate instructions for masks that match their pattern.
10048 if (Subtarget.hasSSE3()) {
10049 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10050 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10051 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10052 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10055 if (Subtarget.hasAVX()) {
10056 // If we have AVX, we can use VPERMILPS which will allow folding a load
10057 // into the shuffle.
10058 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10059 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10062 // Otherwise, use a straight shuffle of a single input vector. We pass the
10063 // input vector to both operands to simulate this with a SHUFPS.
10064 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10065 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10068 // There are special ways we can lower some single-element blends. However, we
10069 // have custom ways we can lower more complex single-element blends below that
10070 // we defer to if both this and BLENDPS fail to match, so restrict this to
10071 // when the V2 input is targeting element 0 of the mask -- that is the fast case here.
10073 if (NumV2Elements == 1 && Mask[0] >= 4)
10074 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10075 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10078 if (Subtarget.hasSSE41()) {
10079 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10080 Zeroable, Subtarget, DAG))
10083 // Use INSERTPS if we can complete the shuffle efficiently.
10085 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10088 if (!isSingleSHUFPSMask(Mask))
10089 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10090 DL, MVT::v4f32, V1, V2, Mask, DAG))
10094 // Use low/high mov instructions.
10095 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10096 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10097 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10098 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10100 // Use dedicated unpack instructions for masks that match their pattern.
10102 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10105 // Otherwise fall back to a SHUFPS lowering strategy.
10106 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10109 /// \brief Lower 4-lane i32 vector shuffles.
10111 /// We try to handle these with integer-domain shuffles where we can, but for
10112 /// blends we use the floating point domain blend instructions.
10113 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10114 const SmallBitVector &Zeroable,
10115 SDValue V1, SDValue V2,
10116 const X86Subtarget &Subtarget,
10117 SelectionDAG &DAG) {
10118 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10119 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10120 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10122 // Whenever we can lower this as a zext, that instruction is strictly faster
10123 // than any alternative. It also allows us to fold memory operands into the
10124 // shuffle in many cases.
10125 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10126 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10129 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10131 if (NumV2Elements == 0) {
10132 // Check for being able to broadcast a single element.
10133 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10134 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10137 // Straight shuffle of a single input vector. For everything from SSE2
10138 // onward this has a single fast instruction with no scary immediates.
10139 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10140 // but we aren't actually going to use the UNPCK instruction because doing
10141 // so prevents folding a load into this instruction or making a copy.
10142 const int UnpackLoMask[] = {0, 0, 1, 1};
10143 const int UnpackHiMask[] = {2, 2, 3, 3};
10144 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10145 Mask = UnpackLoMask;
10146 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10147 Mask = UnpackHiMask;
10149 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10150 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10153 // Try to use shift instructions.
10154 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10155 Zeroable, Subtarget, DAG))
10158 // There are special ways we can lower some single-element blends.
10159 if (NumV2Elements == 1)
10160 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10161 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10164 // We have different paths for blend lowering, but they all must use the
10165 // *exact* same predicate.
10166 bool IsBlendSupported = Subtarget.hasSSE41();
10167 if (IsBlendSupported)
10168 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10169 Zeroable, Subtarget, DAG))
10172 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10176 // Use dedicated unpack instructions for masks that match their pattern.
10178 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10181 // Try to use byte rotation instructions.
10182 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10183 if (Subtarget.hasSSSE3())
10184 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10185 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10188 // Assume that a single SHUFPS is faster than an alternative sequence of
10189 // multiple instructions (even if the CPU has a domain penalty).
10190 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10191 if (!isSingleSHUFPSMask(Mask)) {
10192 // If we have direct support for blends, we should lower by decomposing into
10193 // a permute. That will be faster than the domain cross.
10194 if (IsBlendSupported)
10195 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10198 // Try to lower by permuting the inputs into an unpack instruction.
10199 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10200 DL, MVT::v4i32, V1, V2, Mask, DAG))
10204 // We implement this with SHUFPS because it can blend from two vectors.
10205 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10206 // up the inputs, bypassing domain shift penalties that we would incur if we
10207 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't needed.
10209 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10210 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10211 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10212 return DAG.getBitcast(MVT::v4i32, ShufPS);
10215 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10216 /// shuffle lowering, and the most complex part.
10218 /// The lowering strategy is to try to form pairs of input lanes which are
10219 /// targeted at the same half of the final vector, and then use a dword shuffle
10220 /// to place them onto the right half, and finally unpack the paired lanes into
10221 /// their final position.
10223 /// The exact breakdown of how to form these dword pairs and align them on the
10224 /// correct sides is really tricky. See the comments within the function for
10225 /// more of the details.
10227 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10228 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10229 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10230 /// vector, form the analogous 128-bit 8-element Mask.
10231 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10232 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10233 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10234 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10235 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10237 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10238 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10239 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10241 SmallVector<int, 4> LoInputs;
10242 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
10243 [](int M) { return M >= 0; });
10244 std::sort(LoInputs.begin(), LoInputs.end());
10245 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10246 SmallVector<int, 4> HiInputs;
10247 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
10248 [](int M) { return M >= 0; });
10249 std::sort(HiInputs.begin(), HiInputs.end());
10250 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10252 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10253 int NumHToL = LoInputs.size() - NumLToL;
10255 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10256 int NumHToH = HiInputs.size() - NumLToH;
10257 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10258 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10259 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10260 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10262 // If we are splatting two values from one half - one to each half, then
10263 // we can shuffle that half so each is splatted to a dword, then splat those
10264 // to their respective halves.
10265 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10267 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10268 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10269 V = DAG.getNode(ShufWOp, DL, VT, V,
10270 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10271 V = DAG.getBitcast(PSHUFDVT, V);
10272 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10273 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10274 return DAG.getBitcast(VT, V);
10277 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10278 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10279 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10280 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10282 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10283 // such inputs we can swap two of the dwords across the half mark and end up
10284 // with <=2 inputs to each half in each half. Once there, we can fall through
10285 // to the generic code below. For example:
10287 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10288 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10290 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10291 // and an existing 2-into-2 on the other half. In this case we may have to
10292 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10293 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10294 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10295 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10296 // half than the one we target for fixing) will be fixed when we re-enter this
10297 // path. We will also combine away any sequence of PSHUFD instructions that
10298 // result into a single instruction. Here is an example of the tricky case:
10300 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10301 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10303 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10305 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10306 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10308 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10309 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10311 // The result is fine to be handled by the generic logic.
10312 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10313 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10314 int AOffset, int BOffset) {
10315 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10316 "Must call this with A having 3 or 1 inputs from the A half.");
10317 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10318 "Must call this with B having 1 or 3 inputs from the B half.");
10319 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10320 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10322 bool ThreeAInputs = AToAInputs.size() == 3;
10324 // Compute the index of dword with only one word among the three inputs in
10325 // a half by taking the sum of the half with three inputs and subtracting
10326 // the sum of the actual three inputs. The difference is the remaining slot.
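// For example, if the three inputs from the low half are {0, 1, 3}, then
// 6 - (0 + 1 + 3) = 2, so TripleNonInputIdx is 2 and TripleDWord is 1.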
10328 int ADWord, BDWord;
10329 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10330 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10331 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10332 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10333 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10334 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10335 int TripleNonInputIdx =
10336 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10337 TripleDWord = TripleNonInputIdx / 2;
10339 // We use xor with one to compute the adjacent DWord to whichever one the OneInput word is in.
10341 OneInputDWord = (OneInput / 2) ^ 1;
10343 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10344 // and BToA inputs. If there is also such a problem with the BToB and AToB
10345 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10346 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10347 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10348 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10349 // Compute how many inputs will be flipped by swapping these DWords. We need
10351 // to balance this to ensure we don't form a 3-1 shuffle in the other half.
10353 int NumFlippedAToBInputs =
10354 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10355 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10356 int NumFlippedBToBInputs =
10357 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10358 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10359 if ((NumFlippedAToBInputs == 1 &&
10360 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10361 (NumFlippedBToBInputs == 1 &&
10362 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10363 // We choose whether to fix the A half or B half based on whether that
10364 // half has zero flipped inputs. At zero, we may not be able to fix it
10365 // with that half. We also bias towards fixing the B half because that
10366 // will more commonly be the high half, and we have to bias one way.
10367 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10368 ArrayRef<int> Inputs) {
10369 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10370 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10371 // Determine whether the free index is in the flipped dword or the
10372 // unflipped dword based on where the pinned index is. We use this bit
10373 // in an xor to conditionally select the adjacent dword.
10374 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10375 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10376 if (IsFixIdxInput == IsFixFreeIdxInput)
10378 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10379 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10380 "We need to be changing the number of flipped inputs!");
10381 int PSHUFHalfMask[] = {0, 1, 2, 3};
10382 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10383 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10385 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10387 for (int &M : Mask)
10388 if (M >= 0 && M == FixIdx)
10390 else if (M >= 0 && M == FixFreeIdx)
10393 if (NumFlippedBToBInputs != 0) {
10395 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10396 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10398 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10399 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10400 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10405 int PSHUFDMask[] = {0, 1, 2, 3};
10406 PSHUFDMask[ADWord] = BDWord;
10407 PSHUFDMask[BDWord] = ADWord;
10408 V = DAG.getBitcast(
10410 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10411 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10413 // Adjust the mask to match the new locations of A and B.
10414 for (int &M : Mask)
10415 if (M >= 0 && M/2 == ADWord)
10416 M = 2 * BDWord + M % 2;
10417 else if (M >= 0 && M/2 == BDWord)
10418 M = 2 * ADWord + M % 2;
10420 // Recurse back into this routine to re-compute state now that this isn't
10421 // a 3 and 1 problem.
10422 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10425 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10426 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10427 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10428 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10430 // At this point there are at most two inputs to the low and high halves from
10431 // each half. That means the inputs can always be grouped into dwords and
10432 // those dwords can then be moved to the correct half with a dword shuffle.
10433 // We use at most one low and one high word shuffle to collect these paired
10434 // inputs into dwords, and finally a dword shuffle to place them.
10435 int PSHUFLMask[4] = {-1, -1, -1, -1};
10436 int PSHUFHMask[4] = {-1, -1, -1, -1};
10437 int PSHUFDMask[4] = {-1, -1, -1, -1};
10439 // First fix the masks for all the inputs that are staying in their
10440 // original halves. This will then dictate the targets of the cross-half shuffles.
10442 auto fixInPlaceInputs =
10443 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10444 MutableArrayRef<int> SourceHalfMask,
10445 MutableArrayRef<int> HalfMask, int HalfOffset) {
10446 if (InPlaceInputs.empty())
10448 if (InPlaceInputs.size() == 1) {
10449 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10450 InPlaceInputs[0] - HalfOffset;
10451 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10454 if (IncomingInputs.empty()) {
10455 // Just fix all of the in place inputs.
10456 for (int Input : InPlaceInputs) {
10457 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10458 PSHUFDMask[Input / 2] = Input / 2;
10463 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10464 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10465 InPlaceInputs[0] - HalfOffset;
10466 // Put the second input next to the first so that they are packed into
10467 // a dword. We find the adjacent index by toggling the low bit.
10468 int AdjIndex = InPlaceInputs[0] ^ 1;
10469 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10470 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10471 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10473 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10474 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10476 // Now gather the cross-half inputs and place them into a free dword of
10477 // their target half.
10478 // FIXME: This operation could almost certainly be simplified dramatically to
10479 // look more like the 3-1 fixing operation.
10480 auto moveInputsToRightHalf = [&PSHUFDMask](
10481 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10482 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10483 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10485 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10486 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10488 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10490 int LowWord = Word & ~1;
10491 int HighWord = Word | 1;
10492 return isWordClobbered(SourceHalfMask, LowWord) ||
10493 isWordClobbered(SourceHalfMask, HighWord);
10496 if (IncomingInputs.empty())
10499 if (ExistingInputs.empty()) {
10500 // Map any dwords with inputs from them into the right half.
10501 for (int Input : IncomingInputs) {
10502 // If the source half mask maps over the inputs, turn those into
10503 // swaps and use the swapped lane.
10504 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10505 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10506 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10507 Input - SourceOffset;
10508 // We have to swap the uses in our half mask in one sweep.
10509 for (int &M : HalfMask)
10510 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10512 else if (M == Input)
10513 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10515 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10516 Input - SourceOffset &&
10517 "Previous placement doesn't match!");
10519 // Note that this correctly re-maps both when we do a swap and when
10520 // we observe the other side of the swap above. We rely on that to
10521 // avoid swapping the members of the input list directly.
10522 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10525 // Map the input's dword into the correct half.
10526 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10527 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10529 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10531 "Previous placement doesn't match!");
10534 // And just directly shift any other-half mask elements to be same-half
10535 // as we will have mirrored the dword containing the element into the
10536 // same position within that half.
10537 for (int &M : HalfMask)
10538 if (M >= SourceOffset && M < SourceOffset + 4) {
10539 M = M - SourceOffset + DestOffset;
10540 assert(M >= 0 && "This should never wrap below zero!");
10545 // Ensure we have the input in a viable dword of its current half. This
10546 // is particularly tricky because the original position may be clobbered
10547 // by inputs being moved and *staying* in that half.
10548 if (IncomingInputs.size() == 1) {
10549 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10550 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
10552 SourceHalfMask[InputFixed - SourceOffset] =
10553 IncomingInputs[0] - SourceOffset;
10554 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
10556 IncomingInputs[0] = InputFixed;
10558 } else if (IncomingInputs.size() == 2) {
10559 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
10560 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10561 // We have two non-adjacent or clobbered inputs we need to extract from
10562 // the source half. To do this, we need to map them into some adjacent
10563 // dword slot in the source mask.
10564 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
10565 IncomingInputs[1] - SourceOffset};
10567 // If there is a free slot in the source half mask adjacent to one of
10568 // the inputs, place the other input in it. We use (Index XOR 1) to
10569 // compute an adjacent index.
10570 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
10571 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
10572 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
10573 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10574 InputsFixed[1] = InputsFixed[0] ^ 1;
10575 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
10576 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
10577 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
10578 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
10579 InputsFixed[0] = InputsFixed[1] ^ 1;
10580 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
10581 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
10582 // The two inputs are in the same DWord but it is clobbered and the
10583 // adjacent DWord isn't used at all. Move both inputs to the free slot.
10585 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
10586 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
10587 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
10588 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
10590 // The only way we hit this point is if there is no clobbering
10591 // (because there are no off-half inputs to this half) and there is no
10592 // free slot adjacent to one of the inputs. In this case, we have to
10593 // swap an input with a non-input.
10594 for (int i = 0; i < 4; ++i)
10595 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
10596 "We can't handle any clobbers here!");
10597 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
10598 "Cannot have adjacent inputs here!");
10600 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10601 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
10603 // We also have to update the final source mask in this case because
10604 // it may need to undo the above swap.
10605 for (int &M : FinalSourceHalfMask)
10606 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
10607 M = InputsFixed[1] + SourceOffset;
10608 else if (M == InputsFixed[1] + SourceOffset)
10609 M = (InputsFixed[0] ^ 1) + SourceOffset;
10611 InputsFixed[1] = InputsFixed[0] ^ 1;
10614 // Point everything at the fixed inputs.
10615 for (int &M : HalfMask)
10616 if (M == IncomingInputs[0])
10617 M = InputsFixed[0] + SourceOffset;
10618 else if (M == IncomingInputs[1])
10619 M = InputsFixed[1] + SourceOffset;
10621 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
10622 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
10625 llvm_unreachable("Unhandled input size!");
10628 // Now hoist the DWord down to the right half.
10629 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
10630 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
10631 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
10632 for (int &M : HalfMask)
10633 for (int Input : IncomingInputs)
10635 M = FreeDWord * 2 + Input % 2;
10637 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
10638 /*SourceOffset*/ 4, /*DestOffset*/ 0);
10639 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
10640 /*SourceOffset*/ 0, /*DestOffset*/ 4);
10642 // Now enact all the shuffles we've computed to move the inputs into their target halves.
10644 if (!isNoopShuffleMask(PSHUFLMask))
10645 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10646 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
10647 if (!isNoopShuffleMask(PSHUFHMask))
10648 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10649 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
10650 if (!isNoopShuffleMask(PSHUFDMask))
10651 V = DAG.getBitcast(
10653 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10654 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10656 // At this point, each half should contain all its inputs, and we can then
10657 // just shuffle them into their final position.
10658 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
10659 "Failed to lift all the high half inputs to the low mask!");
10660 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
10661 "Failed to lift all the low half inputs to the high mask!");
10663 // Do a half shuffle for the low mask.
10664 if (!isNoopShuffleMask(LoMask))
10665 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10666 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
10668 // Do a half shuffle with the high mask after shifting its values down.
10669 for (int &M : HiMask)
10672 if (!isNoopShuffleMask(HiMask))
10673 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10674 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
10679 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
10680 /// blend if only one input is used.
10681 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
10682 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10683 const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
10685 SDValue V1Mask[16];
10686 SDValue V2Mask[16];
10690 int Size = Mask.size();
10691 int Scale = 16 / Size;
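// For example, a v8i16 shuffle has Size == 8, so each mask entry expands to
// Scale == 2 adjacent byte selectors in the PSHUFB control vectors.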
10692 for (int i = 0; i < 16; ++i) {
10693 if (Mask[i / Scale] < 0) {
10694 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
10696 const int ZeroMask = 0x80;
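// PSHUFB writes zero to a destination byte whenever bit 7 of its control byte
// is set, so 0x80 selects a zero byte regardless of the source.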
10697 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
10699 int V2Idx = Mask[i / Scale] < Size
10701 : (Mask[i / Scale] - Size) * Scale + i % Scale;
10702 if (Zeroable[i / Scale])
10703 V1Idx = V2Idx = ZeroMask;
10704 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
10705 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
10706 V1InUse |= (ZeroMask != V1Idx);
10707 V2InUse |= (ZeroMask != V2Idx);
10712 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10713 DAG.getBitcast(MVT::v16i8, V1),
10714 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
10716 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10717 DAG.getBitcast(MVT::v16i8, V2),
10718 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
10720 // If we need shuffled inputs from both, blend the two.
10722 if (V1InUse && V2InUse)
10723 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
10725 V = V1InUse ? V1 : V2;
10727 // Cast the result back to the correct type.
10728 return DAG.getBitcast(VT, V);
10731 /// \brief Generic lowering of 8-lane i16 shuffles.
10733 /// This handles both single-input shuffles and combined shuffle/blends with
10734 /// two inputs. The single input shuffles are immediately delegated to
10735 /// a dedicated lowering routine.
10737 /// The blends are lowered in one of three fundamental ways. If there are few
10738 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
10739 /// of the input is significantly cheaper when lowered as an interleaving of
10740 /// the two inputs, try to interleave them. Otherwise, blend the low and high
10741 /// halves of the inputs separately (making them have relatively few inputs)
10742 /// and then concatenate them.
10743 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10744 const SmallBitVector &Zeroable,
10745 SDValue V1, SDValue V2,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10749 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10750 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10752 // Whenever we can lower this as a zext, that instruction is strictly faster
10753 // than any alternative.
10754 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10755 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10758 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
10760 if (NumV2Inputs == 0) {
10761 // Check for being able to broadcast a single element.
10762 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10763 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10766 // Try to use shift instructions.
10767 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
10768 Zeroable, Subtarget, DAG))
10771 // Use dedicated unpack instructions for masks that match their pattern.
10773 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10776 // Try to use byte rotation instructions.
10777 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
10778 Mask, Subtarget, DAG))
10781 // Make a copy of the mask so it can be modified.
10782 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
10783 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
10784 MutableMask, Subtarget,
10788 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
10789 "All single-input shuffles should be canonicalized to be V1-input "
10792 // Try to use shift instructions.
10793 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
10794 Zeroable, Subtarget, DAG))
10797 // See if we can use SSE4A Extraction / Insertion.
10798 if (Subtarget.hasSSE4A())
10799 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
10803 // There are special ways we can lower some single-element blends.
10804 if (NumV2Inputs == 1)
10805 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10806 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10809 // We have different paths for blend lowering, but they all must use the
10810 // *exact* same predicate.
10811 bool IsBlendSupported = Subtarget.hasSSE41();
10812 if (IsBlendSupported)
10813 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
10814 Zeroable, Subtarget, DAG))
10817 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
10821 // Use dedicated unpack instructions for masks that match their pattern.
10823 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10826 // Try to use byte rotation instructions.
10827 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10828 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10831 if (SDValue BitBlend =
10832 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10835 // Try to lower by permuting the inputs into an unpack instruction.
10836 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10840 // If we can't directly blend but can use PSHUFB, that will be better as it
10841 // can both shuffle and set up the inefficient blend.
10842 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10843 bool V1InUse, V2InUse;
10844 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
10845 Zeroable, DAG, V1InUse, V2InUse);
10848 // We can always bit-blend if we have to so the fallback strategy is to
10849 // decompose into single-input permutes and blends.
10850 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10854 /// \brief Check whether a compaction lowering can be done by dropping even
10855 /// elements and compute how many times even elements must be dropped.
10857 /// This handles shuffles which take every Nth element where N is a power of
10858 /// two. Example shuffle masks:
10860 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10861 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10862 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10863 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10864 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10865 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10867 /// Any of these lanes can of course be undef.
10869 /// This routine only supports N <= 3.
10870 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here for larger N.
10873 /// \returns N above, or the number of times even elements must be dropped if
10874 /// there is such a number. Otherwise returns zero.
10875 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10876 bool IsSingleInput) {
10877 // The modulus for the shuffle vector entries is based on whether this is
10878 // a single input or not.
10879 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
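// For example, a 16-element two-input mask indexes elements 0..31, giving a
// ShuffleModulus of 32 and a ModMask of 31 below.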
10880 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10881 "We should only be called with masks with a power-of-2 size!");
10883 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10885 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10886 // and 2^3 simultaneously. This is because we may have ambiguity with
10887 // partially undef inputs.
10888 bool ViableForN[3] = {true, true, true};
10890 for (int i = 0, e = Mask.size(); i < e; ++i) {
10891 // Ignore undef lanes, we'll optimistically collapse them to the pattern we want anyway.
10896 bool IsAnyViable = false;
10897 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10898 if (ViableForN[j]) {
10899 uint64_t N = j + 1;
10901 // The shuffle mask must be equal to (i * 2^N) % M.
10902 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10903 IsAnyViable = true;
10905 ViableForN[j] = false;
10907 // Early exit if we exhaust the possible powers of two.
10912 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10916 // Return 0 as there is no viable power of two.
10920 /// \brief Generic lowering of v16i8 shuffles.
10922 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10923 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10924 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10925 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them back together.
10927 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10928 const SmallBitVector &Zeroable,
10929 SDValue V1, SDValue V2,
10930 const X86Subtarget &Subtarget,
10931 SelectionDAG &DAG) {
10932 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10933 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10934 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10936 // Try to use shift instructions.
10937 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10938 Zeroable, Subtarget, DAG))
10941 // Try to use byte rotation instructions.
10942 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10943 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10946 // Try to use a zext lowering.
10947 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10948 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
10951 // See if we can use SSE4A Extraction / Insertion.
10952 if (Subtarget.hasSSE4A())
10953 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
10957 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10959 // For single-input shuffles, there are some nicer lowering tricks we can use.
10960 if (NumV2Elements == 0) {
10961 // Check for being able to broadcast a single element.
10962 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10963 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10966 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10967 // Notably, this handles splat and partial-splat shuffles more efficiently.
10968 // However, it only makes sense if the pre-duplication shuffle simplifies
10969 // things significantly. Currently, this means we need to be able to
10970 // express the pre-duplication shuffle as an i16 shuffle.
10972 // FIXME: We should check for other patterns which can be widened into an
10973 // i16 shuffle as well.
10974 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10975 for (int i = 0; i < 16; i += 2)
10976 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10981 auto tryToWidenViaDuplication = [&]() -> SDValue {
10982 if (!canWidenViaDuplication(Mask))
10984 SmallVector<int, 4> LoInputs;
10985 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10986 [](int M) { return M >= 0 && M < 8; });
10987 std::sort(LoInputs.begin(), LoInputs.end());
10988 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10990 SmallVector<int, 4> HiInputs;
10991 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10992 [](int M) { return M >= 8; });
10993 std::sort(HiInputs.begin(), HiInputs.end());
10994 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
10997 bool TargetLo = LoInputs.size() >= HiInputs.size();
10998 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10999 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11001 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11002 SmallDenseMap<int, int, 8> LaneMap;
11003 for (int I : InPlaceInputs) {
11004 PreDupI16Shuffle[I/2] = I/2;
11007 int j = TargetLo ? 0 : 4, je = j + 4;
11008 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11009 // Check if j is already a shuffle of this input. This happens when
11010 // there are two adjacent bytes after we move the low one.
11011 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11012 // If we haven't yet mapped the input, search for a slot into which
11014 while (j < je && PreDupI16Shuffle[j] >= 0)
11018 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11021 // Map this input with the i16 shuffle.
11022 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11025 // Update the lane map based on the mapping we ended up with.
11026 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11028 V1 = DAG.getBitcast(
11030 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11031 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11033 // Unpack the bytes to form the i16s that will be shuffled into place.
11034 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11035 MVT::v16i8, V1, V1);
11037 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11038 for (int i = 0; i < 16; ++i)
11039 if (Mask[i] >= 0) {
11040 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11041 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11042 if (PostDupI16Shuffle[i / 2] < 0)
11043 PostDupI16Shuffle[i / 2] = MappedMask;
11045 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11046 "Conflicting entrties in the original shuffle!");
11048 return DAG.getBitcast(
11050 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11051 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11053 if (SDValue V = tryToWidenViaDuplication())
11057 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11061 // Use dedicated unpack instructions for masks that match their pattern.
11063 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11066 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11067 // with PSHUFB. It is important to do this before we attempt to generate any
11068 // blends but after all of the single-input lowerings. If the single input
11069 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11070 // want to preserve that and we can DAG combine any longer sequences into
11071 // a PSHUFB in the end. But once we start blending from multiple inputs,
11072 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11073 // and there are *very* few patterns that would actually be faster than the
11074 // PSHUFB approach because of its ability to zero lanes.
11076 // FIXME: The only exceptions to the above are blends which are exact
11077 // interleavings with direct instructions supporting them. We currently don't
11078 // handle those well here.
11079 if (Subtarget.hasSSSE3()) {
11080 bool V1InUse = false;
11081 bool V2InUse = false;
11083 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11084 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11086 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11087 // do so. This avoids using them to handle blends-with-zero which is
11088 // important as a single pshufb is significantly faster for that.
11089 if (V1InUse && V2InUse) {
11090 if (Subtarget.hasSSE41())
11091 if (SDValue Blend = lowerVectorShuffleAsBlend(
11092 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11095 // We can use an unpack to do the blending rather than an or in some
11096 // cases. Even though the or may be marginally more efficient, we
11097 // prefer this lowering because there are common cases where part of
11098 // the complexity of the shuffles goes away when we do the final blend as a PSHUFB.
11100 // FIXME: It might be worth trying to detect if the unpack-feeding
11101 // shuffles will both be pshufb, in which case we shouldn't bother with this.
11103 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11104 DL, MVT::v16i8, V1, V2, Mask, DAG))
11111 // There are special ways we can lower some single-element blends.
11112 if (NumV2Elements == 1)
11113 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11114 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11117 if (SDValue BitBlend =
11118 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11121 // Check whether a compaction lowering can be done. This handles shuffles
11122 // which take every Nth element for some even N. See the helper function for details.
11125 // We special case these as they can be particularly efficiently handled with
11126 // the PACKUSWB instruction on x86 and they show up in common patterns of
11127 // rearranging bytes to truncate wide elements.
11128 bool IsSingleInput = V2.isUndef();
11129 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11130 // NumEvenDrops is the power of two stride of the elements. Another way of
11131 // thinking about it is that we need to drop the even elements this many
11132 // times to get the original input.
11134 // First we need to zero all the dropped bytes.
11135 assert(NumEvenDrops <= 3 &&
11136 "No support for dropping even elements more than 3 times.");
11137 // We use the mask type to pick which bytes are preserved based on how many
11138 // elements are dropped.
11139 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11140 SDValue ByteClearMask = DAG.getBitcast(
11141 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
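// For NumEvenDrops == 1 this is a v8i16 splat of 0x00FF viewed as bytes, i.e.
// a mask that keeps the low byte of each i16 element and clears the high byte.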
11142 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11143 if (!IsSingleInput)
11144 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11146 // Now pack things back together.
11147 V1 = DAG.getBitcast(MVT::v8i16, V1);
11148 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11149 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11150 for (int i = 1; i < NumEvenDrops; ++i) {
11151 Result = DAG.getBitcast(MVT::v8i16, Result);
11152 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11158 // Handle multi-input cases by blending single-input shuffles.
11159 if (NumV2Elements > 0)
11160 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11163 // The fallback path for single-input shuffles widens this into two v8i16
11164 // vectors with unpacks, shuffles those, and then pulls them back together with a pack.
11168 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11169 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11170 for (int i = 0; i < 16; ++i)
11172 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11174 SDValue VLoHalf, VHiHalf;
11175 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11176 // them out and avoid using UNPCK{L,H} to extract the elements of V as i16s.
11178 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11179 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11180 // Use a mask to drop the high bytes.
11181 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11182 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11183 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11185 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11186 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11188 // Squash the masks to point directly into VLoHalf.
11189 for (int &M : LoBlendMask)
11192 for (int &M : HiBlendMask)
11196 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11197 // VHiHalf so that we can blend them as i16s.
11198 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11200 VLoHalf = DAG.getBitcast(
11201 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11202 VHiHalf = DAG.getBitcast(
11203 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11206 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11207 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11209 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11212 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11214 /// This routine breaks down the specific type of 128-bit shuffle and
11215 /// dispatches to the lowering routines accordingly.
11216 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11217 MVT VT, SDValue V1, SDValue V2,
11218 const SmallBitVector &Zeroable,
11219 const X86Subtarget &Subtarget,
11220 SelectionDAG &DAG) {
11221 switch (VT.SimpleTy) {
case MVT::v2i64:
11223 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v2f64:
11225 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4i32:
11227 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v4f32:
11229 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v8i16:
11231 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
case MVT::v16i8:
11233 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
default:
11236 llvm_unreachable("Unimplemented!");
11240 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11242 /// This routine just extracts two subvectors, shuffles them independently, and
11243 /// then concatenates them back together. This should work effectively with all
11244 /// AVX vector shuffle types.
11245 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11246 SDValue V2, ArrayRef<int> Mask,
11247 SelectionDAG &DAG) {
11248 assert(VT.getSizeInBits() >= 256 &&
11249 "Only for 256-bit or wider vector shuffles!");
11250 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11251 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11253 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11254 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11256 int NumElements = VT.getVectorNumElements();
11257 int SplitNumElements = NumElements / 2;
11258 MVT ScalarVT = VT.getVectorElementType();
11259 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
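// For example, a v8f32 shuffle is handled as two v4f32 shuffles whose results
// are concatenated back together below.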
11261 // Rather than splitting build-vectors, just build two narrower build
11262 // vectors. This helps shuffling with splats and zeros.
11263 auto SplitVector = [&](SDValue V) {
11264 V = peekThroughBitcasts(V);
11266 MVT OrigVT = V.getSimpleValueType();
11267 int OrigNumElements = OrigVT.getVectorNumElements();
11268 int OrigSplitNumElements = OrigNumElements / 2;
11269 MVT OrigScalarVT = OrigVT.getVectorElementType();
11270 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11274 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11276 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11277 DAG.getIntPtrConstant(0, DL));
11278 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11279 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11282 SmallVector<SDValue, 16> LoOps, HiOps;
11283 for (int i = 0; i < OrigSplitNumElements; ++i) {
11284 LoOps.push_back(BV->getOperand(i));
11285 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11287 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11288 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11290 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11291 DAG.getBitcast(SplitVT, HiV));
11294 SDValue LoV1, HiV1, LoV2, HiV2;
11295 std::tie(LoV1, HiV1) = SplitVector(V1);
11296 std::tie(LoV2, HiV2) = SplitVector(V2);
11298 // Now create two 4-way blends of these half-width vectors.
11299 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11300 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11301 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11302 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11303 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11304 for (int i = 0; i < SplitNumElements; ++i) {
11305 int M = HalfMask[i];
11306 if (M >= NumElements) {
11307 if (M >= NumElements + SplitNumElements)
11311 V2BlendMask[i] = M - NumElements;
11312 BlendMask[i] = SplitNumElements + i;
11313 } else if (M >= 0) {
11314 if (M >= SplitNumElements)
11318 V1BlendMask[i] = M;
11323 // Because the lowering happens after all combining takes place, we need to
11324 // manually combine these blend masks as much as possible so that we create
11325 // a minimal number of high-level vector shuffle nodes.
11327 // First try just blending the halves of V1 or V2.
11328 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11329 return DAG.getUNDEF(SplitVT);
11330 if (!UseLoV2 && !UseHiV2)
11331 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11332 if (!UseLoV1 && !UseHiV1)
11333 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11335 SDValue V1Blend, V2Blend;
11336 if (UseLoV1 && UseHiV1) {
11338 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11340 // We only use half of V1 so map the usage down into the final blend mask.
11341 V1Blend = UseLoV1 ? LoV1 : HiV1;
11342 for (int i = 0; i < SplitNumElements; ++i)
11343 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11344 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11346 if (UseLoV2 && UseHiV2) {
11348 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11350 // We only use half of V2 so map the usage down into the final blend mask.
11351 V2Blend = UseLoV2 ? LoV2 : HiV2;
11352 for (int i = 0; i < SplitNumElements; ++i)
11353 if (BlendMask[i] >= SplitNumElements)
11354 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11356 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11358 SDValue Lo = HalfBlend(LoMask);
11359 SDValue Hi = HalfBlend(HiMask);
11360 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11363 /// \brief Either split a vector in halves or decompose the shuffles and the blend.
11366 /// This is provided as a good fallback for many lowerings of non-single-input
11367 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11368 /// between splitting the shuffle into 128-bit components and stitching those
11369 /// back together vs. extracting the single-input shuffles and blending those results.
11371 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11372 SDValue V1, SDValue V2,
11373 ArrayRef<int> Mask,
11374 SelectionDAG &DAG) {
11375 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11376 "shuffles as it could then recurse on itself.");
11377 int Size = Mask.size();
11379 // If this can be modeled as a broadcast of two elements followed by a blend,
11380 // prefer that lowering. This is especially important because broadcasts can
11381 // often fold with memory operands.
11382 auto DoBothBroadcast = [&] {
11383 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11386 if (V2BroadcastIdx < 0)
11387 V2BroadcastIdx = M - Size;
11388 else if (M - Size != V2BroadcastIdx)
11390 } else if (M >= 0) {
11391 if (V1BroadcastIdx < 0)
11392 V1BroadcastIdx = M;
11393 else if (M != V1BroadcastIdx)
11398 if (DoBothBroadcast())
11399 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11402 // If the inputs all stem from a single 128-bit lane of each input, then we
11403 // split them rather than blending because the split will decompose to
11404 // unusually few instructions.
11405 int LaneCount = VT.getSizeInBits() / 128;
11406 int LaneSize = Size / LaneCount;
11407 SmallBitVector LaneInputs[2];
11408 LaneInputs[0].resize(LaneCount, false);
11409 LaneInputs[1].resize(LaneCount, false);
11410 for (int i = 0; i < Size; ++i)
11412 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11413 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11414 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11416 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11417 // that the decomposed single-input shuffles don't end up here.
11418 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11421 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11422 /// a permutation and blend of those lanes.
11424 /// This essentially blends the out-of-lane inputs to each lane into the lane
11425 /// from a permuted copy of the vector. This lowering strategy results in four
11426 /// instructions in the worst case for a single-input cross lane shuffle which
11427 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11428 /// of. Special cases for each particular shuffle pattern should be handled
11429 /// prior to trying this lowering.
11430 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11431 SDValue V1, SDValue V2,
11432 ArrayRef<int> Mask,
11433 SelectionDAG &DAG) {
11434 // FIXME: This should probably be generalized for 512-bit vectors as well.
11435 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11436 int Size = Mask.size();
11437 int LaneSize = Size / 2;
11439 // If there are only inputs from one 128-bit lane, splitting will in fact be
11440 // less expensive. The flags track whether the given lane contains an element
11441 // that crosses to another lane.
11442 bool LaneCrossing[2] = {false, false};
11443 for (int i = 0; i < Size; ++i)
11444 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11445 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11446 if (!LaneCrossing[0] || !LaneCrossing[1])
11447 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11449 assert(V2.isUndef() &&
11450 "This last part of this routine only works on single input shuffles");
11452 SmallVector<int, 32> FlippedBlendMask(Size);
11453 for (int i = 0; i < Size; ++i)
11454 FlippedBlendMask[i] =
11455 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11457 : Mask[i] % LaneSize +
11458 (i / LaneSize) * LaneSize + Size);
11460 // Flip the vector, and blend the results which should now be in-lane. The
11461 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11462 // 5 for the high source. The value 3 selects the high half of source 2 and
11463 // the value 2 selects the low half of source 2. We only use source 2 to
11464 // allow folding it into a memory operand.
11465 unsigned PERMMask = 3 | 2 << 4;
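// PERMMask is 0x23: the low 128 bits of the result take V1's high half and
// the high 128 bits take V1's low half, i.e. V1 with its halves swapped.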
11466 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
11467 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
11468 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
11471 /// \brief Handle lowering 2-lane 128-bit shuffles.
11472 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11473 SDValue V2, ArrayRef<int> Mask,
11474 const SmallBitVector &Zeroable,
11475 const X86Subtarget &Subtarget,
11476 SelectionDAG &DAG) {
11477 SmallVector<int, 4> WidenedMask;
11478 if (!canWidenShuffleElements(Mask, WidenedMask))
11481 // TODO: If minimizing size and one of the inputs is a zero vector and the
11482 // zero vector has only one use, we could use a VPERM2X128 to save the
11483 // instruction bytes needed to explicitly generate the zero vector.
11485 // Blends are faster and handle all the non-lane-crossing cases.
11486 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
11487 Zeroable, Subtarget, DAG))
11490 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
11491 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
11493 // If either input operand is a zero vector, use VPERM2X128 because its mask
11494 // allows us to replace the zero input with an implicit zero.
11495 if (!IsV1Zero && !IsV2Zero) {
11496 // Check for patterns which can be matched with a single insert of a 128-bit subvector.
11498 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
11499 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
11500 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
11501 if (Subtarget.hasAVX2() && V2.isUndef())
11504 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
11505 VT.getVectorNumElements() / 2);
11506 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
11507 DAG.getIntPtrConstant(0, DL));
11508 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
11509 OnlyUsesV1 ? V1 : V2,
11510 DAG.getIntPtrConstant(0, DL));
11511 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
11515 // Otherwise form a 128-bit permutation. After accounting for undefs,
11516 // convert the 64-bit shuffle mask selection values into 128-bit
11517 // selection bits by dividing the indexes by 2 and shifting into positions
11518 // defined by a vperm2*128 instruction's immediate control byte.
11520 // The immediate permute control byte looks like this:
11521 // [1:0] - select 128 bits from sources for low half of destination
// [2] - ignore
11523 // [3] - zero low half of destination
11524 // [5:4] - select 128 bits from sources for high half of destination
// [6] - ignore
11526 // [7] - zero high half of destination
11528 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
11529 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
11531 unsigned PermMask = MaskLO | (MaskHI << 4);
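// For example, a widened mask of <1, 2> gives PermMask 0x21: the low half of
// the result is V1's high 128 bits and the high half is V2's low 128 bits.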
11533 // If either input is a zero vector, replace it with an undef input.
11534 // Shuffle mask values < 4 are selecting elements of V1.
11535 // Shuffle mask values >= 4 are selecting elements of V2.
11536 // Adjust each half of the permute mask by clearing the half that was
11537 // selecting the zero vector and setting the zero mask bit.
11539 V1 = DAG.getUNDEF(VT);
11541 PermMask = (PermMask & 0xf0) | 0x08;
11543 PermMask = (PermMask & 0x0f) | 0x80;
11546 V2 = DAG.getUNDEF(VT);
11548 PermMask = (PermMask & 0xf0) | 0x08;
11550 PermMask = (PermMask & 0x0f) | 0x80;
11553 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
11554 DAG.getConstant(PermMask, DL, MVT::i8));
11557 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
11558 /// shuffling each lane.
11560 /// This will only succeed when the result of fixing the 128-bit lanes results
11561 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
11562 /// each 128-bit lane. This handles many cases where we can quickly blend away
11563 /// the lane crosses early and then use simpler shuffles within each lane.
11565 /// FIXME: It might be worthwhile at some point to support this without
11566 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
11567 /// in x86 only floating point has interesting non-repeating shuffles, and even
11568 /// those are still *marginally* more expensive.
11569 static SDValue lowerVectorShuffleByMerging128BitLanes(
11570 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11571 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11572 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
11574 int Size = Mask.size();
11575 int LaneSize = 128 / VT.getScalarSizeInBits();
11576 int NumLanes = Size / LaneSize;
11577 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
11579 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
11580 // check whether the in-128-bit lane shuffles share a repeating pattern.
11581 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
11582 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
11583 for (int i = 0; i < Size; ++i) {
11587 int j = i / LaneSize;
11589 if (Lanes[j] < 0) {
11590 // First entry we've seen for this lane.
11591 Lanes[j] = Mask[i] / LaneSize;
11592 } else if (Lanes[j] != Mask[i] / LaneSize) {
11593 // This doesn't match the lane selected previously!
11597 // Check that within each lane we have a consistent shuffle mask.
11598 int k = i % LaneSize;
11599 if (InLaneMask[k] < 0) {
11600 InLaneMask[k] = Mask[i] % LaneSize;
11601 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
11602 // This doesn't fit a repeating in-lane mask.
11607 // First shuffle the lanes into place.
11608 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
11609 VT.getSizeInBits() / 64);
11610 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
11611 for (int i = 0; i < NumLanes; ++i)
11612 if (Lanes[i] >= 0) {
11613 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
11614 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
11617 V1 = DAG.getBitcast(LaneVT, V1);
11618 V2 = DAG.getBitcast(LaneVT, V2);
11619 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
11621 // Cast it back to the type we actually want.
11622 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
11624 // Now do a simple shuffle that isn't lane crossing.
11625 SmallVector<int, 8> NewMask((unsigned)Size, -1);
11626 for (int i = 0; i < Size; ++i)
11628 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
11629 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
11630 "Must not introduce lane crosses at this point!");
11632 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
11635 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
11636 /// This allows for fast cases such as subvector extraction/insertion
11637 /// or shuffling smaller vector types which can lower more efficiently.
11638 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
11639 SDValue V1, SDValue V2,
11640 ArrayRef<int> Mask,
11641 const X86Subtarget &Subtarget,
11642 SelectionDAG &DAG) {
11643 assert(VT.is256BitVector() && "Expected 256-bit vector");
11645 unsigned NumElts = VT.getVectorNumElements();
11646 unsigned HalfNumElts = NumElts / 2;
11647 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
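// For a 256-bit shuffle this is the matching 128-bit half type, e.g. v8i32
// gives v4i32 halves.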
11649 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
11650 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
11651 if (!UndefLower && !UndefUpper)
11654 // Upper half is undef and lower half is whole upper subvector.
11655 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11657 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
11658 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11659 DAG.getIntPtrConstant(HalfNumElts, DL));
11660 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11661 DAG.getIntPtrConstant(0, DL));
11664 // Lower half is undef and upper half is whole lower subvector.
11665 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11667 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
11668 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11669 DAG.getIntPtrConstant(0, DL));
11670 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11671 DAG.getIntPtrConstant(HalfNumElts, DL));
11674 // If the shuffle only uses two of the four halves of the input operands,
11675 // then extract them and perform the 'half' shuffle at half width.
11676 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
11677 int HalfIdx1 = -1, HalfIdx2 = -1;
11678 SmallVector<int, 8> HalfMask(HalfNumElts);
11679 unsigned Offset = UndefLower ? HalfNumElts : 0;
11680 for (unsigned i = 0; i != HalfNumElts; ++i) {
11681 int M = Mask[i + Offset];
11687 // Determine which of the 4 half vectors this element is from.
11688 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
11689 int HalfIdx = M / HalfNumElts;
11691 // Determine the element index into its half vector source.
11692 int HalfElt = M % HalfNumElts;
11694 // We can shuffle with up to 2 half vectors, set the new 'half'
11695 // shuffle mask accordingly.
11696 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
11697 HalfMask[i] = HalfElt;
11698 HalfIdx1 = HalfIdx;
11701 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
11702 HalfMask[i] = HalfElt + HalfNumElts;
11703 HalfIdx2 = HalfIdx;
11707 // Too many half vectors referenced.
11710 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
11712 // Only shuffle the halves of the inputs when useful.
11713 int NumLowerHalves =
11714 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
11715 int NumUpperHalves =
11716 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
11718 // uuuuXXXX - don't extract uppers just to insert again.
11719 if (UndefLower && NumUpperHalves != 0)
11722 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
11723 if (UndefUpper && NumUpperHalves == 2)
11726 // AVX2 - XXXXuuuu - always extract lowers.
11727 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
11728 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
11729 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11731 // AVX2 supports variable 32-bit element cross-lane shuffles.
11732 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
11733 // XXXXuuuu - don't extract lowers and uppers.
11734 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
11739 auto GetHalfVector = [&](int HalfIdx) {
11741 return DAG.getUNDEF(HalfVT);
11742 SDValue V = (HalfIdx < 2 ? V1 : V2);
11743 HalfIdx = (HalfIdx % 2) * HalfNumElts;
11744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
11745 DAG.getIntPtrConstant(HalfIdx, DL));
11748 SDValue Half1 = GetHalfVector(HalfIdx1);
11749 SDValue Half2 = GetHalfVector(HalfIdx2);
11750 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
11751 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
11752 DAG.getIntPtrConstant(Offset, DL));
11755 /// \brief Test whether the specified input (0 or 1) is in-place blended by the given mask.
11758 /// This returns true if the elements from a particular input are already in the
11759 /// slot required by the given mask and require no permutation.
11760 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
11761 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
11762 int Size = Mask.size();
11763 for (int i = 0; i < Size; ++i)
11764 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
11770 /// Handle case where shuffle sources are coming from the same 128-bit lane and
11771 /// every lane can be represented as the same repeating mask - allowing us to
11772 /// shuffle the sources with the repeating shuffle and then permute the result
11773 /// to the destination lanes.
11774 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
11775 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11776 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11777 int NumElts = VT.getVectorNumElements();
11778 int NumLanes = VT.getSizeInBits() / 128;
11779 int NumLaneElts = NumElts / NumLanes;
11781 // On AVX2 we may be able to just shuffle the lowest elements and then
11782 // broadcast the result.
11783 if (Subtarget.hasAVX2()) {
11784 for (unsigned BroadcastSize : {16, 32, 64}) {
11785 if (BroadcastSize <= VT.getScalarSizeInBits())
11787 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11789 // Attempt to match a repeating pattern every NumBroadcastElts,
11790 // accounting for UNDEFs but only referencing the lowest 128-bit
11791 // lane of the inputs.
11792 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11793 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11794 for (int j = 0; j != NumBroadcastElts; ++j) {
11795 int M = Mask[i + j];
11798 int &R = RepeatMask[j];
11799 if (0 != ((M % NumElts) / NumLaneElts))
11801 if (0 <= R && R != M)
11808 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11809 if (!FindRepeatingBroadcastMask(RepeatMask))
11812 // Shuffle the (lowest) repeated elements in place for broadcast.
11813 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11815 // Shuffle the actual broadcast.
11816 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11817 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11818 for (int j = 0; j != NumBroadcastElts; ++j)
11819 BroadcastMask[i + j] = j;
11820 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11825 // Bail if the shuffle mask doesn't cross 128-bit lanes.
11826 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11829 // Bail if we already have a repeated lane shuffle mask.
11830 SmallVector<int, 8> RepeatedShuffleMask;
11831 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11834 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11835 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11836 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11837 int NumSubLanes = NumLanes * SubLaneScale;
11838 int NumSubLaneElts = NumLaneElts / SubLaneScale;
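// For example, for a v8f32 shuffle on AVX2 there are 2 lanes with a sub-lane
// scale of 2, i.e. 4 sub-lanes of 2 elements each.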
11840 // Check that all the sources are coming from the same lane and see if we can
11841 // form a repeating shuffle mask (local to each sub-lane). At the same time,
11842 // determine the source sub-lane for each destination sub-lane.
11843 int TopSrcSubLane = -1;
11844 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11845 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11846 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11847 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11849 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
11850 // Extract the sub-lane mask, check that it all comes from the same lane
11851 // and normalize the mask entries to come from the first lane.
11853 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
11854 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11855 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
11858 int Lane = (M % NumElts) / NumLaneElts;
11859 if ((0 <= SrcLane) && (SrcLane != Lane))
11862 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
11863 SubLaneMask[Elt] = LocalM;
11866 // Whole sub-lane is UNDEF.
11870 // Attempt to match against the candidate repeated sub-lane masks.
11871 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
11872 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
11873 for (int i = 0; i != NumSubLaneElts; ++i) {
11874 if (M1[i] < 0 || M2[i] < 0)
11876 if (M1[i] != M2[i])
11882 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
11883 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
11886 // Merge the sub-lane mask into the matching repeated sub-lane mask.
11887 for (int i = 0; i != NumSubLaneElts; ++i) {
11888 int M = SubLaneMask[i];
11891 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
11892 "Unexpected mask element");
11893 RepeatedSubLaneMask[i] = M;
11896 // Track the top most source sub-lane - by setting the remaining to UNDEF
11897 // we can greatly simplify shuffle matching.
11898 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
11899 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11900 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
11904 // Bail if we failed to find a matching repeated sub-lane mask.
11905 if (Dst2SrcSubLanes[DstSubLane] < 0)
11908 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11909 "Unexpected source lane");
11911 // Create a repeating shuffle mask for the entire vector.
11912 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11913 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
11914 int Lane = SubLane / SubLaneScale;
11915 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
11916 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11917 int M = RepeatedSubLaneMask[Elt];
11920 int Idx = (SubLane * NumSubLaneElts) + Elt;
11921 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
11924 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11926 // Shuffle each source sub-lane to its destination.
11927 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11928 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11929 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11930 if (SrcSubLane < 0)
11932 for (int j = 0; j != NumSubLaneElts; ++j)
11933 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11936 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11940 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
11941 unsigned &ShuffleImm,
11942 ArrayRef<int> Mask) {
11943 int NumElts = VT.getVectorNumElements();
11944 assert(VT.getScalarType() == MVT::f64 &&
11945 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
11946 "Unexpected data type for VSHUFPD");
11948 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
11949 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
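// For example, the v4f64 mask <1, 5, 2, 7> fits this pattern and produces a
// ShuffleImm of 0b1011 below.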
11951 bool ShufpdMask = true;
11952 bool CommutableMask = true;
11953 for (int i = 0; i < NumElts; ++i) {
11954 if (Mask[i] == SM_SentinelUndef)
11958 int Val = (i & 6) + NumElts * (i & 1);
11959 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
11960 if (Mask[i] < Val || Mask[i] > Val + 1)
11961 ShufpdMask = false;
11962 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
11963 CommutableMask = false;
11964 ShuffleImm |= (Mask[i] % 2) << i;
11969 if (CommutableMask) {
11977 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11978 ArrayRef<int> Mask, SDValue V1,
11979 SDValue V2, SelectionDAG &DAG) {
11980 unsigned Immediate = 0;
11981 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
11984 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11985 DAG.getConstant(Immediate, DL, MVT::i8));
11988 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11989 ArrayRef<int> Mask, SDValue V1,
11990 SDValue V2, SelectionDAG &DAG) {
11991 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11992 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11994 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11996 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11998 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12001 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12003 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12004 /// isn't available.
12005 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12006 const SmallBitVector &Zeroable,
12007 SDValue V1, SDValue V2,
12008 const X86Subtarget &Subtarget,
12009 SelectionDAG &DAG) {
12010 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12011 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12012 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12014 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12015 Zeroable, Subtarget, DAG))
12018 if (V2.isUndef()) {
12019 // Check for being able to broadcast a single element.
12020 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12021 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12024 // Use low duplicate instructions for masks that match their pattern.
12025 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12026 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12028 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12029 // Non-half-crossing single input shuffles can be lowered with an
12030 // interleaved permutation.
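// For example, the mask <1, 0, 3, 2> yields an immediate of 0b0101, swapping
// the two elements within each 128-bit lane.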
12031 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12032 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12033 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12034 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12037 // With AVX2 we have direct support for this permutation.
12038 if (Subtarget.hasAVX2())
12039 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12040 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12042 // Try to create an in-lane repeating shuffle mask and then shuffle the
12043 // results into the target lanes.
12044 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12045 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12048 // Otherwise, fall back.
12049 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12053 // Use dedicated unpack instructions for masks that match their pattern.
12055 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12058 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12059 Zeroable, Subtarget, DAG))
12062 // Check if the blend happens to exactly fit that of SHUFPD.
12064 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12067 // Try to create an in-lane repeating shuffle mask and then shuffle the
12068 // results into the target lanes.
12069 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12070 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12073 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12074 // shuffle. However, if we have AVX2 and either input is already in place,
12075 // we will be able to shuffle the other input across lanes in a single
12076 // instruction, so skip this pattern.
12077 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12078 isShuffleMaskInputInPlace(1, Mask))))
12079 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12080 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12083 // If we have AVX2 then we always want to lower with a blend because at v4 we
12084 // can fully permute the elements.
12085 if (Subtarget.hasAVX2())
12086 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12089 // Otherwise fall back on generic lowering.
12090 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12093 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12095 /// This routine is only called when we have AVX2 and thus a reasonable
12096 /// instruction set for v4i64 shuffling.
12097 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12098 const SmallBitVector &Zeroable,
12099 SDValue V1, SDValue V2,
12100 const X86Subtarget &Subtarget,
12101 SelectionDAG &DAG) {
12102 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12103 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12104 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12105 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12107 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12108 Zeroable, Subtarget, DAG))
12111 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12112 Zeroable, Subtarget, DAG))
12115 // Check for being able to broadcast a single element.
12116 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12117 Mask, Subtarget, DAG))
12120 if (V2.isUndef()) {
12121 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12122 // can use lower latency instructions that will operate on both lanes.
12123 SmallVector<int, 2> RepeatedMask;
12124 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12125 SmallVector<int, 4> PSHUFDMask;
12126 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
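// For example, the repeated v2i64 mask <1, 0> scales to the v4i32 PSHUFD
// mask <2, 3, 0, 1>.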
12127 return DAG.getBitcast(
12129 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12130 DAG.getBitcast(MVT::v8i32, V1),
12131 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12134 // AVX2 provides a direct instruction for permuting a single input across lanes.
12136 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12137 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12140 // Try to use shift instructions.
12141 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12142 Zeroable, Subtarget, DAG))
12145 // If we have VLX support, we can use VALIGN.
12146 if (Subtarget.hasVLX())
12147 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12148 Mask, Subtarget, DAG))
12151 // Try to use PALIGNR.
12152 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12153 Mask, Subtarget, DAG))
12156 // Use dedicated unpack instructions for masks that match their pattern.
12158 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12161 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12162 // shuffle. However, if we have AVX2 and either input is already in place,
12163 // we will be able to shuffle the other input across lanes in a single
12164 // instruction, so skip this pattern.
12165 if (!isShuffleMaskInputInPlace(0, Mask) &&
12166 !isShuffleMaskInputInPlace(1, Mask))
12167 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12168 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12171 // Otherwise fall back on generic blend lowering.
12172 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12176 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12178 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12179 /// isn't available.
12180 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12181 const SmallBitVector &Zeroable,
12182 SDValue V1, SDValue V2,
12183 const X86Subtarget &Subtarget,
12184 SelectionDAG &DAG) {
12185 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12186 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12187 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12189 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12190 Zeroable, Subtarget, DAG))
12193 // Check for being able to broadcast a single element.
12194 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12195 Mask, Subtarget, DAG))
12198 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12199 // options to efficiently lower the shuffle.
12200 SmallVector<int, 4> RepeatedMask;
12201 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12202 assert(RepeatedMask.size() == 4 &&
12203 "Repeated masks must be half the mask width!");
12205 // Use even/odd duplicate instructions for masks that match their pattern.
12206 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12207 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12208 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12209 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12212 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12213 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12215 // Use dedicated unpack instructions for masks that match their pattern.
12217 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12220 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12221 // have already handled any direct blends.
12222 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12225 // Try to create an in-lane repeating shuffle mask and then shuffle the
12226 // results into the target lanes.
12227 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12228 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12231 // If we have a single input shuffle with different shuffle patterns in the
12232 // two 128-bit lanes use the variable mask to VPERMILPS.
12233 if (V2.isUndef()) {
12234 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12235 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12236 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12238 if (Subtarget.hasAVX2())
12239 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12241 // Otherwise, fall back.
12242 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12246 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12248 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12249 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12252 // If we have AVX2 then we always want to lower with a blend because at v8 we
12253 // can fully permute the elements.
12254 if (Subtarget.hasAVX2())
12255 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12258 // Otherwise fall back on generic lowering.
12259 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12262 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12264 /// This routine is only called when we have AVX2 and thus a reasonable
12265 /// instruction set for v8i32 shuffling.
12266 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12267 const SmallBitVector &Zeroable,
12268 SDValue V1, SDValue V2,
12269 const X86Subtarget &Subtarget,
12270 SelectionDAG &DAG) {
12271 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12272 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12273 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12274 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12276 // Whenever we can lower this as a zext, that instruction is strictly faster
12277 // than any alternative. It also allows us to fold memory operands into the
12278 // shuffle in many cases.
12279 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12280 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12283 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12284 Zeroable, Subtarget, DAG))
12287 // Check for being able to broadcast a single element.
12288 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12289 Mask, Subtarget, DAG))
12292 // If the shuffle mask is repeated in each 128-bit lane we can use more
12293 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
12295 SmallVector<int, 4> RepeatedMask;
12296 bool Is128BitLaneRepeatedShuffle =
12297 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12298 if (Is128BitLaneRepeatedShuffle) {
12299 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12301 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12302 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12304 // Use dedicated unpack instructions for masks that match their pattern.
12306 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12310 // Try to use shift instructions.
12311 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12312 Zeroable, Subtarget, DAG))
12315 // If we have VLX support, we can use VALIGN.
12316 if (Subtarget.hasVLX())
12317 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12318 Mask, Subtarget, DAG))
12321 // Try to use byte rotation instructions.
12322 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12323 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12326 // Try to create an in-lane repeating shuffle mask and then shuffle the
12327 // results into the target lanes.
12328 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12329 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12332 // If the shuffle patterns aren't repeated but it is a single input, directly
12333 // generate a cross-lane VPERMD instruction.
12334 if (V2.isUndef()) {
12335 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12336 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12339 // Assume that a single SHUFPS is faster than an alternative sequence of
12340 // multiple instructions (even if the CPU has a domain penalty).
12341 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12342 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12343 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12344 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12345 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12346 CastV1, CastV2, DAG);
12347 return DAG.getBitcast(MVT::v8i32, ShufPS);
12350 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12352 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12353 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12356 // Otherwise fall back on generic blend lowering.
12357 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12361 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12363 /// This routine is only called when we have AVX2 and thus a reasonable
12364 /// instruction set for v16i16 shuffling.
12365 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12366 const SmallBitVector &Zeroable,
12367 SDValue V1, SDValue V2,
12368 const X86Subtarget &Subtarget,
12369 SelectionDAG &DAG) {
12370 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12371 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12372 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12373 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12375 // Whenever we can lower this as a zext, that instruction is strictly faster
12376 // than any alternative. It also allows us to fold memory operands into the
12377 // shuffle in many cases.
12378 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12379 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12382 // Check for being able to broadcast a single element.
12383 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12384 Mask, Subtarget, DAG))
12387 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12388 Zeroable, Subtarget, DAG))
12391 // Use dedicated unpack instructions for masks that match their pattern.
12393 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12396 // Try to use shift instructions.
12397 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12398 Zeroable, Subtarget, DAG))
12401 // Try to use byte rotation instructions.
12402 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12403 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12406 // Try to create an in-lane repeating shuffle mask and then shuffle the
12407 // results into the target lanes.
12408 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12409 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12412 if (V2.isUndef()) {
12413 // There are no generalized cross-lane shuffle operations available on i16 element types.
12415 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12416 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12419 SmallVector<int, 8> RepeatedMask;
12420 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12421 // As this is a single-input shuffle, the repeated mask should be
12422 // a strictly valid v8i16 mask that we can pass through to the v8i16
12423 // lowering to handle even the v16 case.
12424 return lowerV8I16GeneralSingleInputVectorShuffle(
12425 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12429 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12430 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12433 // AVX512BWVL can lower to VPERMW.
12434 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12435 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12437 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12439 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12440 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12443 // Otherwise fall back on generic lowering.
12444 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12447 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12449 /// This routine is only called when we have AVX2 and thus a reasonable
12450 /// instruction set for v32i8 shuffling.
12451 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12452 const SmallBitVector &Zeroable,
12453 SDValue V1, SDValue V2,
12454 const X86Subtarget &Subtarget,
12455 SelectionDAG &DAG) {
12456 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12457 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12458 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12459 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12461 // Whenever we can lower this as a zext, that instruction is strictly faster
12462 // than any alternative. It also allows us to fold memory operands into the
12463 // shuffle in many cases.
12464 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12465 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12468 // Check for being able to broadcast a single element.
12469 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12470 Mask, Subtarget, DAG))
12473 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12474 Zeroable, Subtarget, DAG))
12477 // Use dedicated unpack instructions for masks that match their pattern.
12479 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12482 // Try to use shift instructions.
12483 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12484 Zeroable, Subtarget, DAG))
12487 // Try to use byte rotation instructions.
12488 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12489 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12492 // Try to create an in-lane repeating shuffle mask and then shuffle the
12493 // results into the target lanes.
12494 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12495 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12498 // There are no generalized cross-lane shuffle operations available on i8 element types.
12500 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12501 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
12504 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12505 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12508 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12510 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12511 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12514 // Otherwise fall back on generic lowering.
12515 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
12518 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
12520 /// This routine either breaks down the specific type of a 256-bit x86 vector
12521 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
12522 /// together based on the available instructions.
12523 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12524 MVT VT, SDValue V1, SDValue V2,
12525 const SmallBitVector &Zeroable,
12526 const X86Subtarget &Subtarget,
12527 SelectionDAG &DAG) {
12528 // If we have a single input to the zero element, insert that into V1 if we
12529 // can do so cheaply.
12530 int NumElts = VT.getVectorNumElements();
12531 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12533 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12534 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12535 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12538 // Handle special cases where the lower or upper half is UNDEF.
12540 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
12543 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
12544 // can check for those subtargets here and avoid much of the subtarget
12545 // querying in the per-vector-type lowering routines. With AVX1 we have
12546 // essentially *zero* ability to manipulate a 256-bit vector with integer
12547 // types. Since we'll use floating point types there eventually, just
12548 // immediately cast everything to a float and operate entirely in that domain.
12549 if (VT.isInteger() && !Subtarget.hasAVX2()) {
12550 int ElementBits = VT.getScalarSizeInBits();
12551 if (ElementBits < 32) {
12552 // No floating-point type available. If we can't use the bit operations
12553 // for masking/blending, then decompose into 128-bit vectors.
12555 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
12557 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12559 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12562 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
12563 VT.getVectorNumElements());
12564 V1 = DAG.getBitcast(FpVT, V1);
12565 V2 = DAG.getBitcast(FpVT, V2);
12566 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
12569 switch (VT.SimpleTy) {
12571 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12573 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12575 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12577 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12579 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12581 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12584 llvm_unreachable("Not a valid 256-bit x86 vector type!");
12588 /// \brief Try to lower a vector shuffle as a shuffle of 128-bit subvectors.
12589 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
12590 ArrayRef<int> Mask, SDValue V1,
12591 SDValue V2, SelectionDAG &DAG) {
12592 assert(VT.getScalarSizeInBits() == 64 &&
12593 "Unexpected element type size for 128bit shuffle.");
12595 // Handling a 256-bit vector requires VLX, and most probably
12596 // lowerV2X128VectorShuffle() is a better solution for that case.
12597 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
12599 SmallVector<int, 4> WidenedMask;
12600 if (!canWidenShuffleElements(Mask, WidenedMask))
12603 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
12605 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
12606 {0, 1, 2, 3, 0, 1, 2, 3});
12607 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
12608 {0, 1, 2, 3, 8, 9, 10, 11})) {
12609 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
12610 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12611 DAG.getIntPtrConstant(0, DL));
12612 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12613 OnlyUsesV1 ? V1 : V2,
12614 DAG.getIntPtrConstant(0, DL));
12615 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12618 assert(WidenedMask.size() == 4);
12620 // See if this is an insertion of the lower 128-bits of V2 into V1.
12621 bool IsInsert = true;
12623 for (int i = 0; i < 4; ++i) {
12624 assert(WidenedMask[i] >= -1);
12625 if (WidenedMask[i] < 0)
12628 // Make sure all V1 subvectors are in place.
12629 if (WidenedMask[i] < 4) {
12630 if (WidenedMask[i] != i) {
12635 // Make sure we only have a single V2 index and it's the lowest 128 bits.
12636 if (V2Index >= 0 || WidenedMask[i] != 4) {
12643 if (IsInsert && V2Index >= 0) {
12644 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12645 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
12646 DAG.getIntPtrConstant(0, DL));
12647 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
12650 // Try to lower to vshuf64x2/vshuf32x4.
12651 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12652 unsigned PermMask = 0;
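// For example, a widened mask of <0, 1, 6, 7> keeps 128-bit lanes 0 and 1 of
// V1 and takes lanes 2 and 3 of V2, so the loop below selects Ops = {V1, V2}
// and computes PermMask = 0b11100100 (0xE4).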
12653 // Ensure all elements came from the same Op.
12654 for (int i = 0; i < 4; ++i) {
12655 assert(WidenedMask[i] >= -1);
12656 if (WidenedMask[i] < 0)
12659 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
12660 unsigned OpIndex = i / 2;
12661 if (Ops[OpIndex].isUndef())
12663 else if (Ops[OpIndex] != Op)
12666 // Convert the 128-bit shuffle mask selection values into 128-bit selection
12667 // bits defined by a vshuf64x2 instruction's immediate control byte.
12668 PermMask |= (WidenedMask[i] % 4) << (i * 2);
12671 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
12672 DAG.getConstant(PermMask, DL, MVT::i8));
12675 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
12676 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12677 SDValue V1, SDValue V2,
12678 const X86Subtarget &Subtarget,
12679 SelectionDAG &DAG) {
12680 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12681 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12682 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12684 if (V2.isUndef()) {
12685 // Use low duplicate instructions for masks that match their pattern.
12686 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
12687 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
12689 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
12690 // Non-half-crossing single input shuffles can be lowered with an
12691 // interleaved permutation.
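// Each bit of the VPERMILPD immediate picks, for the corresponding result
// element, either the even (0) or the odd (1) double within its 128-bit pair,
// which is why each mask element is compared against the odd index of its pair.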
12692 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12693 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
12694 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
12695 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
12696 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
12697 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12700 SmallVector<int, 4> RepeatedMask;
12701 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
12702 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
12703 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12706 if (SDValue Shuf128 =
12707 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
12710 if (SDValue Unpck =
12711 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
12714 // Check if the blend happens to exactly fit that of SHUFPD.
12716 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
12719 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
12722 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
12723 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12724 SDValue V1, SDValue V2,
12725 const X86Subtarget &Subtarget,
12726 SelectionDAG &DAG) {
12727 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12728 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12729 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12731 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12732 // options to efficiently lower the shuffle.
12733 SmallVector<int, 4> RepeatedMask;
12734 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
12735 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12737 // Use even/odd duplicate instructions for masks that match their pattern.
12738 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12739 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
12740 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12741 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
12744 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
12745 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12747 // Use dedicated unpack instructions for masks that match their pattern.
12748 if (SDValue Unpck =
12749 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
12752 // Otherwise, fall back to a SHUFPS sequence.
12753 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
12756 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
12759 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
12760 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12761 const SmallBitVector &Zeroable,
12762 SDValue V1, SDValue V2,
12763 const X86Subtarget &Subtarget,
12764 SelectionDAG &DAG) {
12765 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12766 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12767 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12769 if (SDValue Shuf128 =
12770 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
12773 if (V2.isUndef()) {
12774 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12775 // can use lower-latency instructions that will operate on all four 128-bit lanes.
12777 SmallVector<int, 2> Repeated128Mask;
12778 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
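// Scale the repeated 2 x i64 per-lane mask up to an equivalent 4 x i32 mask so
// the whole shuffle can be done as a single PSHUFD on a v16i32 bitcast.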
12779 SmallVector<int, 4> PSHUFDMask;
12780 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
12781 return DAG.getBitcast(
12783 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
12784 DAG.getBitcast(MVT::v16i32, V1),
12785 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12788 SmallVector<int, 4> Repeated256Mask;
12789 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
12790 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
12791 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
12794 // Try to use shift instructions.
12795 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
12796 Zeroable, Subtarget, DAG))
12799 // Try to use VALIGN.
12800 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
12801 Mask, Subtarget, DAG))
12804 // Try to use PALIGNR.
12805 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
12806 Mask, Subtarget, DAG))
12809 if (SDValue Unpck =
12810 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
12813 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
12816 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
12817 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12818 const SmallBitVector &Zeroable,
12819 SDValue V1, SDValue V2,
12820 const X86Subtarget &Subtarget,
12821 SelectionDAG &DAG) {
12822 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12823 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12824 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12826 // Whenever we can lower this as a zext, that instruction is strictly faster
12827 // than any alternative. It also allows us to fold memory operands into the
12828 // shuffle in many cases.
12829 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12830 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12833 // If the shuffle mask is repeated in each 128-bit lane we can use more
12834 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
12836 SmallVector<int, 4> RepeatedMask;
12837 bool Is128BitLaneRepeatedShuffle =
12838 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
12839 if (Is128BitLaneRepeatedShuffle) {
12840 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12842 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
12843 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12845 // Use dedicated unpack instructions for masks that match their pattern.
12847 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
12851 // Try to use shift instructions.
12852 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
12853 Zeroable, Subtarget, DAG))
12856 // Try to use VALIGN.
12857 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
12858 Mask, Subtarget, DAG))
12861 // Try to use byte rotation instructions.
12862 if (Subtarget.hasBWI())
12863 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12864 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
12867 // Assume that a single SHUFPS is faster than using a permv shuffle.
12868 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12869 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12870 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
12871 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
12872 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
12873 CastV1, CastV2, DAG);
12874 return DAG.getBitcast(MVT::v16i32, ShufPS);
12877 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
12880 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
12881 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12882 const SmallBitVector &Zeroable,
12883 SDValue V1, SDValue V2,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12887 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12888 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12889 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
12891 // Whenever we can lower this as a zext, that instruction is strictly faster
12892 // than any alternative. It also allows us to fold memory operands into the
12893 // shuffle in many cases.
12894 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12895 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12898 // Use dedicated unpack instructions for masks that match their pattern.
12900 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
12903 // Try to use shift instructions.
12904 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
12905 Zeroable, Subtarget, DAG))
12908 // Try to use byte rotation instructions.
12909 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12910 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
12913 if (V2.isUndef()) {
12914 SmallVector<int, 8> RepeatedMask;
12915 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
12916 // As this is a single-input shuffle, the repeated mask should be
12917 // a strictly valid v8i16 mask that we can pass through to the v8i16
12918 // lowering to handle even the v32 case.
12919 return lowerV8I16GeneralSingleInputVectorShuffle(
12920 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
12924 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
12927 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
12928 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12929 const SmallBitVector &Zeroable,
12930 SDValue V1, SDValue V2,
12931 const X86Subtarget &Subtarget,
12932 SelectionDAG &DAG) {
12933 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12934 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12935 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
12936 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
12938 // Whenever we can lower this as a zext, that instruction is strictly faster
12939 // than any alternative. It also allows us to fold memory operands into the
12940 // shuffle in many cases.
12941 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12942 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12945 // Use dedicated unpack instructions for masks that match their pattern.
12947 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
12950 // Try to use shift instructions.
12951 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
12952 Zeroable, Subtarget, DAG))
12955 // Try to use byte rotation instructions.
12956 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12957 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
12960 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12961 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12964 // VBMI can use VPERMV/VPERMV3 byte shuffles.
12965 if (Subtarget.hasVBMI())
12966 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
12968 // FIXME: Implement direct support for this type!
12969 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12972 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12974 /// This routine either breaks down the specific type of a 512-bit x86 vector
12975 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12976 /// together based on the available instructions.
12977 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12978 MVT VT, SDValue V1, SDValue V2,
12979 const SmallBitVector &Zeroable,
12980 const X86Subtarget &Subtarget,
12981 SelectionDAG &DAG) {
12982 assert(Subtarget.hasAVX512() &&
12983 "Cannot lower 512-bit vectors w/ basic ISA!");
12985 // If we have a single input to the zero element, insert that into V1 if we
12986 // can do so cheaply.
12987 int NumElts = Mask.size();
12988 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12990 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12991 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12992 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12995 // Check for being able to broadcast a single element.
12996 if (SDValue Broadcast =
12997 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13000 // Dispatch to each element type for lowering. If we don't have support for
13001 // specific element type shuffles at 512 bits, immediately split them and
13002 // lower them. Each lowering routine of a given type is allowed to assume that
13003 // the requisite ISA extensions for that element type are available.
13004 switch (VT.SimpleTy) {
13006 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13008 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13010 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13012 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13014 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13016 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13019 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13023 // Lower vXi1 vector shuffles.
13024 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13025 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13026 // vector, shuffle, and then truncate it back.
13027 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13028 MVT VT, SDValue V1, SDValue V2,
13029 const X86Subtarget &Subtarget,
13030 SelectionDAG &DAG) {
13031 assert(Subtarget.hasAVX512() &&
13032 "Cannot lower 512-bit vectors w/o basic ISA!");
13034 switch (VT.SimpleTy) {
13036 llvm_unreachable("Expected a vector of i1 elements");
13038 ExtVT = MVT::v2i64;
13041 ExtVT = MVT::v4i32;
13044 ExtVT = MVT::v8i64; // Take a 512-bit type; more shuffles are available on KNL.
13047 ExtVT = MVT::v16i32;
13050 ExtVT = MVT::v32i16;
13053 ExtVT = MVT::v64i8;
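// Materialize all-zeros and all-ones inputs directly in the extended type;
// any other input is sign-extended so each i1 becomes a full 0/-1 lane.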
13057 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13058 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13059 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13060 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13062 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13065 V2 = DAG.getUNDEF(ExtVT);
13066 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13067 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13068 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13069 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13071 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13073 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13074 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13075 int NumElems = VT.getVectorNumElements();
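// With BWI (for 32 or more elements) or DQI (for narrower masks) the shuffled,
// sign-extended vector can be converted straight back into a mask register;
// otherwise fall back to a plain truncate.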
13076 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13077 (Subtarget.hasDQI() && (NumElems < 32)))
13078 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13080 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13083 /// Helper function that returns true if the shuffle mask should be
13084 /// commuted to improve canonicalization.
13085 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13086 int NumElements = Mask.size();
13088 int NumV1Elements = 0, NumV2Elements = 0;
13092 else if (M < NumElements)
13097 // Commute the shuffle as needed such that more elements come from V1 than
13098 // V2. This allows us to match the shuffle pattern strictly on how many
13099 // elements come from V1 without handling the symmetric cases.
13100 if (NumV2Elements > NumV1Elements)
13103 assert(NumV1Elements > 0 && "No V1 indices");
13105 if (NumV2Elements == 0)
13108 // When the number of V1 and V2 elements is the same, try to minimize the
13109 // number of uses of V2 in the low half of the vector. When that is tied,
13110 // ensure that the sum of indices for V1 is equal to or lower than the sum of
13111 // indices for V2. When those are equal, try to ensure that the number of odd
13112 // indices for V1 is lower than the number of odd indices for V2.
13113 if (NumV1Elements == NumV2Elements) {
13114 int LowV1Elements = 0, LowV2Elements = 0;
13115 for (int M : Mask.slice(0, NumElements / 2))
13116 if (M >= NumElements)
13120 if (LowV2Elements > LowV1Elements)
13122 if (LowV2Elements == LowV1Elements) {
13123 int SumV1Indices = 0, SumV2Indices = 0;
13124 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13125 if (Mask[i] >= NumElements)
13127 else if (Mask[i] >= 0)
13129 if (SumV2Indices < SumV1Indices)
13131 if (SumV2Indices == SumV1Indices) {
13132 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13133 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13134 if (Mask[i] >= NumElements)
13135 NumV2OddIndices += i % 2;
13136 else if (Mask[i] >= 0)
13137 NumV1OddIndices += i % 2;
13138 if (NumV2OddIndices < NumV1OddIndices)
13147 /// \brief Top-level lowering for x86 vector shuffles.
13149 /// This handles decomposition, canonicalization, and lowering of all x86
13150 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13151 /// above in helper routines. The canonicalization attempts to widen shuffles
13152 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13153 /// s.t. only one of the two inputs needs to be tested, etc.
13154 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13155 SelectionDAG &DAG) {
13156 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13157 ArrayRef<int> Mask = SVOp->getMask();
13158 SDValue V1 = Op.getOperand(0);
13159 SDValue V2 = Op.getOperand(1);
13160 MVT VT = Op.getSimpleValueType();
13161 int NumElements = VT.getVectorNumElements();
13163 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13165 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13166 "Can't lower MMX shuffles");
13168 bool V1IsUndef = V1.isUndef();
13169 bool V2IsUndef = V2.isUndef();
13170 if (V1IsUndef && V2IsUndef)
13171 return DAG.getUNDEF(VT);
13173 // When we create a shuffle node we put the UNDEF node as the second operand,
13174 // but in some cases the first operand may be transformed to UNDEF.
13175 // In this case we should just commute the node.
13177 return DAG.getCommutedVectorShuffle(*SVOp);
13179 // Check for non-undef masks pointing at an undef vector and make the masks
13180 // undef as well. This makes it easier to match the shuffle based solely on the mask.
13184 if (M >= NumElements) {
13185 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13186 for (int &M : NewMask)
13187 if (M >= NumElements)
13189 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13192 // Check for illegal shuffle mask element index values.
13193 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13194 assert(llvm::all_of(Mask,
13195 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13196 "Out of bounds shuffle index");
13198 // We actually see shuffles that are entirely re-arrangements of a set of
13199 // zero inputs. This mostly happens while decomposing complex shuffles into
13200 // simple ones. Directly lower these as a buildvector of zeros.
13201 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13202 if (Zeroable.all())
13203 return getZeroVector(VT, Subtarget, DAG, DL);
13205 // Try to collapse shuffles into using a vector type with fewer elements but
13206 // wider element types. We cap this to not form integers or floating point
13207 // elements wider than 64 bits, but it might be interesting to form i128
13208 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13209 SmallVector<int, 16> WidenedMask;
13210 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13211 canWidenShuffleElements(Mask, WidenedMask)) {
13212 MVT NewEltVT = VT.isFloatingPoint()
13213 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13214 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13215 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13216 // Make sure that the new vector type is legal. For example, v2f64 isn't
13218 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13219 V1 = DAG.getBitcast(NewVT, V1);
13220 V2 = DAG.getBitcast(NewVT, V2);
13221 return DAG.getBitcast(
13222 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13226 // Commute the shuffle if it will improve canonicalization.
13227 if (canonicalizeShuffleMaskWithCommute(Mask))
13228 return DAG.getCommutedVectorShuffle(*SVOp);
13230 // For each vector width, delegate to a specialized lowering routine.
13231 if (VT.is128BitVector())
13232 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13235 if (VT.is256BitVector())
13236 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13239 if (VT.is512BitVector())
13240 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13244 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13246 llvm_unreachable("Unimplemented!");
13249 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13250 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13251 const X86Subtarget &Subtarget,
13252 SelectionDAG &DAG) {
13253 SDValue Cond = Op.getOperand(0);
13254 SDValue LHS = Op.getOperand(1);
13255 SDValue RHS = Op.getOperand(2);
13257 MVT VT = Op.getSimpleValueType();
13259 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13261 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13263 // Only non-legal VSELECTs reach this lowering; convert those into generic
13264 // shuffles and re-use the shuffle lowering path for blends.
13265 SmallVector<int, 32> Mask;
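// A constant-zero condition element selects the corresponding RHS lane
// (shuffle index i + Size); any other constant selects the LHS lane (index i).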
13266 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13267 SDValue CondElt = CondBV->getOperand(i);
13269 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13272 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13275 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13276 // A vselect where all conditions and data are constants can be optimized into
13277 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13278 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13279 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13280 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13283 // Try to lower this to a blend-style vector shuffle. This can handle all
13284 // constant condition cases.
13285 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13288 // Variable blends are only legal from SSE4.1 onward.
13289 if (!Subtarget.hasSSE41())
13292 // Only some types will be legal on some subtargets. If we can emit a legal
13293 // VSELECT-matching blend, return Op, but if we need to expand, return a traditional blend.
13295 switch (Op.getSimpleValueType().SimpleTy) {
13297 // Most of the vector types have blends past SSE4.1.
13301 // The byte blends for AVX vectors were introduced only in AVX2.
13302 if (Subtarget.hasAVX2())
13309 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13310 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13313 // FIXME: We should custom lower this by fixing the condition and using i8 blends.
13319 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13320 MVT VT = Op.getSimpleValueType();
13323 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13326 if (VT.getSizeInBits() == 8) {
13327 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13328 Op.getOperand(0), Op.getOperand(1));
13329 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13330 DAG.getValueType(VT));
13331 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13334 if (VT == MVT::f32) {
13335 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13336 // the result back to FR32 register. It's only worth matching if the
13337 // result has a single use which is a store or a bitcast to i32. And in
13338 // the case of a store, it's not worth it if the index is a constant 0,
13339 // because a MOVSSmr can be used instead, which is smaller and faster.
13340 if (!Op.hasOneUse())
13342 SDNode *User = *Op.getNode()->use_begin();
13343 if ((User->getOpcode() != ISD::STORE ||
13344 isNullConstant(Op.getOperand(1))) &&
13345 (User->getOpcode() != ISD::BITCAST ||
13346 User->getValueType(0) != MVT::i32))
13348 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13349 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13351 return DAG.getBitcast(MVT::f32, Extract);
13354 if (VT == MVT::i32 || VT == MVT::i64) {
13355 // ExtractPS/pextrq works with constant index.
13356 if (isa<ConstantSDNode>(Op.getOperand(1)))
13363 /// Extract one bit from mask vector, like v16i1 or v8i1.
13364 /// AVX-512 feature.
13366 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13367 SDValue Vec = Op.getOperand(0);
13369 MVT VecVT = Vec.getSimpleValueType();
13370 SDValue Idx = Op.getOperand(1);
13371 MVT EltVT = Op.getSimpleValueType();
13373 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13374 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13375 "Unexpected vector type in ExtractBitFromMaskVector");
13377 // A variable index can't be handled in mask registers;
13378 // extend the vector to VR512.
13379 if (!isa<ConstantSDNode>(Idx)) {
13380 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13381 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13382 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13383 ExtVT.getVectorElementType(), Ext, Idx);
13384 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13387 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13388 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13389 (VecVT.getVectorNumElements() < 8)) {
13390 // Use kshiftlw/rw instruction.
13391 VecVT = MVT::v16i1;
13392 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
13393 DAG.getUNDEF(VecVT),
13395 DAG.getIntPtrConstant(0, dl));
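// Shift the requested bit up to the most significant position and back down to
// bit 0; this isolates it and leaves it in element 0, ready for extraction.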
13397 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
13398 if (MaxShift - IdxVal)
13399 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13400 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
13401 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13402 DAG.getConstant(MaxShift, dl, MVT::i8));
13403 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13404 DAG.getIntPtrConstant(0, dl));
13408 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13409 SelectionDAG &DAG) const {
13411 SDValue Vec = Op.getOperand(0);
13412 MVT VecVT = Vec.getSimpleValueType();
13413 SDValue Idx = Op.getOperand(1);
13415 if (Op.getSimpleValueType() == MVT::i1)
13416 return ExtractBitFromMaskVector(Op, DAG);
13418 if (!isa<ConstantSDNode>(Idx)) {
13419 if (VecVT.is512BitVector() ||
13420 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
13421 VecVT.getScalarSizeInBits() == 32)) {
13424 MVT::getIntegerVT(VecVT.getScalarSizeInBits());
13425 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13426 MaskEltVT.getSizeInBits());
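// Build a permutation mask whose lane 0 holds the variable index, use VPERMV
// to move the selected element into lane 0 of the vector, and then extract
// lane 0 with a constant index.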
13428 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13429 auto PtrVT = getPointerTy(DAG.getDataLayout());
13430 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13431 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
13432 DAG.getConstant(0, dl, PtrVT));
13433 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13434 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
13435 DAG.getConstant(0, dl, PtrVT));
13440 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13442 // If this is a 256-bit or 512-bit vector result, first extract the 128-bit
13443 // subvector and then extract the element from that subvector.
13444 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13445 // Get the 128-bit vector.
13446 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
13447 MVT EltVT = VecVT.getVectorElementType();
13449 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13450 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
13452 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
13453 // this can be done with a mask.
13454 IdxVal &= ElemsPerChunk - 1;
13455 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13456 DAG.getConstant(IdxVal, dl, MVT::i32));
13459 assert(VecVT.is128BitVector() && "Unexpected vector length");
13461 MVT VT = Op.getSimpleValueType();
13463 if (VT.getSizeInBits() == 16) {
13464 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
13465 // we're going to zero extend the register or fold the store (SSE41 only).
13466 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
13467 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
13468 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13469 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13470 DAG.getBitcast(MVT::v4i32, Vec), Idx));
13472 // Transform it so it matches pextrw, which produces a 32-bit result.
13473 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13474 Op.getOperand(0), Op.getOperand(1));
13475 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13476 DAG.getValueType(VT));
13477 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13480 if (Subtarget.hasSSE41())
13481 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
13484 // TODO: handle v16i8.
13486 if (VT.getSizeInBits() == 32) {
13490 // SHUFPS the element to the lowest double word, then movss.
13491 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
13492 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13494 DAG.getIntPtrConstant(0, dl));
13497 if (VT.getSizeInBits() == 64) {
13498 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13499 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13500 // to match extract_elt for f64.
13504 // UNPCKHPD the element to the lowest double word, then movsd.
13505 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13506 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13507 int Mask[2] = { 1, -1 };
13508 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13509 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13510 DAG.getIntPtrConstant(0, dl));
13516 /// Insert one bit to mask vector, like v16i1 or v8i1.
13517 /// AVX-512 feature.
13519 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13521 SDValue Vec = Op.getOperand(0);
13522 SDValue Elt = Op.getOperand(1);
13523 SDValue Idx = Op.getOperand(2);
13524 MVT VecVT = Vec.getSimpleValueType();
13526 if (!isa<ConstantSDNode>(Idx)) {
13527 // Non-constant index. Extend the source and destination,
13528 // insert the element, and then truncate the result.
13529 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13530 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13531 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13532 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13533 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13534 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13537 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13538 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13539 unsigned NumElems = VecVT.getVectorNumElements();
13541 if (Vec.isUndef()) {
13543 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13544 DAG.getConstant(IdxVal, dl, MVT::i8));
13548 // Insertion of one bit into the first or last position
13549 // can be done with two SHIFTs + OR.
13550 if (IdxVal == 0) {
13551 // EltInVec is already at the correct index and the other bits are 0.
13552 // Clear the first bit in the source vector.
13553 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13554 DAG.getConstant(1, dl, MVT::i8));
13555 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13556 DAG.getConstant(1, dl, MVT::i8));
13558 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13560 if (IdxVal == NumElems - 1) {
13561 // Move the bit to the last position inside the vector.
13562 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13563 DAG.getConstant(IdxVal, dl, MVT::i8));
13564 // Clear the last bit in the source vector.
13565 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13566 DAG.getConstant(1, dl, MVT::i8));
13567 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13568 DAG.getConstant(1, dl, MVT::i8));
13570 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13573 // Use shuffle to insert element.
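// In the combined mask space, indices [0, NumElems) select lanes of Vec and
// index NumElems selects lane 0 of EltInVec, so only lane IdxVal is replaced.
// E.g. for NumElems == 8 and IdxVal == 2 the mask is <0, 1, 8, 3, 4, 5, 6, 7>.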
13574 SmallVector<int, 64> MaskVec(NumElems);
13575 for (unsigned i = 0; i != NumElems; ++i)
13576 MaskVec[i] = (i == IdxVal) ? NumElems : i;
13578 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
13581 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13582 SelectionDAG &DAG) const {
13583 MVT VT = Op.getSimpleValueType();
13584 MVT EltVT = VT.getVectorElementType();
13585 unsigned NumElts = VT.getVectorNumElements();
13587 if (EltVT == MVT::i1)
13588 return InsertBitToMaskVector(Op, DAG);
13591 SDValue N0 = Op.getOperand(0);
13592 SDValue N1 = Op.getOperand(1);
13593 SDValue N2 = Op.getOperand(2);
13594 if (!isa<ConstantSDNode>(N2))
13596 auto *N2C = cast<ConstantSDNode>(N2);
13597 unsigned IdxVal = N2C->getZExtValue();
13599 // If we are clearing out an element, we do this more efficiently with a
13600 // blend shuffle than a costly integer insertion.
13601 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
13602 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
13603 // be beneficial if we are inserting several zeros and can combine the masks.
13604 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
13605 SmallVector<int, 8> ClearMask;
13606 for (unsigned i = 0; i != NumElts; ++i)
13607 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
13608 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
13609 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
13612 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13613 // into that, and then insert the subvector back into the result.
13614 if (VT.is256BitVector() || VT.is512BitVector()) {
13615 // With a 256-bit vector, we can insert into the zero element efficiently
13616 // using a blend if we have AVX or AVX2 and the right data type.
13617 if (VT.is256BitVector() && IdxVal == 0) {
13618 // TODO: It is worthwhile to cast integer to floating point and back
13619 // and incur a domain crossing penalty if that's what we'll end up
13620 // doing anyway after extracting to a 128-bit vector.
13621 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13622 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
13623 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
13624 N2 = DAG.getIntPtrConstant(1, dl);
13625 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
13629 // Get the desired 128-bit vector chunk.
13630 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
13632 // Insert the element into the desired chunk.
13633 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13634 assert(isPowerOf2_32(NumEltsIn128));
13635 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
13636 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
13638 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13639 DAG.getConstant(IdxIn128, dl, MVT::i32));
13641 // Insert the changed part back into the bigger vector
13642 return insert128BitVector(N0, V, IdxVal, DAG, dl);
13644 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13646 if (Subtarget.hasSSE41()) {
13647 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13649 if (VT == MVT::v8i16) {
13650 Opc = X86ISD::PINSRW;
13652 assert(VT == MVT::v16i8);
13653 Opc = X86ISD::PINSRB;
13656 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
13658 if (N1.getValueType() != MVT::i32)
13659 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13660 if (N2.getValueType() != MVT::i32)
13661 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13662 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13665 if (EltVT == MVT::f32) {
13666 // Bits [7:6] of the constant are the source select. This will always be
13667 // zero here. The DAG Combiner may combine an extract_elt index into
13668 // these bits. For example (insert (extract, 3), 2) could be matched by
13669 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
13670 // Bits [5:4] of the constant are the destination select. This is the
13671 // value of the incoming immediate.
13672 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13673 // combine either bitwise AND or insert of float 0.0 to set these bits.
13675 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
13676 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
13677 // If this is an insertion of 32-bits into the low 32-bits of
13678 // a vector, we prefer to generate a blend with immediate rather
13679 // than an insertps. Blends are simpler operations in hardware and so
13680 // will always have equal or better performance than insertps.
13681 // But if optimizing for size and there's a load folding opportunity,
13682 // generate insertps because blendps does not have a 32-bit memory operand.
13684 N2 = DAG.getIntPtrConstant(1, dl);
13685 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13686 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
13688 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
13689 // Create this as a scalar to vector.
13690 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13691 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13694 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13695 // PINSR* works with constant index.
13700 if (EltVT == MVT::i8)
13703 if (EltVT.getSizeInBits() == 16) {
13704 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13705 // as its second argument.
13706 if (N1.getValueType() != MVT::i32)
13707 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13708 if (N2.getValueType() != MVT::i32)
13709 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13710 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13715 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13717 MVT OpVT = Op.getSimpleValueType();
13719 // If this is a wider-than-128-bit vector result, first insert into a 128-bit
13720 // vector and then insert that into the full-width vector.
13721 if (!OpVT.is128BitVector()) {
13722 // Insert into a 128-bit vector.
13723 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13724 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13725 OpVT.getVectorNumElements() / SizeFactor);
13727 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13729 // Insert the 128-bit vector.
13730 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
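// For the 128-bit case, any-extend the scalar to i32 and build a v4i32 first;
// the extra bits introduced by the extension only land in lanes that
// SCALAR_TO_VECTOR leaves undefined anyway.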
13733 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13734 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13735 return DAG.getBitcast(
13736 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
13739 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13740 // a simple subregister reference or explicit instructions to grab
13741 // upper bits of a vector.
13742 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13743 SelectionDAG &DAG) {
13744 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
13747 SDValue In = Op.getOperand(0);
13748 SDValue Idx = Op.getOperand(1);
13749 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13750 MVT ResVT = Op.getSimpleValueType();
13752 assert((In.getSimpleValueType().is256BitVector() ||
13753 In.getSimpleValueType().is512BitVector()) &&
13754 "Can only extract from 256-bit or 512-bit vectors");
13756 if (ResVT.is128BitVector())
13757 return extract128BitVector(In, IdxVal, DAG, dl);
13758 if (ResVT.is256BitVector())
13759 return extract256BitVector(In, IdxVal, DAG, dl);
13761 llvm_unreachable("Unimplemented!");
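// Returns true if every user of node N is one of the nodes in ValidUsers.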
13764 static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
13765 for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
13766 if (llvm::all_of(ValidUsers,
13767 [&I](SDValue V) { return V.getNode() != *I; }))
13772 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13773 // simple superregister reference or explicit instructions to insert
13774 // the upper bits of a vector.
13775 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13776 SelectionDAG &DAG) {
13777 assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
13780 SDValue Vec = Op.getOperand(0);
13781 SDValue SubVec = Op.getOperand(1);
13782 SDValue Idx = Op.getOperand(2);
13784 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13785 MVT OpVT = Op.getSimpleValueType();
13786 MVT SubVecVT = SubVec.getSimpleValueType();
13788 if (OpVT.getVectorElementType() == MVT::i1)
13789 return insert1BitVector(Op, DAG, Subtarget);
13791 assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13792 "Can only insert into 256-bit or 512-bit vectors");
13794 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
13795 // load:
13796 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13797 //                   (load16 addr + 16), Elts/2)
13798 // --> load32 addr
13799 // or:
13800 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13801 //                   (load32 addr + 32), Elts/2)
13802 // --> load64 addr
13803 // or a 16-byte or 32-byte broadcast:
13804 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13805 //                   (load16 addr), Elts/2)
13806 // --> X86SubVBroadcast(load16 addr)
13807 // or:
13808 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13809 //                   (load32 addr), Elts/2)
13810 // --> X86SubVBroadcast(load32 addr)
13811 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13812 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13813 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
13814 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
13815 if (Idx2 && Idx2->getZExtValue() == 0) {
13816 SDValue SubVec2 = Vec.getOperand(1);
13817 // If needed, look through bitcasts to get to the load.
13818 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
13820 unsigned Alignment = FirstLd->getAlignment();
13821 unsigned AS = FirstLd->getAddressSpace();
13822 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
13823 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
13824 OpVT, AS, Alignment, &Fast) && Fast) {
13825 SDValue Ops[] = {SubVec2, SubVec};
13826 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
13830 // If lower/upper loads are the same and the only users of the load, then
13831 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
13832 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
13833 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
13834 areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
13835 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
13838 // If this is a subv_broadcast inserted into both halves, use a larger subv_broadcast.
13840 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
13841 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
13842 SubVec.getOperand(0));
13847 if (SubVecVT.is128BitVector())
13848 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13850 if (SubVecVT.is256BitVector())
13851 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13853 llvm_unreachable("Unimplemented!");
13856 // Returns the appropriate wrapper opcode for a global reference.
13857 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
13858 // References to absolute symbols are never PC-relative.
13859 if (GV && GV->isAbsoluteSymbolRef())
13860 return X86ISD::Wrapper;
13862 CodeModel::Model M = getTargetMachine().getCodeModel();
13863 if (Subtarget.isPICStyleRIPRel() &&
13864 (M == CodeModel::Small || M == CodeModel::Kernel))
13865 return X86ISD::WrapperRIP;
13867 return X86ISD::Wrapper;
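// Illustrative example (assumed, not from the source): under WrapperRIP a
// global address is later selected into a RIP-relative form such as
//   leaq foo(%rip), %rax
// whereas the plain Wrapper form materializes an absolute address, e.g.
//   movl $foo, %eax
// which is why references to absolute symbols must always take the Wrapper path.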
13870 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13871 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13872 // one of the above mentioned nodes. It has to be wrapped because otherwise
13873 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13874 // be used to form an addressing mode. These wrapped nodes will be selected
13875 // into MOV32ri.
13877 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13878 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13880 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13881 // global base reg.
13882 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13884 auto PtrVT = getPointerTy(DAG.getDataLayout());
13885 SDValue Result = DAG.getTargetConstantPool(
13886 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
13888 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13889 // With PIC, the address is actually $g + Offset.
13892 DAG.getNode(ISD::ADD, DL, PtrVT,
13893 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
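// Hedged sketch (not from the source): on 32-bit ELF PIC the "$g + Offset"
// form typically ends up as a GOT-relative displacement off the PIC base
// register, e.g.
//   leal .LCPI0_0@GOTOFF(%ebx), %eax
// where %ebx holds the address of _GLOBAL_OFFSET_TABLE_ and .LCPI0_0 is a
// hypothetical constant-pool label.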
13899 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13900 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13902 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13903 // global base reg.
13904 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13906 auto PtrVT = getPointerTy(DAG.getDataLayout());
13907 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
13909 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13911 // With PIC, the address is actually $g + Offset.
13914 DAG.getNode(ISD::ADD, DL, PtrVT,
13915 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13921 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13922 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13924 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13925 // global base reg.
13926 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
13927 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
13929 auto PtrVT = getPointerTy(DAG.getDataLayout());
13930 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
13933 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13935 // With PIC, the address is actually $g + Offset.
13936 if (isPositionIndependent() && !Subtarget.is64Bit()) {
13938 DAG.getNode(ISD::ADD, DL, PtrVT,
13939 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13942 // For symbols that require a load from a stub to get the address, emit the load.
13944 if (isGlobalStubReference(OpFlag))
13945 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
13946 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
13952 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13953 // Create the TargetBlockAddress node.
13954 unsigned char OpFlags =
13955 Subtarget.classifyBlockAddressReference();
13956 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13957 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13959 auto PtrVT = getPointerTy(DAG.getDataLayout());
13960 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
13961 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
13963 // With PIC, the address is actually $g + Offset.
13964 if (isGlobalRelativeToPICBase(OpFlags)) {
13965 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13966 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
13972 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
13973 const SDLoc &dl, int64_t Offset,
13974 SelectionDAG &DAG) const {
13975 // Create the TargetGlobalAddress node, folding in the constant
13976 // offset if it is legal.
13977 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
13978 CodeModel::Model M = DAG.getTarget().getCodeModel();
13979 auto PtrVT = getPointerTy(DAG.getDataLayout());
13981 if (OpFlags == X86II::MO_NO_FLAG &&
13982 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13983 // A direct static reference to a global.
13984 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
13987 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
13990 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
13992 // With PIC, the address is actually $g + Offset.
13993 if (isGlobalRelativeToPICBase(OpFlags)) {
13994 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13995 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
13998 // For globals that require a load from a stub to get the address, emit the load.
14000 if (isGlobalStubReference(OpFlags))
14001 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14002 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14004 // If there was a non-zero offset that we didn't fold, create an explicit
14005 // addition for it.
14007 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14008 DAG.getConstant(Offset, dl, PtrVT));
14014 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14015 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14016 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14017 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14021 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14022 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14023 unsigned char OperandFlags, bool LocalDynamic = false) {
14024 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14025 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14027 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14028 GA->getValueType(0),
14032 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14036 SDValue Ops[] = { Chain, TGA, *InFlag };
14037 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14039 SDValue Ops[] = { Chain, TGA };
14040 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14043 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14044 MFI.setAdjustsStack(true);
14045 MFI.setHasCalls(true);
14047 SDValue Flag = Chain.getValue(1);
14048 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
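// Illustrative only (assumed): on x86-64 the general-dynamic TLSADDR node is
// eventually emitted as the standard sequence
//   leaq  x@TLSGD(%rip), %rdi
//   callq __tls_get_addr@PLT
// (with the data16/rex padding required for linker relaxation), and the
// resulting address comes back in %rax, which is the ReturnReg copied here.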
14051 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14053 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14056 SDLoc dl(GA); // ? function entry point might be better
14057 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14058 DAG.getNode(X86ISD::GlobalBaseReg,
14059 SDLoc(), PtrVT), InFlag);
14060 InFlag = Chain.getValue(1);
14062 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14065 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14067 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14069 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14070 X86::RAX, X86II::MO_TLSGD);
14073 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14079 // Get the start address of the TLS block for this module.
14080 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14081 .getInfo<X86MachineFunctionInfo>();
14082 MFI->incNumLocalDynamicTLSAccesses();
14086 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14087 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14090 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14091 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14092 InFlag = Chain.getValue(1);
14093 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14094 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14097 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of Base.
14101 unsigned char OperandFlags = X86II::MO_DTPOFF;
14102 unsigned WrapperKind = X86ISD::Wrapper;
14103 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14104 GA->getValueType(0),
14105 GA->getOffset(), OperandFlags);
14106 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14108 // Add x@dtpoff with the base.
14109 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
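// A rough sketch of the final code for the 64-bit local-dynamic case
// (illustrative, not taken from the source):
//   leaq  x@TLSLD(%rip), %rdi
//   callq __tls_get_addr@PLT     # Base: start of this module's TLS block
//   leaq  x@DTPOFF(%rax), %rax   # add x's offset within that block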
14112 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14113 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14114 const EVT PtrVT, TLSModel::Model model,
14115 bool is64Bit, bool isPIC) {
14118 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14119 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14120 is64Bit ? 257 : 256));
14122 SDValue ThreadPointer =
14123 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14124 MachinePointerInfo(Ptr));
14126 unsigned char OperandFlags = 0;
14127 // Most TLS accesses are not RIP relative, even on x86-64. One exception is initial exec on 64-bit.
14129 unsigned WrapperKind = X86ISD::Wrapper;
14130 if (model == TLSModel::LocalExec) {
14131 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14132 } else if (model == TLSModel::InitialExec) {
14134 OperandFlags = X86II::MO_GOTTPOFF;
14135 WrapperKind = X86ISD::WrapperRIP;
14137 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14140 llvm_unreachable("Unexpected model");
14143 // emit "addl x@ntpoff,%eax" (local exec)
14144 // or "addl x@indntpoff,%eax" (initial exec)
14145 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14147 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14148 GA->getOffset(), OperandFlags);
14149 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14151 if (model == TLSModel::InitialExec) {
14152 if (isPIC && !is64Bit) {
14153 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14154 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14158 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14159 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14162 // The address of the thread local variable is the add of the thread
14163 // pointer with the offset of the variable.
14164 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
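// For the 64-bit variants (illustrative sketch, not from the source):
//   local exec:    movq %fs:0, %rax
//                  leaq x@TPOFF(%rax), %rax
//   initial exec:  movq %fs:0, %rax
//                  addq x@GOTTPOFF(%rip), %rax
// i.e. the thread pointer plus either a link-time or a GOT-loaded offset,
// mirroring the ThreadPointer + Offset ADD built above.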
14168 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14170 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14172 if (DAG.getTarget().Options.EmulatedTLS)
14173 return LowerToTLSEmulatedModel(GA, DAG);
14175 const GlobalValue *GV = GA->getGlobal();
14176 auto PtrVT = getPointerTy(DAG.getDataLayout());
14177 bool PositionIndependent = isPositionIndependent();
14179 if (Subtarget.isTargetELF()) {
14180 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14182 case TLSModel::GeneralDynamic:
14183 if (Subtarget.is64Bit())
14184 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14185 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14186 case TLSModel::LocalDynamic:
14187 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14188 Subtarget.is64Bit());
14189 case TLSModel::InitialExec:
14190 case TLSModel::LocalExec:
14191 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14192 PositionIndependent);
14194 llvm_unreachable("Unknown TLS model.");
14197 if (Subtarget.isTargetDarwin()) {
14198 // Darwin only has one model of TLS. Lower to that.
14199 unsigned char OpFlag = 0;
14200 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14201 X86ISD::WrapperRIP : X86ISD::Wrapper;
14203 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14204 // global base reg.
14205 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14207 OpFlag = X86II::MO_TLVP_PIC_BASE;
14209 OpFlag = X86II::MO_TLVP;
14211 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14212 GA->getValueType(0),
14213 GA->getOffset(), OpFlag);
14214 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14216 // With PIC32, the address is actually $g + Offset.
14218 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14219 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14222 // Lowering the machine ISD will make sure everything is in the right location.
14224 SDValue Chain = DAG.getEntryNode();
14225 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14226 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14227 SDValue Args[] = { Chain, Offset };
14228 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14229 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14230 DAG.getIntPtrConstant(0, DL, true),
14231 Chain.getValue(1), DL);
14233 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14234 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14235 MFI.setAdjustsStack(true);
14237 // And our return value (TLS address) is in the standard call return value location.
14239 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14240 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14243 if (Subtarget.isTargetKnownWindowsMSVC() ||
14244 Subtarget.isTargetWindowsItanium() ||
14245 Subtarget.isTargetWindowsGNU()) {
14246 // Just use the implicit TLS architecture.
14247 // Need to generate something similar to:
14248 // mov rdx, qword [gs:abs 58H] ; Load pointer to ThreadLocalStorage
14249 //                             ; from TEB
14250 // mov ecx, dword [rel _tls_index] ; Load index (from C runtime)
14251 // mov rcx, qword [rdx+rcx*8]
14252 // mov eax, .tls$:tlsvar
14253 // [rax+rcx] contains the address
14254 // Windows 64bit: gs:0x58
14255 // Windows 32bit: fs:__tls_array
14258 SDValue Chain = DAG.getEntryNode();
14260 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14261 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14262 // use its literal value of 0x2C.
14263 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14264 ? Type::getInt8PtrTy(*DAG.getContext(),
14266 : Type::getInt32PtrTy(*DAG.getContext(),
14269 SDValue TlsArray = Subtarget.is64Bit()
14270 ? DAG.getIntPtrConstant(0x58, dl)
14271 : (Subtarget.isTargetWindowsGNU()
14272 ? DAG.getIntPtrConstant(0x2C, dl)
14273 : DAG.getExternalSymbol("_tls_array", PtrVT));
14275 SDValue ThreadPointer =
14276 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
14279 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14280 res = ThreadPointer;
14282 // Load the _tls_index variable
14283 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14284 if (Subtarget.is64Bit())
14285 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14286 MachinePointerInfo(), MVT::i32);
14288 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
14290 auto &DL = DAG.getDataLayout();
14292 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14293 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14295 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14298 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14300 // Get the offset of start of .tls section
14301 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14302 GA->getValueType(0),
14303 GA->getOffset(), X86II::MO_SECREL);
14304 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14306 // The address of the thread local variable is the add of the thread
14307 // pointer with the offset of the variable.
14308 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14311 llvm_unreachable("TLS not implemented for this target.");
14314 /// Lower SRA_PARTS and friends, which return two i32 values
14315 /// and take a 2 x i32 value to shift plus a shift amount.
14316 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14317 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14318 MVT VT = Op.getSimpleValueType();
14319 unsigned VTBits = VT.getSizeInBits();
14321 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14322 SDValue ShOpLo = Op.getOperand(0);
14323 SDValue ShOpHi = Op.getOperand(1);
14324 SDValue ShAmt = Op.getOperand(2);
14325 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14326 // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away during isel.
14328 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14329 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14330 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14331 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14332 : DAG.getConstant(0, dl, VT);
14334 SDValue Tmp2, Tmp3;
14335 if (Op.getOpcode() == ISD::SHL_PARTS) {
14336 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14337 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14339 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14340 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14343 // If the shift amount is greater than or equal to the width of a part, we can't
14344 // rely on the results of shld/shrd. Insert a test and select the appropriate
14345 // values for large shift amounts.
14346 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14347 DAG.getConstant(VTBits, dl, MVT::i8));
14348 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14349 AndNode, DAG.getConstant(0, dl, MVT::i8));
14352 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14353 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14354 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14356 if (Op.getOpcode() == ISD::SHL_PARTS) {
14357 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14358 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14360 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14361 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14364 SDValue Ops[2] = { Lo, Hi };
14365 return DAG.getMergeValues(Ops, dl);
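// A rough sketch of what this produces for a 64-bit SHL_PARTS on a 32-bit
// target (illustrative; register choices are hypothetical):
//   shldl %cl, %eax, %edx      # Hi = shld(Hi, Lo, Amt)
//   shll  %cl, %eax            # Lo = Lo << (Amt & 31)
//   testb $32, %cl             # did the amount cover a whole part?
//   # if so, CMOVs replace Hi with the shifted Lo and Lo with 0.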
14368 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14369 SelectionDAG &DAG) const {
14370 SDValue Src = Op.getOperand(0);
14371 MVT SrcVT = Src.getSimpleValueType();
14372 MVT VT = Op.getSimpleValueType();
14375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14376 if (SrcVT.isVector()) {
14377 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14378 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14379 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14380 DAG.getUNDEF(SrcVT)));
14382 if (SrcVT.getVectorElementType() == MVT::i1) {
14383 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14384 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14385 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14386 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14387 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14388 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14393 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14394 "Unknown SINT_TO_FP to lower!");
14396 // These are really Legal; return the operand so the caller accepts it as Legal.
14398 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14400 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14401 Subtarget.is64Bit()) {
14405 SDValue ValueToStore = Op.getOperand(0);
14406 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14407 !Subtarget.is64Bit())
14408 // Bitcasting to f64 here allows us to do a single 64-bit store from
14409 // an SSE register, avoiding the store forwarding penalty that would come
14410 // with two 32-bit stores.
14411 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14413 unsigned Size = SrcVT.getSizeInBits()/8;
14414 MachineFunction &MF = DAG.getMachineFunction();
14415 auto PtrVT = getPointerTy(MF.getDataLayout());
14416 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14417 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14418 SDValue Chain = DAG.getStore(
14419 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14420 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14421 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14424 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14426 SelectionDAG &DAG) const {
14430 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14432 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14434 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14436 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14438 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14439 MachineMemOperand *MMO;
14441 int SSFI = FI->getIndex();
14442 MMO = DAG.getMachineFunction().getMachineMemOperand(
14443 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14444 MachineMemOperand::MOLoad, ByteSize, ByteSize);
14446 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14447 StackSlot = StackSlot.getOperand(1);
14449 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14450 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14452 Tys, Ops, SrcVT, MMO);
14455 Chain = Result.getValue(1);
14456 SDValue InFlag = Result.getValue(2);
14458 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14459 // shouldn't be necessary except that RFP cannot be live across
14460 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14461 MachineFunction &MF = DAG.getMachineFunction();
14462 unsigned SSFISize = Op.getValueSizeInBits()/8;
14463 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14464 auto PtrVT = getPointerTy(MF.getDataLayout());
14465 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14466 Tys = DAG.getVTList(MVT::Other);
14468 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14470 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14471 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14472 MachineMemOperand::MOStore, SSFISize, SSFISize);
14474 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14475 Ops, Op.getValueType(), MMO);
14476 Result = DAG.getLoad(
14477 Op.getValueType(), DL, Chain, StackSlot,
14478 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14484 /// 64-bit unsigned integer to double expansion.
14485 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14486 SelectionDAG &DAG) const {
14487 // This algorithm is not obvious. Here is what we're trying to output:
14488 /*
14489    movq       %rax,  %xmm0
14490    punpckldq  (c0),  %xmm0  // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14491    subpd      (c1),  %xmm0  // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14492    #ifdef __SSE3__
14493      haddpd   %xmm0, %xmm0
14494    #else
14495      pshufd   $0x4e, %xmm0, %xmm1
14496      addpd    %xmm1, %xmm0
14497    #endif
14498 */
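// Why the magic constants work (worked out here for reference, not from the
// source): interpreting 0x43300000:lo32 as a double gives 0x1.0p52 + lo32, and
// 0x45300000:hi32 gives 0x1.0p84 + hi32 * 0x1.0p32. Subtracting the c1
// constants leaves { (double)lo32, (double)hi32 * 2^32 }, and the horizontal
// add reconstructs (double)(hi32 * 2^32 + lo32) with a single final rounding.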
14501 LLVMContext *Context = DAG.getContext();
14503 // Build some magic constants.
14504 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14505 Constant *C0 = ConstantDataVector::get(*Context, CV0);
14506 auto PtrVT = getPointerTy(DAG.getDataLayout());
14507 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
14509 SmallVector<Constant*,2> CV1;
14511 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14512 APInt(64, 0x4330000000000000ULL))));
14514 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14515 APInt(64, 0x4530000000000000ULL))));
14516 Constant *C1 = ConstantVector::get(CV1);
14517 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
14519 // Load the 64-bit value into an XMM register.
14520 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14523 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14524 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14525 /* Alignment = */ 16);
14527 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
14530 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14531 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14532 /* Alignment = */ 16);
14533 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
14534 // TODO: Are there any fast-math-flags to propagate here?
14535 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14538 if (Subtarget.hasSSE3()) {
14539 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14540 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14542 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
14543 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
14544 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14545 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
14548 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14549 DAG.getIntPtrConstant(0, dl));
14552 /// 32-bit unsigned integer to float expansion.
14553 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14554 SelectionDAG &DAG) const {
14556 // FP constant to bias correct the final result.
14557 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
14560 // Load the 32-bit value into an XMM register.
14561 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14564 // Zero out the upper parts of the register.
14565 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14567 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14568 DAG.getBitcast(MVT::v2f64, Load),
14569 DAG.getIntPtrConstant(0, dl));
14571 // Or the load with the bias.
14572 SDValue Or = DAG.getNode(
14573 ISD::OR, dl, MVT::v2i64,
14574 DAG.getBitcast(MVT::v2i64,
14575 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
14576 DAG.getBitcast(MVT::v2i64,
14577 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
14579 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14580 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
14582 // Subtract the bias.
14583 // TODO: Are there any fast-math-flags to propagate here?
14584 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
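// For reference (not from the source): OR'ing the 32-bit value into the low
// mantissa bits of 0x1.0p52 yields exactly 2^52 + x, since any x < 2^32 fits
// in the 52-bit mantissa; subtracting the 2^52 bias therefore produces
// (double)x with no rounding at all.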
14586 // Handle final rounding.
14587 MVT DestVT = Op.getSimpleValueType();
14589 if (DestVT.bitsLT(MVT::f64))
14590 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14591 DAG.getIntPtrConstant(0, dl));
14592 if (DestVT.bitsGT(MVT::f64))
14593 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14595 // Handle final rounding.
14599 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
14600 const X86Subtarget &Subtarget, SDLoc &DL) {
14601 if (Op.getSimpleValueType() != MVT::v2f64)
14604 SDValue N0 = Op.getOperand(0);
14605 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
14607 // Legalize to v4i32 type.
14608 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
14609 DAG.getUNDEF(MVT::v2i32));
14611 if (Subtarget.hasAVX512())
14612 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
14614 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
14615 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
14616 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
14617 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
14619 // Two to the power of half-word-size.
14620 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
14622 // Clear upper part of LO, lower HI.
14623 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
14624 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
14626 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
14627 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
14628 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
14630 // Add the two halves.
14631 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
14634 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14635 const X86Subtarget &Subtarget) {
14636 // The algorithm is the following:
14637 // #ifdef __SSE4_1__
14638 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14639 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14640 // (uint4) 0x53000000, 0xaa);
14642 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14643 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14645 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14646 // return (float4) lo + fhi;
14648 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
14649 // reassociate the two FADDs, and if we do that, the algorithm fails
14650 // spectacularly (PR24512).
14651 // FIXME: If we ever have some kind of Machine FMF, this should be marked
14652 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
14653 // there's also the MachineCombiner reassociations happening on Machine IR.
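// Worked numbers for the constants used below (for reference; not from the
// source): 0x4b000000 is 2^23 and 0x53000000 is 2^39 when reinterpreted as
// floats, so
//   lo = 2^23 + (v & 0xffff)   and   hi = 2^39 + (v >> 16) * 2^16.
// fhi = hi - (2^39 + 2^23) = (v >> 16) * 2^16 - 2^23, and adding lo gives
// (v >> 16) * 2^16 + (v & 0xffff) = v, rounded only once at the final FADD.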
14654 if (DAG.getTarget().Options.UnsafeFPMath)
14658 SDValue V = Op->getOperand(0);
14659 MVT VecIntVT = V.getSimpleValueType();
14660 bool Is128 = VecIntVT == MVT::v4i32;
14661 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14662 // If we convert to something else than the supported type, e.g., to v4f64,
14664 if (VecFloatVT != Op->getSimpleValueType(0))
14667 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14668 "Unsupported custom type");
14670 // In the #ifdef/#else code, we have in common:
14671 // - The vector of constants:
14672 // -- 0x4b000000
14673 // -- 0x53000000
14674 // - A shift:
14675 // -- v >> 16
14677 // Create the splat vector for 0x4b000000.
14678 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
14679 // Create the splat vector for 0x53000000.
14680 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
14682 // Create the right shift.
14683 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
14684 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14687 if (Subtarget.hasSSE41()) {
14688 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14689 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14690 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
14691 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
14692 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
14694 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14695 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14696 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14697 // (uint4) 0x53000000, 0xaa);
14698 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
14699 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
14700 // High will be bitcasted right away, so do not bother bitcasting back to
14701 // its original type.
14702 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14703 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14705 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
14706 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14707 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14708 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14710 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14711 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14714 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14715 SDValue VecCstFAdd = DAG.getConstantFP(
14716 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
14718 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14719 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
14720 // TODO: Are there any fast-math-flags to propagate here?
14722 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14723 // return (float4) lo + fhi;
14724 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
14725 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14728 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14729 SelectionDAG &DAG) const {
14730 SDValue N0 = Op.getOperand(0);
14731 MVT SrcVT = N0.getSimpleValueType();
14734 if (SrcVT.getVectorElementType() == MVT::i1) {
14735 if (SrcVT == MVT::v2i1)
14736 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14737 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
14738 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14739 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14740 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
14743 switch (SrcVT.SimpleTy) {
14745 llvm_unreachable("Custom UINT_TO_FP is not supported!");
14750 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14751 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14752 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14755 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
14758 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
14761 assert(Subtarget.hasAVX512());
14762 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14763 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
14767 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14768 SelectionDAG &DAG) const {
14769 SDValue N0 = Op.getOperand(0);
14771 auto PtrVT = getPointerTy(DAG.getDataLayout());
14773 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14774 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14775 // the optimization here.
14776 if (DAG.SignBitIsZero(N0))
14777 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14779 if (Op.getSimpleValueType().isVector())
14780 return lowerUINT_TO_FP_vec(Op, DAG);
14782 MVT SrcVT = N0.getSimpleValueType();
14783 MVT DstVT = Op.getSimpleValueType();
14785 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
14786 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
14787 // Conversions from unsigned i32 to f32/f64 are legal,
14788 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
14792 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14793 return LowerUINT_TO_FP_i64(Op, DAG);
14794 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14795 return LowerUINT_TO_FP_i32(Op, DAG);
14796 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14799 // Make a 64-bit buffer, and use it to build an FILD.
14800 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14801 if (SrcVT == MVT::i32) {
14802 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
14803 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14804 StackSlot, MachinePointerInfo());
14805 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
14806 OffsetSlot, MachinePointerInfo());
14807 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
14811 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14812 SDValue ValueToStore = Op.getOperand(0);
14813 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
14814 // Bitcasting to f64 here allows us to do a single 64-bit store from
14815 // an SSE register, avoiding the store forwarding penalty that would come
14816 // with two 32-bit stores.
14817 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14818 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14819 MachinePointerInfo());
14820 // For i64 source, we need to add the appropriate power of 2 if the input
14821 // was negative. This is the same as the optimization in
14822 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14823 // we must be careful to do the computation in x87 extended precision, not
14824 // in SSE. (The generic code can't know it's OK to do this, or how to.)
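// For reference (not from the source): 0x5F800000 is 2^64 as a float. FILD
// reads the buffer as a signed i64, so an input with the sign bit set comes in
// as (x - 2^64); conditionally adding the 2^64 fudge factor in x87 extended
// precision restores the unsigned value before the final rounding.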
14825 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14826 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14828 MachineMemOperand::MOLoad, 8, 8);
14830 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14831 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14832 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
14835 APInt FF(32, 0x5F800000ULL);
14837 // Check whether the sign bit is set.
14838 SDValue SignSet = DAG.getSetCC(
14839 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
14840 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
14842 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14843 SDValue FudgePtr = DAG.getConstantPool(
14844 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
14846 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
14847 SDValue Zero = DAG.getIntPtrConstant(0, dl);
14848 SDValue Four = DAG.getIntPtrConstant(4, dl);
14849 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14851 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
14853 // Load the value out, extending it from f32 to f80.
14854 // FIXME: Avoid the extend by constructing the right constant pool?
14855 SDValue Fudge = DAG.getExtLoad(
14856 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
14857 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
14858 /* Alignment = */ 4);
14859 // Extend everything to 80 bits to force it to be done on x87.
14860 // TODO: Are there any fast-math-flags to propagate here?
14861 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
14862 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
14863 DAG.getIntPtrConstant(0, dl));
14866 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
14867 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
14868 // just return an <SDValue(), SDValue()> pair.
14869 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
14870 // to i16, i32 or i64, and we lower it to a legal sequence.
14871 // If lowered to the final integer result we return a <result, SDValue()> pair.
14872 // Otherwise we lower it to a sequence ending with a FIST, return a
14873 // <FIST, StackSlot> pair, and the caller is responsible for loading
14874 // the final integer result from StackSlot.
14875 std::pair<SDValue,SDValue>
14876 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14877 bool IsSigned, bool IsReplace) const {
14880 EVT DstTy = Op.getValueType();
14881 EVT TheVT = Op.getOperand(0).getValueType();
14882 auto PtrVT = getPointerTy(DAG.getDataLayout());
14884 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
14885 // f16 must be promoted before using the lowering in this routine.
14886 // fp128 does not use this lowering.
14887 return std::make_pair(SDValue(), SDValue());
14890 // If using FIST to compute an unsigned i64, we'll need some fixup
14891 // to handle values above the maximum signed i64. A FIST is always
14892 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
14893 bool UnsignedFixup = !IsSigned &&
14894 DstTy == MVT::i64 &&
14895 (!Subtarget.is64Bit() ||
14896 !isScalarFPTypeInSSEReg(TheVT));
14898 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
14899 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
14900 // The low 32 bits of the fist result will have the correct uint32 result.
14901 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14905 assert(DstTy.getSimpleVT() <= MVT::i64 &&
14906 DstTy.getSimpleVT() >= MVT::i16 &&
14907 "Unknown FP_TO_INT to lower!");
14909 // These are really Legal.
14910 if (DstTy == MVT::i32 &&
14911 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14912 return std::make_pair(SDValue(), SDValue());
14913 if (Subtarget.is64Bit() &&
14914 DstTy == MVT::i64 &&
14915 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14916 return std::make_pair(SDValue(), SDValue());
14918 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
14920 MachineFunction &MF = DAG.getMachineFunction();
14921 unsigned MemSize = DstTy.getSizeInBits()/8;
14922 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
14923 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14926 switch (DstTy.getSimpleVT().SimpleTy) {
14927 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14928 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14929 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14930 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14933 SDValue Chain = DAG.getEntryNode();
14934 SDValue Value = Op.getOperand(0);
14935 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
14937 if (UnsignedFixup) {
14939 // Conversion to unsigned i64 is implemented with a select,
14940 // depending on whether the source value fits in the range
14941 // of a signed i64. Let Thresh be the FP equivalent of
14942 // 0x8000000000000000ULL.
14944 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
14945 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
14946 // Fist-to-mem64 FistSrc
14947 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
14948 // to XOR'ing the high 32 bits with Adjust.
14950 // Being a power of 2, Thresh is exactly representable in all FP formats.
14951 // For X87 we'd like to use the smallest FP type for this constant, but
14952 // for DAG type consistency we have to match the FP operand type.
14954 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
14955 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
14956 bool LosesInfo = false;
14957 if (TheVT == MVT::f64)
14958 // The rounding mode is irrelevant as the conversion should be exact.
14959 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
14961 else if (TheVT == MVT::f80)
14962 Status = Thresh.convert(APFloat::x87DoubleExtended(),
14963 APFloat::rmNearestTiesToEven, &LosesInfo);
14965 assert(Status == APFloat::opOK && !LosesInfo &&
14966 "FP conversion should have been exact");
14968 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
14970 SDValue Cmp = DAG.getSetCC(DL,
14971 getSetCCResultType(DAG.getDataLayout(),
14972 *DAG.getContext(), TheVT),
14973 Value, ThreshVal, ISD::SETLT);
14974 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
14975 DAG.getConstant(0, DL, MVT::i32),
14976 DAG.getConstant(0x80000000, DL, MVT::i32));
14977 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
14978 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
14979 *DAG.getContext(), TheVT),
14980 Value, ThreshVal, ISD::SETLT);
14981 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
14984 // FIXME This causes a redundant load/store if the SSE-class value is already
14985 // in memory, such as if it is on the callstack.
14986 if (isScalarFPTypeInSSEReg(TheVT)) {
14987 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14988 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14989 MachinePointerInfo::getFixedStack(MF, SSFI));
14990 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14992 Chain, StackSlot, DAG.getValueType(TheVT)
14995 MachineMemOperand *MMO =
14996 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
14997 MachineMemOperand::MOLoad, MemSize, MemSize);
14998 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14999 Chain = Value.getValue(1);
15000 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15001 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15004 MachineMemOperand *MMO =
15005 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15006 MachineMemOperand::MOStore, MemSize, MemSize);
15008 if (UnsignedFixup) {
15010 // Insert the FIST, load its result as two i32's,
15011 // and XOR the high i32 with Adjust.
15013 SDValue FistOps[] = { Chain, Value, StackSlot };
15014 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15015 FistOps, DstTy, MMO);
15018 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15019 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15022 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15023 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15025 if (Subtarget.is64Bit()) {
15026 // Join High32 and Low32 into a 64-bit result.
15027 // (High32 << 32) | Low32
15028 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15029 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15030 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15031 DAG.getConstant(32, DL, MVT::i8));
15032 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15033 return std::make_pair(Result, SDValue());
15036 SDValue ResultOps[] = { Low32, High32 };
15038 SDValue pair = IsReplace
15039 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15040 : DAG.getMergeValues(ResultOps, DL);
15041 return std::make_pair(pair, SDValue());
15043 // Build the FP_TO_INT*_IN_MEM
15044 SDValue Ops[] = { Chain, Value, StackSlot };
15045 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15047 return std::make_pair(FIST, StackSlot);
15051 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15052 const X86Subtarget &Subtarget) {
15053 MVT VT = Op->getSimpleValueType(0);
15054 SDValue In = Op->getOperand(0);
15055 MVT InVT = In.getSimpleValueType();
15058 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15059 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15061 // Optimize vectors in AVX mode:
15063 // v8i16 -> v8i32:
15064 // Use vpunpcklwd for the 4 lower elements v8i16 -> v4i32.
15065 // Use vpunpckhwd for the 4 upper elements v8i16 -> v4i32.
15066 // Concat upper and lower parts.
15068 // v4i32 -> v4i64:
15069 // Use vpunpckldq for the 2 lower elements v4i32 -> v2i64.
15070 // Use vpunpckhdq for the 2 upper elements v4i32 -> v2i64.
15071 // Concat upper and lower parts.
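// Illustrative sketch of the AVX1 path for a v8i16 -> v8i32 zero extend
// (register choices are hypothetical, not from the source):
//   vpxor       %xmm1, %xmm1, %xmm1
//   vpunpcklwd  %xmm1, %xmm0, %xmm2      ; low 4 elements
//   vpunpckhwd  %xmm1, %xmm0, %xmm0      ; high 4 elements
//   vinsertf128 $1, %xmm0, %ymm2, %ymm0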
15074 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15075 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15076 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15079 if (Subtarget.hasInt256())
15080 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15082 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15083 SDValue Undef = DAG.getUNDEF(InVT);
15084 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15085 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15086 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15088 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15089 VT.getVectorNumElements()/2);
15091 OpLo = DAG.getBitcast(HVT, OpLo);
15092 OpHi = DAG.getBitcast(HVT, OpHi);
15094 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15097 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15098 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15099 MVT VT = Op->getSimpleValueType(0);
15100 SDValue In = Op->getOperand(0);
15101 MVT InVT = In.getSimpleValueType();
15103 unsigned NumElts = VT.getVectorNumElements();
15104 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
15107 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
15108 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15110 assert(InVT.getVectorElementType() == MVT::i1);
15112 // Extend VT if the target is a 256- or 128-bit vector and VLX is not supported.
15114 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15115 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15118 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15120 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15122 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15124 return SelectedVal;
15125 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15128 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15129 SelectionDAG &DAG) {
15130 if (Subtarget.hasFp256())
15131 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15137 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15138 SelectionDAG &DAG) {
15140 MVT VT = Op.getSimpleValueType();
15141 SDValue In = Op.getOperand(0);
15142 MVT SVT = In.getSimpleValueType();
15144 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15145 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15147 if (Subtarget.hasFp256())
15148 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15151 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15152 VT.getVectorNumElements() != SVT.getVectorNumElements());
15156 /// Helper to recursively truncate vector elements in half with PACKSS.
15157 /// It makes use of the fact that vector comparison results will be all-zeros
15158 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15159 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15160 /// within each 128-bit lane.
15161 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15164 const X86Subtarget &Subtarget) {
15165 // Requires SSE2 but AVX512 has fast truncate.
15166 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15169 EVT SrcVT = In.getValueType();
15171 // No truncation required, we might get here due to recursive calls.
15172 if (SrcVT == DstVT)
15175 // We only support vector truncation to 128 bits or greater from a
15176 // 256-bit or greater source.
15177 if ((DstVT.getSizeInBits() % 128) != 0)
15179 if ((SrcVT.getSizeInBits() % 256) != 0)
15182 unsigned NumElems = SrcVT.getVectorNumElements();
15183 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15184 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15187 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15189 // Extract lower/upper subvectors.
15190 unsigned NumSubElts = NumElems / 2;
15191 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15192 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15193 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15195 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15196 if (SrcVT.is256BitVector()) {
15197 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15198 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15199 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15200 return DAG.getBitcast(DstVT, Res);
15203 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15204 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15205 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15206 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15207 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15208 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15210 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15211 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15212 Res = DAG.getBitcast(MVT::v4i64, Res);
15213 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15215 if (DstVT.is256BitVector())
15216 return DAG.getBitcast(DstVT, Res);
15218 // If 512bit -> 128bit truncate another stage.
15219 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15220 Res = DAG.getBitcast(PackedVT, Res);
15221 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15224 // Recursively pack lower/upper subvectors, concat result and pack again.
15225 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15226 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15227 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15228 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15230 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15231 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15232 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
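// Usage sketch (illustrative, not from the source): truncating a v16i16
// comparison mask to v16i8 extracts the two 128-bit halves and emits a single
// PACKSSWB; because every lane is all-zeros or all-ones, the signed saturation
// in PACKSS preserves the mask bits exactly.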
15235 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15236 const X86Subtarget &Subtarget) {
15239 MVT VT = Op.getSimpleValueType();
15240 SDValue In = Op.getOperand(0);
15241 MVT InVT = In.getSimpleValueType();
15243 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15245 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15246 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15247 if (InVT.getScalarSizeInBits() <= 16) {
15248 if (Subtarget.hasBWI()) {
15249 // Legal; this will be selected to VPMOVB2M/VPMOVW2M.
15250 // Shifting packed bytes is not supported natively, so bitcast to words.
15251 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15252 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15253 DAG.getBitcast(ExtVT, In),
15254 DAG.getConstant(ShiftInx, DL, ExtVT));
15255 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15256 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15258 // Use TESTD/Q, extended vector to packed dword/qword.
15259 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15260 "Unexpected vector type.");
15261 unsigned NumElts = InVT.getVectorNumElements();
15262 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15263 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15265 ShiftInx = InVT.getScalarSizeInBits() - 1;
15268 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15269 DAG.getConstant(ShiftInx, DL, InVT));
15270 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
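// Illustrative example (assumed): with BWI, a v32i16 -> v32i1 truncate becomes
//   vpsllw   $15, %zmm0, %zmm0   ; move each element's LSB into its sign bit
//   vpmovw2m %zmm0, %k0          ; collect the sign bits into a mask register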
15273 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15275 MVT VT = Op.getSimpleValueType();
15276 SDValue In = Op.getOperand(0);
15277 MVT InVT = In.getSimpleValueType();
15279 if (VT == MVT::i1) {
15280 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15281 "Invalid scalar TRUNCATE operation");
15282 if (InVT.getSizeInBits() >= 32)
15284 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15285 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15287 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15288 "Invalid TRUNCATE operation");
15290 if (VT.getVectorElementType() == MVT::i1)
15291 return LowerTruncateVecI1(Op, DAG, Subtarget);
15293 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15294 if (Subtarget.hasAVX512()) {
15295 // Word-to-byte truncation is only legal with BWI.
15296 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15297 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15298 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
15299 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15302 // Truncate with PACKSS if we are truncating a vector comparison result.
15303 // TODO: We should be able to support other operations as long as we
15304 // are saturating+packing zero/all bits only.
15305 auto IsPackableComparison = [](SDValue V) {
15306 unsigned Opcode = V.getOpcode();
15307 return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
15308 Opcode == X86ISD::CMPP);
15311 if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
15312 all_of(In->ops(), IsPackableComparison))) {
15313 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15317 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15318 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
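// In little-endian element order the low dword of each i64 sits at the even
// v8i32 positions {0, 2, 4, 6}, so gathering those dwords (selected as VPERMD)
// performs the truncation.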
15319 if (Subtarget.hasInt256()) {
15320 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15321 In = DAG.getBitcast(MVT::v8i32, In);
15322 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
15324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15325 DAG.getIntPtrConstant(0, DL));
15328 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15329 DAG.getIntPtrConstant(0, DL));
15330 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15331 DAG.getIntPtrConstant(2, DL));
15332 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15333 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15334 static const int ShufMask[] = {0, 2, 4, 6};
15335 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15338 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15339 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
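// The PSHUFB control built below picks, within each 128-bit lane, bytes
// {0,1, 4,5, 8,9, 12,13} (the low word of each dword) and uses 0x80 entries to
// zero the rest, leaving the four truncated words of each lane in its low 64 bits.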
15340 if (Subtarget.hasInt256()) {
15341 In = DAG.getBitcast(MVT::v32i8, In);
15343 SmallVector<SDValue,32> pshufbMask;
15344 for (unsigned i = 0; i < 2; ++i) {
15345 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
15346 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
15347 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
15348 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
15349 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
15350 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
15351 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
15352 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
15353 for (unsigned j = 0; j < 8; ++j)
15354 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
15356 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
15357 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
15358 In = DAG.getBitcast(MVT::v4i64, In);
15360 static const int ShufMask[] = {0, 2, -1, -1};
15361 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
15363 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15364 DAG.getIntPtrConstant(0, DL));
15365 return DAG.getBitcast(VT, In);
15368 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15369 DAG.getIntPtrConstant(0, DL));
15371 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15372 DAG.getIntPtrConstant(4, DL));
15374 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15375 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15377 // The PSHUFB mask:
15378 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15379 -1, -1, -1, -1, -1, -1, -1, -1};
15381 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
15382 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
15383 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
15385 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15386 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15388 // The MOVLHPS Mask:
15389 static const int ShufMask2[] = {0, 1, 4, 5};
15390 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15391 return DAG.getBitcast(MVT::v8i16, res);
15394 // Handle truncation of V256 to V128 using shuffles.
15395 if (!VT.is128BitVector() || !InVT.is256BitVector())
15398 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15400 unsigned NumElems = VT.getVectorNumElements();
15401 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15403 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15404 // Prepare truncation shuffle mask
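// E.g. for v8i32 -> v8i16 this views the input as v16i16 and takes the even
// elements {0, 2, ..., 14}, which in little-endian order hold the low half of
// each wider element; the EXTRACT_SUBVECTOR below then keeps the low 128 bits.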
15405 for (unsigned i = 0; i != NumElems; ++i)
15406 MaskVec[i] = i * 2;
15407 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
15408 DAG.getUNDEF(NVT), MaskVec);
15409 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15410 DAG.getIntPtrConstant(0, DL));
15413 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
15414 const X86Subtarget &Subtarget,
15415 SelectionDAG &DAG) const {
15416 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15418 MVT VT = Op.getSimpleValueType();
15420 if (VT.isVector()) {
15421 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15422 SDValue Src = Op.getOperand(0);
15424 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15425 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
15427 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15428 DAG.getUNDEF(MVT::v2f32)));
15434 assert(!VT.isVector());
15436 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15437 IsSigned, /*IsReplace=*/ false);
15438 SDValue FIST = Vals.first, StackSlot = Vals.second;
15439 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15440 if (!FIST.getNode())
15443 if (StackSlot.getNode())
15444 // Load the result.
15445 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15447 // The node is the result.
15451 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15453 MVT VT = Op.getSimpleValueType();
15454 SDValue In = Op.getOperand(0);
15455 MVT SVT = In.getSimpleValueType();
15457 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15459 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15460 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15461 In, DAG.getUNDEF(SVT)));
15464 /// The only differences between FABS and FNEG are the mask and the logic op.
15465 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
15466 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15467 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15468 "Wrong opcode for lowering FABS or FNEG.");
15470 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15472 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15473 // into an FNABS. We'll lower the FABS after that if it is still in use.
15475 for (SDNode *User : Op->uses())
15476 if (User->getOpcode() == ISD::FNEG)
15480 MVT VT = Op.getSimpleValueType();
15482 bool IsF128 = (VT == MVT::f128);
15484 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15485 // decide if we should generate a 16-byte constant mask when we only need 4 or
15486 // 8 bytes for the scalar case.
15491 if (VT.isVector()) {
15493 EltVT = VT.getVectorElementType();
15494 } else if (IsF128) {
15495 // SSE instructions are used for optimized f128 logical operations.
15496 LogicVT = MVT::f128;
15499 // There are no scalar bitwise logical SSE/AVX instructions, so we
15500 // generate a 16-byte vector constant and logic op even for the scalar case.
15501 // Using a 16-byte mask allows folding the load of the mask with
15502 // the logic op, so it can save (~4 bytes) on code size.
15503 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15507 unsigned EltBits = EltVT.getSizeInBits();
15508 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
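// E.g. for f32: FABS computes (x & 0x7fffffff), FNEG computes (x ^ 0x80000000),
// and the FNABS fold below uses (x | 0x80000000).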
15510 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
15511 const fltSemantics &Sem =
15512 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15513 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15514 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
15516 SDValue Op0 = Op.getOperand(0);
15517 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15519 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15520 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15522 if (VT.isVector() || IsF128)
15523 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15525 // For the scalar case extend to a 128-bit vector, perform the logic op,
15526 // and extract the scalar result back out.
15527 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
15528 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15529 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
15530 DAG.getIntPtrConstant(0, dl));
15533 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15534 SDValue Mag = Op.getOperand(0);
15535 SDValue Sign = Op.getOperand(1);
15538 // If the sign operand is smaller, extend it first.
15539 MVT VT = Op.getSimpleValueType();
15540 if (Sign.getSimpleValueType().bitsLT(VT))
15541 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
15543 // And if it is bigger, shrink it first.
15544 if (Sign.getSimpleValueType().bitsGT(VT))
15545 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
15547 // At this point the operands and the result should have the same
15548 // type, and that won't be f80 since that is not custom lowered.
15549 bool IsF128 = (VT == MVT::f128);
15550 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
15551 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
15552 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
15553 "Unexpected type in LowerFCOPYSIGN");
15555 MVT EltVT = VT.getScalarType();
15556 const fltSemantics &Sem =
15557 EltVT == MVT::f64 ? APFloat::IEEEdouble()
15558 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15560 // Perform all scalar logic operations as 16-byte vectors because there are no
15561 // scalar FP logic instructions in SSE.
15562 // TODO: This isn't necessary. If we used scalar types, we might avoid some
15563 // unnecessary splats, but we might miss load folding opportunities. Should
15564 // this decision be based on OptimizeForSize?
15565 bool IsFakeVector = !VT.isVector() && !IsF128;
15568 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15570 // The mask constants are automatically splatted for vector types.
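// i.e. the result is built as (Mag & ~SignMask) | (Sign & SignMask), where
// SignMask has only the sign bit of each element set.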
15571 unsigned EltSizeInBits = VT.getScalarSizeInBits();
15572 SDValue SignMask = DAG.getConstantFP(
15573 APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15574 SDValue MagMask = DAG.getConstantFP(
15575 APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15577 // First, clear all bits but the sign bit from the second operand (sign).
15579 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
15580 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
15582 // Next, clear the sign bit from the first operand (magnitude).
15583 // TODO: If we had general constant folding for FP logic ops, this check
15584 // wouldn't be necessary.
15586 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
15587 APFloat APF = Op0CN->getValueAPF();
15589 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
15591 // If the magnitude operand wasn't a constant, we need to AND out the sign.
15593 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
15594 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
15597 // OR the magnitude value with the sign bit.
15598 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
15599 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
15600 DAG.getIntPtrConstant(0, dl));
15603 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15604 SDValue N0 = Op.getOperand(0);
15606 MVT VT = Op.getSimpleValueType();
15608 MVT OpVT = N0.getSimpleValueType();
15609 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
15610 "Unexpected type for FGETSIGN");
15612 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
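// MOVMSK gathers the sign bit of each vector element into the low bits of a
// GPR; the scalar input's sign bit lands in bit 0, so masking with 1 extracts
// exactly the sign.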
15613 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
15614 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
15615 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
15616 Res = DAG.getZExtOrTrunc(Res, dl, VT);
15617 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
15621 // Check whether an OR'd tree is PTEST-able.
15622 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
15623 SelectionDAG &DAG) {
15624 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15626 if (!Subtarget.hasSSE41())
15629 if (!Op->hasOneUse())
15632 SDNode *N = Op.getNode();
15635 SmallVector<SDValue, 8> Opnds;
15636 DenseMap<SDValue, unsigned> VecInMap;
15637 SmallVector<SDValue, 8> VecIns;
15638 EVT VT = MVT::Other;
15640 // Recognize a special case where a vector is cast into a wide integer to test whether all of its bits are zero.
15642 Opnds.push_back(N->getOperand(0));
15643 Opnds.push_back(N->getOperand(1));
15645 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15646 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15647 // BFS traverse all OR'd operands.
15648 if (I->getOpcode() == ISD::OR) {
15649 Opnds.push_back(I->getOperand(0));
15650 Opnds.push_back(I->getOperand(1));
15651 // Re-evaluate the number of nodes to be traversed.
15652 e += 2; // 2 more nodes (LHS and RHS) are pushed.
15656 // Quit if this is not an EXTRACT_VECTOR_ELT.
15657 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15660 // Quit if the index is not a constant.
15661 SDValue Idx = I->getOperand(1);
15662 if (!isa<ConstantSDNode>(Idx))
15665 SDValue ExtractedFromVec = I->getOperand(0);
15666 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15667 if (M == VecInMap.end()) {
15668 VT = ExtractedFromVec.getValueType();
15669 // Quit if not 128/256-bit vector.
15670 if (!VT.is128BitVector() && !VT.is256BitVector())
15672 // Quit if not the same type.
15673 if (VecInMap.begin() != VecInMap.end() &&
15674 VT != VecInMap.begin()->first.getValueType())
15676 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15677 VecIns.push_back(ExtractedFromVec);
15679 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15682 assert((VT.is128BitVector() || VT.is256BitVector()) &&
15683 "Not extracted from 128-/256-bit vector.");
15685 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15687 for (DenseMap<SDValue, unsigned>::const_iterator
15688 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15689 // Quit if not all elements are used.
15690 if (I->second != FullMask)
15694 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15696 // Cast all vectors into TestVT for PTEST.
15697 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15698 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
15700 // If more than one full vector is evaluated, OR them together before PTEST.
15701 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15702 // Each iteration will OR 2 nodes and append the result until there is only
15703 // 1 node left, i.e. the final OR'd value of all vectors.
15704 SDValue LHS = VecIns[Slot];
15705 SDValue RHS = VecIns[Slot + 1];
15706 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15709 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15710 VecIns.back(), VecIns.back());
15713 /// \brief Return true if \c Op has a use that doesn't just read flags.
15714 static bool hasNonFlagsUse(SDValue Op) {
15715 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15717 SDNode *User = *UI;
15718 unsigned UOpNo = UI.getOperandNo();
15719 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15720 // Look past the truncate.
15721 UOpNo = User->use_begin().getOperandNo();
15722 User = *User->use_begin();
15725 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15726 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15732 // Emit KTEST instruction for bit vectors on AVX-512
15733 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
15734 const X86Subtarget &Subtarget) {
15735 if (Op.getOpcode() == ISD::BITCAST) {
15736 auto hasKTEST = [&](MVT VT) {
15737 unsigned SizeInBits = VT.getSizeInBits();
15738 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
15739 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
15741 SDValue Op0 = Op.getOperand(0);
15742 MVT Op0VT = Op0.getValueType().getSimpleVT();
15743 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
15745 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
15750 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
15752 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
15753 SelectionDAG &DAG) const {
15754 if (Op.getValueType() == MVT::i1) {
15755 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15756 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15757 DAG.getConstant(0, dl, MVT::i8));
15759 // CF and OF aren't always set the way we want. Determine which
15760 // of these we need.
15761 bool NeedCF = false;
15762 bool NeedOF = false;
15765 case X86::COND_A: case X86::COND_AE:
15766 case X86::COND_B: case X86::COND_BE:
15769 case X86::COND_G: case X86::COND_GE:
15770 case X86::COND_L: case X86::COND_LE:
15771 case X86::COND_O: case X86::COND_NO: {
15772 // Check if we really need to set the
15773 // Overflow flag. If NoSignedWrap is present,
15774 // that is not actually needed.
15775 switch (Op->getOpcode()) {
15780 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
15781 if (BinNode->Flags.hasNoSignedWrap())
15791 // See if we can use the EFLAGS value from the operand instead of
15792 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15793 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15794 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15795 // Emit KTEST for bit vectors
15796 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15798 // Emit a CMP with 0, which is the TEST pattern.
15799 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15800 DAG.getConstant(0, dl, Op.getValueType()));
15802 unsigned Opcode = 0;
15803 unsigned NumOperands = 0;
15805 // Truncate operations may prevent the merge of the SETCC instruction
15806 // and the arithmetic instruction before it. Attempt to truncate the operands
15807 // of the arithmetic instruction and use a reduced bit-width instruction.
15808 bool NeedTruncation = false;
15809 SDValue ArithOp = Op;
15810 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15811 SDValue Arith = Op->getOperand(0);
15812 // Both the trunc and the arithmetic op need to have one user each.
15813 if (Arith->hasOneUse())
15814 switch (Arith.getOpcode()) {
15821 NeedTruncation = true;
15827 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation,
15828 // which may be the result of a CAST. We use the variable 'Op', which is the
15829 // non-cast value, when we check for possible users.
15830 switch (ArithOp.getOpcode()) {
15832 // Due to an isel shortcoming, be conservative if this add is likely to be
15833 // selected as part of a load-modify-store instruction. When the root node
15834 // in a match is a store, isel doesn't know how to remap non-chain non-flag
15835 // uses of other nodes in the match, such as the ADD in this case. This
15836 // leads to the ADD being left around and reselected, with the result being
15837 // two adds in the output. Alas, even if none of our users are stores, that
15838 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
15839 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
15840 // climbing the DAG back to the root, and it doesn't seem to be worth the effort.
15842 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15843 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15844 if (UI->getOpcode() != ISD::CopyToReg &&
15845 UI->getOpcode() != ISD::SETCC &&
15846 UI->getOpcode() != ISD::STORE)
15849 if (ConstantSDNode *C =
15850 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
15851 // An add of one will be selected as an INC.
15852 if (C->isOne() && !Subtarget.slowIncDec()) {
15853 Opcode = X86ISD::INC;
15858 // An add of negative one (subtract of one) will be selected as a DEC.
15859 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
15860 Opcode = X86ISD::DEC;
15866 // Otherwise use a regular EFLAGS-setting add.
15867 Opcode = X86ISD::ADD;
15872 // If we have a constant logical shift that's only used in a comparison
15873 // against zero, turn it into an equivalent AND. This allows turning it into
15874 // a TEST instruction later.
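// E.g. for an i32 value, (srl X, 8) == 0 is equivalent to
// (X & 0xFFFFFF00) == 0, and (shl X, 8) == 0 is equivalent to
// (X & 0x00FFFFFF) == 0; the AND form can then be selected as TEST.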
15875 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15876 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15877 EVT VT = Op.getValueType();
15878 unsigned BitWidth = VT.getSizeInBits();
15879 unsigned ShAmt = Op->getConstantOperandVal(1);
15880 if (ShAmt >= BitWidth) // Avoid undefined shifts.
15882 APInt Mask = ArithOp.getOpcode() == ISD::SRL
15883 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15884 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15885 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15887 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15888 DAG.getConstant(Mask, dl, VT));
15893 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
15894 // because a TEST instruction will be better.
15895 if (!hasNonFlagsUse(Op)) {
15896 SDValue Op0 = ArithOp->getOperand(0);
15897 SDValue Op1 = ArithOp->getOperand(1);
15898 EVT VT = ArithOp.getValueType();
15899 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
15900 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
15902 // But if we can combine this into an ANDN operation, then create an AND
15903 // now and allow it to be pattern matched into an ANDN.
15904 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
15911 // Due to the ISEL shortcoming noted above, be conservative if this op is
15912 // likely to be selected as part of a load-modify-store instruction.
15913 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15914 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15915 if (UI->getOpcode() == ISD::STORE)
15918 // Otherwise use a regular EFLAGS-setting instruction.
15919 switch (ArithOp.getOpcode()) {
15920 default: llvm_unreachable("unexpected operator!");
15921 case ISD::SUB: Opcode = X86ISD::SUB; break;
15922 case ISD::XOR: Opcode = X86ISD::XOR; break;
15923 case ISD::AND: Opcode = X86ISD::AND; break;
15925 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15926 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
15929 Opcode = X86ISD::OR;
15943 return SDValue(Op.getNode(), 1);
15949 // If we found that truncation is beneficial, perform the truncation and update the comparison.
15951 if (NeedTruncation) {
15952 EVT VT = Op.getValueType();
15953 SDValue WideVal = Op->getOperand(0);
15954 EVT WideVT = WideVal.getValueType();
15955 unsigned ConvertedOp = 0;
15956 // Use a target machine opcode to prevent further DAGCombine
15957 // optimizations that may separate the arithmetic operations
15958 // from the setcc node.
15959 switch (WideVal.getOpcode()) {
15961 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15962 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15963 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15964 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15965 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
15969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15970 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15971 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15972 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15973 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15979 // Emit KTEST for bit vectors
15980 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15983 // Emit a CMP with 0, which is the TEST pattern.
15984 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15985 DAG.getConstant(0, dl, Op.getValueType()));
15987 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15988 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
15990 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15991 DAG.ReplaceAllUsesWith(Op, New);
15992 return SDValue(New.getNode(), 1);
15995 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
15997 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15998 const SDLoc &dl, SelectionDAG &DAG) const {
15999 if (isNullConstant(Op1))
16000 return EmitTest(Op0, X86CC, dl, DAG);
16002 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16003 "Unexpected comparison operation for MVT::i1 operands");
16005 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16006 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16007 // Only promote the compare up to i32 if it is a 16-bit operation
16008 // with an immediate. 16-bit immediates are to be avoided.
16009 if ((Op0.getValueType() == MVT::i16 &&
16010 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16011 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16012 !Subtarget.isAtom()) {
16013 unsigned ExtendOp =
16014 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16015 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16016 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16018 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16019 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16020 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16022 return SDValue(Sub.getNode(), 1);
16024 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16027 /// Convert a comparison if required by the subtarget.
16028 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16029 SelectionDAG &DAG) const {
16030 // If the subtarget does not support the FUCOMI instruction, floating-point
16031 // comparisons have to be converted.
16032 if (Subtarget.hasCMov() ||
16033 Cmp.getOpcode() != X86ISD::CMP ||
16034 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16035 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16038 // The instruction selector will select an FUCOM instruction instead of
16039 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16040 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16041 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
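// FNSTSW stores the FPU status word, whose condition bits are C0 (bit 8),
// C2 (bit 10) and C3 (bit 14); after the shift right by 8, SAHF places them
// into CF, PF and ZF respectively, matching what FUCOMI would have produced.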
16043 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16044 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16045 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16046 DAG.getConstant(8, dl, MVT::i8));
16047 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16049 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16050 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16051 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16054 /// Check if replacement of SQRT with RSQRT should be disabled.
16055 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16056 EVT VT = Op.getValueType();
16058 // We never want to use both SQRT and RSQRT instructions for the same input.
16059 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16063 return Subtarget.hasFastVectorFSQRT();
16064 return Subtarget.hasFastScalarFSQRT();
16067 /// The minimum architected relative accuracy is 2^-12. We need one
16068 /// Newton-Raphson step to have a good float result (24 bits of precision).
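/// One such step for Est ~= 1/sqrt(Op) is: Est' = Est * (1.5 - 0.5 * Op * Est * Est).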
16069 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16070 SelectionDAG &DAG, int Enabled,
16071 int &RefinementSteps,
16072 bool &UseOneConstNR,
16073 bool Reciprocal) const {
16074 EVT VT = Op.getValueType();
16076 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16077 // TODO: Add support for AVX512 (v16f32).
16078 // It is likely not profitable to do this for f64 because a double-precision
16079 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16080 // instructions: convert to single, rsqrtss, convert back to double, refine
16081 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16082 // along with FMA, this could be a throughput win.
16083 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16084 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16085 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16086 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16087 RefinementSteps = 1;
16089 UseOneConstNR = false;
16090 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16095 /// The minimum architected relative accuracy is 2^-12. We need one
16096 /// Newton-Raphson step to have a good float result (24 bits of precision).
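/// One such step for Est ~= 1/Op is: Est' = Est * (2.0 - Op * Est).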
16097 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16099 int &RefinementSteps) const {
16100 EVT VT = Op.getValueType();
16102 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16103 // TODO: Add support for AVX512 (v16f32).
16104 // It is likely not profitable to do this for f64 because a double-precision
16105 // reciprocal estimate with refinement on x86 prior to FMA requires
16106 // 15 instructions: convert to single, rcpss, convert back to double, refine
16107 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16108 // along with FMA, this could be a throughput win.
16110 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16111 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16112 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16113 // Enable estimate codegen with 1 refinement step for vector division.
16114 // Scalar division estimates are disabled because they break too much
16115 // real-world code. These defaults are intended to match GCC behavior.
16116 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16119 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16120 RefinementSteps = 1;
16122 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16127 /// If we have at least two divisions that use the same divisor, convert to
16128 /// multiplication by a reciprocal. This may need to be adjusted for a given
16129 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16130 /// This is because we still need one division to calculate the reciprocal and
16131 /// then we need two multiplies by that reciprocal as replacements for the
16132 /// original divisions.
16133 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16137 /// Helper for creating an X86ISD::SETCC node.
16138 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16139 SelectionDAG &DAG) {
16140 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16141 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16144 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16145 /// according to equal/not-equal condition code \p CC.
16146 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16147 const SDLoc &dl, SelectionDAG &DAG) {
16148 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16149 // instruction. Since the shift amount is in-range-or-undefined, we know
16150 // that doing a bittest on the i32 value is ok. We extend to i32 because
16151 // the encoding for the i16 version is larger than the i32 version.
16152 // Also promote i16 to i32 for performance / code size reasons.
16153 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16154 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16156 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16157 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16158 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16159 // known to be zero.
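// E.g. with BitNo == 37, the 64-bit form tests bit 37 while the 32-bit form
// would test bit 37 % 32 == 5, so the narrowing is only safe when bit 5 of
// BitNo is zero.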
16160 if (Src.getValueType() == MVT::i64 &&
16161 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16162 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16164 // If the operand types disagree, extend the shift amount to match. Since
16165 // BT ignores high bits (like shifts) we can use anyextend.
16166 if (Src.getValueType() != BitNo.getValueType())
16167 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16169 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16170 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16171 return getSETCC(Cond, BT, dl , DAG);
16174 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16175 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16176 const SDLoc &dl, SelectionDAG &DAG) {
16177 SDValue Op0 = And.getOperand(0);
16178 SDValue Op1 = And.getOperand(1);
16179 if (Op0.getOpcode() == ISD::TRUNCATE)
16180 Op0 = Op0.getOperand(0);
16181 if (Op1.getOpcode() == ISD::TRUNCATE)
16182 Op1 = Op1.getOperand(0);
16185 if (Op1.getOpcode() == ISD::SHL)
16186 std::swap(Op0, Op1);
16187 if (Op0.getOpcode() == ISD::SHL) {
16188 if (isOneConstant(Op0.getOperand(0))) {
16189 // If we looked past a truncate, check that it's only truncating away known zero bits.
16191 unsigned BitWidth = Op0.getValueSizeInBits();
16192 unsigned AndBitWidth = And.getValueSizeInBits();
16193 if (BitWidth > AndBitWidth) {
16195 DAG.computeKnownBits(Op0, Zeros, Ones);
16196 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16200 RHS = Op0.getOperand(1);
16202 } else if (Op1.getOpcode() == ISD::Constant) {
16203 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16204 uint64_t AndRHSVal = AndRHS->getZExtValue();
16205 SDValue AndLHS = Op0;
16207 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16208 LHS = AndLHS.getOperand(0);
16209 RHS = AndLHS.getOperand(1);
16212 // Use BT if the immediate can't be encoded in a TEST instruction.
16213 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16215 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16220 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16225 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16226 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16227 const SDLoc &dl, SelectionDAG &DAG) {
16229 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16230 "Expected TRUNCATE to i1 node");
16232 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16235 SDValue ShiftRight = Op.getOperand(0);
16236 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16240 /// Result of 'and' or 'trunc to i1' is compared against zero.
16241 /// Change to a BT node if possible.
16242 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16243 const SDLoc &dl, SelectionDAG &DAG) const {
16244 if (Op.getOpcode() == ISD::AND)
16245 return LowerAndToBT(Op, CC, dl, DAG);
16246 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16247 return LowerTruncateToBT(Op, CC, dl, DAG);
16251 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask CMPs.
16253 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16258 // SSE Condition code mapping:
//  0 - EQ,  1 - LT,  2 - LE,  3 - UNORD,
//  4 - NEQ, 5 - NLT, 6 - NLE, 7 - ORD
16267 switch (SetCCOpcode) {
16268 default: llvm_unreachable("Unexpected SETCC condition");
16270 case ISD::SETEQ: SSECC = 0; break;
16272 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16274 case ISD::SETOLT: SSECC = 1; break;
16276 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16278 case ISD::SETOLE: SSECC = 2; break;
16279 case ISD::SETUO: SSECC = 3; break;
16281 case ISD::SETNE: SSECC = 4; break;
16282 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16283 case ISD::SETUGE: SSECC = 5; break;
16284 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16285 case ISD::SETUGT: SSECC = 6; break;
16286 case ISD::SETO: SSECC = 7; break;
16288 case ISD::SETONE: SSECC = 8; break;
16291 std::swap(Op0, Op1);
16296 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16297 /// concatenate the result back.
16298 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16299 MVT VT = Op.getSimpleValueType();
16301 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16302 "Unsupported value type for operation");
16304 unsigned NumElems = VT.getVectorNumElements();
16306 SDValue CC = Op.getOperand(2);
16308 // Extract the LHS vectors
16309 SDValue LHS = Op.getOperand(0);
16310 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16311 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16313 // Extract the RHS vectors
16314 SDValue RHS = Op.getOperand(1);
16315 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16316 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16318 // Issue the operation on the smaller types and concatenate the result back
16319 MVT EltVT = VT.getVectorElementType();
16320 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16321 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16322 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16323 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16326 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16327 SDValue Op0 = Op.getOperand(0);
16328 SDValue Op1 = Op.getOperand(1);
16329 SDValue CC = Op.getOperand(2);
16330 MVT VT = Op.getSimpleValueType();
16333 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16334 "Unexpected type for boolean compare operation");
16335 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16336 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16337 DAG.getConstant(-1, dl, VT));
16338 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16339 DAG.getConstant(-1, dl, VT));
16340 switch (SetCCOpcode) {
16341 default: llvm_unreachable("Unexpected SETCC condition");
16343 // (x == y) -> ~(x ^ y)
16344 return DAG.getNode(ISD::XOR, dl, VT,
16345 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16346 DAG.getConstant(-1, dl, VT));
16348 // (x != y) -> (x ^ y)
16349 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16352 // (x > y) -> (x & ~y)
16353 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16356 // (x < y) -> (~x & y)
16357 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16360 // (x <= y) -> (~x | y)
16361 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16364 // (x >= y) -> (x | ~y)
16365 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16369 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16371 SDValue Op0 = Op.getOperand(0);
16372 SDValue Op1 = Op.getOperand(1);
16373 SDValue CC = Op.getOperand(2);
16374 MVT VT = Op.getSimpleValueType();
16377 assert(VT.getVectorElementType() == MVT::i1 &&
16378 "Cannot set masked compare for this operation");
16380 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16382 bool Unsigned = false;
16385 switch (SetCCOpcode) {
16386 default: llvm_unreachable("Unexpected SETCC condition");
16387 case ISD::SETNE: SSECC = 4; break;
16388 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16389 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16390 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16391 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16392 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16393 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16394 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16395 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16396 case ISD::SETLE: SSECC = 2; break;
16400 std::swap(Op0, Op1);
16402 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16403 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16404 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16405 DAG.getConstant(SSECC, dl, MVT::i8));
16408 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16409 /// operand \p Op1. If non-trivial (for example because it's not constant)
16410 /// return an empty value.
16411 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16412 SelectionDAG &DAG) {
16413 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16417 MVT VT = Op1.getSimpleValueType();
16418 MVT EVT = VT.getVectorElementType();
16419 unsigned n = VT.getVectorNumElements();
16420 SmallVector<SDValue, 8> ULTOp1;
16422 for (unsigned i = 0; i < n; ++i) {
16423 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16424 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16427 // Avoid underflow.
16428 APInt Val = Elt->getAPIntValue();
16432 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16435 return DAG.getBuildVector(VT, dl, ULTOp1);
16438 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16439 SelectionDAG &DAG) {
16440 SDValue Op0 = Op.getOperand(0);
16441 SDValue Op1 = Op.getOperand(1);
16442 SDValue CC = Op.getOperand(2);
16443 MVT VT = Op.getSimpleValueType();
16444 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16445 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
16450 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16451 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16455 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16456 assert(VT.getVectorNumElements() <= 16);
16457 Opc = X86ISD::CMPM;
16459 Opc = X86ISD::CMPP;
16460 // The SSE/AVX packed FP comparison nodes are defined with a
16461 // floating-point vector result that matches the operand type. This allows
16462 // them to work with an SSE1 target (integer vector types are not legal).
16463 VT = Op0.getSimpleValueType();
16466 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16467 // emit two comparisons and a logic op to tie them together.
16468 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
16471 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16473 // LLVM predicate is SETUEQ or SETONE.
16475 unsigned CombineOpc;
16476 if (SetCCOpcode == ISD::SETUEQ) {
16479 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
16480 static_cast<unsigned>(ISD::OR);
16482 assert(SetCCOpcode == ISD::SETONE);
16485 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
16486 static_cast<unsigned>(ISD::AND);
16489 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16490 DAG.getConstant(CC0, dl, MVT::i8));
16491 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16492 DAG.getConstant(CC1, dl, MVT::i8));
16493 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
16495 // Handle all other FP comparisons here.
16496 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
16497 DAG.getConstant(SSECC, dl, MVT::i8));
16500 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
16501 // result type of SETCC. The bitcast is expected to be optimized away
16502 // during combining/isel.
16503 if (Opc == X86ISD::CMPP)
16504 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
16509 MVT VTOp0 = Op0.getSimpleValueType();
16510 assert(VTOp0 == Op1.getSimpleValueType() &&
16511 "Expected operands with same type!");
16512 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
16513 "Invalid number of packed elements for source and destination!");
16515 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
16516 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
16517 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
16518 // legalizer first checks whether the first input operand of the setcc has
16519 // a legal type. If so, it promotes the return type to that same type.
16520 // Otherwise, the return type is promoted to the 'next legal type' which,
16521 // for a vector of MVT::i1 is always a 128-bit integer vector type.
16523 // We reach this code only if the following two conditions are met:
16524 // 1. Both return type and operand type have been promoted to wider types
16525 // by the type legalizer.
16526 // 2. The original operand type has been promoted to a 256-bit vector.
16528 // Note that condition 2. only applies for AVX targets.
16529 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
16530 return DAG.getZExtOrTrunc(NewOp, dl, VT);
16533 // The non-AVX512 code below works under the assumption that source and
16534 // destination types are the same.
16535 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
16536 "Value types for source and destination must be the same!");
16538 // Break 256-bit integer vector compare into smaller ones.
16539 if (VT.is256BitVector() && !Subtarget.hasInt256())
16540 return Lower256IntVSETCC(Op, DAG);
16542 // Operands are boolean (vectors of i1)
16543 MVT OpVT = Op1.getSimpleValueType();
16544 if (OpVT.getVectorElementType() == MVT::i1)
16545 return LowerBoolVSETCC_AVX512(Op, DAG);
16547 // The result is boolean, but operands are int/float
16548 if (VT.getVectorElementType() == MVT::i1) {
16549 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
16550 // but there is no compare instruction for i8 and i16 elements on KNL.
16551 // In that case, use the SSE compare instead.
16552 bool UseAVX512Inst =
16553 (OpVT.is512BitVector() ||
16554 OpVT.getScalarSizeInBits() >= 32 ||
16555 (Subtarget.hasBWI() && Subtarget.hasVLX()));
16558 return LowerIntVSETCC_AVX512(Op, DAG);
16560 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16561 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
16564 // Lower using XOP integer comparisons.
16565 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
16566 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
16567 // Translate compare code to XOP PCOM compare mode.
16568 unsigned CmpMode = 0;
16569 switch (SetCCOpcode) {
16570 default: llvm_unreachable("Unexpected SETCC condition");
16572 case ISD::SETLT: CmpMode = 0x00; break;
16574 case ISD::SETLE: CmpMode = 0x01; break;
16576 case ISD::SETGT: CmpMode = 0x02; break;
16578 case ISD::SETGE: CmpMode = 0x03; break;
16579 case ISD::SETEQ: CmpMode = 0x04; break;
16580 case ISD::SETNE: CmpMode = 0x05; break;
16583 // Are we comparing unsigned or signed integers?
16584 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
16585 ? X86ISD::VPCOMU : X86ISD::VPCOM;
16587 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16588 DAG.getConstant(CmpMode, dl, MVT::i8));
16591 // We are handling one of the integer comparisons here. Since SSE only has
16592 // GT and EQ comparisons for integers, swapping operands and multiple
16593 // operations may be required for some comparisons.
16595 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
16596 bool Subus = false;
16598 switch (SetCCOpcode) {
16599 default: llvm_unreachable("Unexpected SETCC condition");
16600 case ISD::SETNE: Invert = true;
16601 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
16602 case ISD::SETLT: Swap = true;
16603 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
16604 case ISD::SETGE: Swap = true;
16605 case ISD::SETLE: Opc = X86ISD::PCMPGT;
16606 Invert = true; break;
16607 case ISD::SETULT: Swap = true;
16608 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
16609 FlipSigns = true; break;
16610 case ISD::SETUGE: Swap = true;
16611 case ISD::SETULE: Opc = X86ISD::PCMPGT;
16612 FlipSigns = true; Invert = true; break;
16615 // Special case: Use min/max operations for SETULE/SETUGE
16616 MVT VET = VT.getVectorElementType();
16618 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
16619 || (Subtarget.hasSSE2() && (VET == MVT::i8));
16622 switch (SetCCOpcode) {
16624 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
16625 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
16628 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
16631 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
16632 if (!MinMax && hasSubus) {
16633 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for Op0 u<= Op1:
16635 // t = psubus Op0, Op1
16636 // pcmpeq t, <0..0>
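// PSUBUS saturates towards zero, so the subtraction result is all zeros
// exactly when Op0 <=u Op1 element-wise; comparing it against zero therefore
// yields the SETULE mask directly.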
16637 switch (SetCCOpcode) {
16639 case ISD::SETULT: {
16640 // If the comparison is against a constant we can turn this into a
16641 // setule. With psubus, setule does not require a swap. This is
16642 // beneficial because the constant in the register is no longer
16643 // clobbered as the destination, so it can be hoisted out of a loop.
16644 // Only do this pre-AVX since vpcmp* is no longer destructive.
16645 if (Subtarget.hasAVX())
16647 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
16649 Subus = true; Invert = false; Swap = false;
16653 // Psubus is better than flip-sign because it requires no inversion.
16654 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
16655 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
16659 Opc = X86ISD::SUBUS;
16665 std::swap(Op0, Op1);
16667 // Check that the operation in question is available (most are plain SSE2,
16668 // but PCMPGTQ and PCMPEQQ have different requirements).
16669 if (VT == MVT::v2i64) {
16670 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
16671 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
16673 // First cast everything to the right type.
16674 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16675 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16677 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16678 // bits of the inputs before performing those operations. The lower
16679 // compare is always unsigned.
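// The low dwords carry magnitude only, so they must be compared as unsigned
// regardless of the signedness of the 64-bit compare; XORing in the sign bit
// beforehand turns the signed PCMPGTD into an unsigned comparison.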
16682 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
16684 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
16685 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
16686 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
16688 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
16689 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
16691 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
16692 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
16693 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
16695 // Create masks for only the low parts/high parts of the 64-bit integers.
16696 static const int MaskHi[] = { 1, 1, 3, 3 };
16697 static const int MaskLo[] = { 0, 0, 2, 2 };
16698 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
16699 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
16700 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
16702 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
16703 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
16706 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16708 return DAG.getBitcast(VT, Result);
16711 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
16712 // If pcmpeqq is missing but pcmpeqd is available, synthesize pcmpeqq with
16713 // pcmpeqd + pshufd + pand.
16714 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
16716 // First cast everything to the right type.
16717 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16718 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16721 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
16723 // Make sure the lower and upper halves are both all-ones.
16724 static const int Mask[] = { 1, 0, 3, 2 };
16725 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
16726 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
16729 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16731 return DAG.getBitcast(VT, Result);
16735 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16736 // bits of the inputs before performing those operations.
16738 MVT EltVT = VT.getVectorElementType();
16739 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
16741 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
16742 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
16745 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
16747 // If the logical-not of the result is required, perform that now.
16749 Result = DAG.getNOT(dl, Result, VT);
16752 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
16755 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
16756 getZeroVector(VT, Subtarget, DAG, dl));
16761 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16763 MVT VT = Op.getSimpleValueType();
16765 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
16767 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
16768 && "SetCC type must be 8-bit or 1-bit integer");
16769 SDValue Op0 = Op.getOperand(0);
16770 SDValue Op1 = Op.getOperand(1);
16772 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16774 // Optimize to BT if possible.
16775 // Lower (X & (1 << N)) == 0 to BT(X, N).
16776 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
16777 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
16778 // Lower (trunc (X >> N) to i1) to BT(X, N).
16779 if (Op0.hasOneUse() && isNullConstant(Op1) &&
16780 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16781 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
16783 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
16788 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of these.
16790 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
16791 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16793 // If the input is a setcc, then reuse the input setcc or use a new one with
16794 // the inverted condition.
16795 if (Op0.getOpcode() == X86ISD::SETCC) {
16796 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
16797 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
16801 CCode = X86::GetOppositeBranchCondition(CCode);
16802 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
16804 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16808 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16809 if (isOneConstant(Op1)) {
16810 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
16811 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
16813 if (!isNullConstant(Op1)) {
16814 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
16815 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
16819 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
16820 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
16821 if (X86CC == X86::COND_INVALID)
16824 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
16825 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
16826 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
16828 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
16832 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
16833 SDValue LHS = Op.getOperand(0);
16834 SDValue RHS = Op.getOperand(1);
16835 SDValue Carry = Op.getOperand(2);
16836 SDValue Cond = Op.getOperand(3);
16839 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
16840 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
16842 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
16843 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16844 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
16845 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
16846 if (Op.getSimpleValueType() == MVT::i1)
16847 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
16851 /// Return true if opcode is an X86 logical comparison.
16852 static bool isX86LogicalCmp(SDValue Op) {
16853 unsigned Opc = Op.getOpcode();
16854 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
16855 Opc == X86ISD::SAHF)
16857 if (Op.getResNo() == 1 &&
16858 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
16859 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
16860 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
16861 Opc == X86ISD::XOR || Opc == X86ISD::AND))
16864 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
16870 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
16871 if (V.getOpcode() != ISD::TRUNCATE)
16874 SDValue VOp0 = V.getOperand(0);
16875 unsigned InBits = VOp0.getValueSizeInBits();
16876 unsigned Bits = V.getValueSizeInBits();
16877 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
16880 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
16881 bool AddTest = true;
16882 SDValue Cond = Op.getOperand(0);
16883 SDValue Op1 = Op.getOperand(1);
16884 SDValue Op2 = Op.getOperand(2);
16886 MVT VT = Op1.getSimpleValueType();
16889 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
16890 // are available or VBLENDV if AVX is available.
16891 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
16892 if (Cond.getOpcode() == ISD::SETCC &&
16893 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
16894 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
16895 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
16896 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
16897 int SSECC = translateX86FSETCC(
16898 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
16901 if (Subtarget.hasAVX512()) {
16902 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
16903 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
16904 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
16905 DL, VT, Cmp, Op1, Op2);
16908 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16909 DAG.getConstant(SSECC, DL, MVT::i8));
16911 // If we have AVX, we can use a variable vector select (VBLENDV) instead
16912 // of 3 logic instructions for size savings and potentially speed.
16913 // Unfortunately, there is no scalar form of VBLENDV.
16915 // If either operand is a constant, don't try this. We can expect to
16916 // optimize away at least one of the logic instructions later in that
16917 // case, so that sequence would be faster than a variable blend.
16919 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
16920 // uses XMM0 as the selection register. That may need just as many
16921 // instructions as the AND/ANDN/OR sequence due to register moves, so don't bother.
16924 if (Subtarget.hasAVX() &&
16925 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
16927 // Convert to vectors, do a VSELECT, and convert back to scalar.
16928 // All of the conversions should be optimized away.
16930 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
16931 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
16932 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
16933 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
16935 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
16936 VCmp = DAG.getBitcast(VCmpVT, VCmp);
16938 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
16940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
16941 VSel, DAG.getIntPtrConstant(0, DL));
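// Without AVX, materialize the select with scalar FP logic:
// Res = (Cmp & Op1) | (~Cmp & Op2), using FAND/FANDN/FOR.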
16943 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16944 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16945 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16949 // AVX512 fallback is to lower selects of scalar floats to masked moves.
16950 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
16951 Subtarget.hasAVX512())
16952 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
16954 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
16956 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
16957 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
16958 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
16959 Op1Scalar = Op1.getOperand(0);
16961 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
16962 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
16963 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
16964 Op2Scalar = Op2.getOperand(0);
16965 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
16966 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
16967 Op1Scalar.getValueType(),
16968 Cond, Op1Scalar, Op2Scalar);
16969 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
16970 return DAG.getBitcast(VT, newSelect);
16971 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
16972 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
16973 DAG.getIntPtrConstant(0, DL));
16977 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
16978 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
16979 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16980 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
16981 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16982 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
16983 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
16985 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
16988 if (Cond.getOpcode() == ISD::SETCC)
16989 if (SDValue NewCond = LowerSETCC(Cond, DAG))
16992 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16993 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16994 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16995 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
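// For example, CMP(x, 1) below sets the carry flag exactly when x == 0
// (unsigned x < 1), so SETCC_CARRY(COND_B) materializes all-ones for x == 0
// and zero otherwise; an optional NOT and an OR with y finish the select.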
16996 if (Cond.getOpcode() == X86ISD::SETCC &&
16997 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16998 isNullConstant(Cond.getOperand(1).getOperand(1))) {
16999 SDValue Cmp = Cond.getOperand(1);
17001 unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17003 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17004 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17005 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17007 SDValue CmpOp0 = Cmp.getOperand(0);
17008 // Apply further optimizations for special cases
17009 // (select (x != 0), -1, 0) -> neg & sbb
17010 // (select (x == 0), 0, -1) -> neg & sbb
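// NEG (i.e. 0 - x) sets the carry flag exactly when x != 0, and SETCC_CARRY
// (an SBB of a register with itself) broadcasts that carry to every bit,
// giving the 0/-1 result in two instructions with no branch.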
17011 if (isNullConstant(Y) &&
17012 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17013 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17014 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17015 DAG.getConstant(0, DL,
17016 CmpOp0.getValueType()),
17018 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17019 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17020 SDValue(Neg.getNode(), 1));
17024 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17025 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17026 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17028 SDValue Res = // Res = 0 or -1.
17029 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17030 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17032 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17033 Res = DAG.getNOT(DL, Res, Res.getValueType());
17035 if (!isNullConstant(Op2))
17036 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17041 // Look past (and (setcc_carry (cmp ...)), 1).
17042 if (Cond.getOpcode() == ISD::AND &&
17043 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17044 isOneConstant(Cond.getOperand(1)))
17045 Cond = Cond.getOperand(0);
17047 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17048 // setting operand in place of the X86ISD::SETCC.
17049 unsigned CondOpcode = Cond.getOpcode();
17050 if (CondOpcode == X86ISD::SETCC ||
17051 CondOpcode == X86ISD::SETCC_CARRY) {
17052 CC = Cond.getOperand(0);
17054 SDValue Cmp = Cond.getOperand(1);
17055 unsigned Opc = Cmp.getOpcode();
17056 MVT VT = Op.getSimpleValueType();
17058 bool IllegalFPCMov = false;
17059 if (VT.isFloatingPoint() && !VT.isVector() &&
17060 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17061 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17063 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17064 Opc == X86ISD::BT) { // FIXME
17068 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17069 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17070 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17071 Cond.getOperand(0).getValueType() != MVT::i8)) {
17072 SDValue LHS = Cond.getOperand(0);
17073 SDValue RHS = Cond.getOperand(1);
17074 unsigned X86Opcode;
17077 switch (CondOpcode) {
17078 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17079 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17080 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17081 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17082 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17083 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17084 default: llvm_unreachable("unexpected overflowing operator");
17086 if (CondOpcode == ISD::UMULO)
17087 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17090 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17092 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17094 if (CondOpcode == ISD::UMULO)
17095 Cond = X86Op.getValue(2);
17097 Cond = X86Op.getValue(1);
17099 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17104 // Look past the truncate if the high bits are known zero.
17105 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17106 Cond = Cond.getOperand(0);
17108 // We know the result of AND is compared against zero. Try to match it to BT.
17110 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17111 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17112 CC = NewSetCC.getOperand(0);
17113 Cond = NewSetCC.getOperand(1);
17120 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17121 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17124 // a < b ? -1 : 0 -> RES = ~setcc_carry
17125 // a < b ? 0 : -1 -> RES = setcc_carry
17126 // a >= b ? -1 : 0 -> RES = setcc_carry
17127 // a >= b ? 0 : -1 -> RES = ~setcc_carry
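// The SUB feeding the SETCC sets the carry flag exactly when a < b
// (unsigned), so a single SETCC_CARRY, possibly followed by a NOT,
// materializes the 0/-1 result without a branch or cmov.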
17128 if (Cond.getOpcode() == X86ISD::SUB) {
17129 Cond = ConvertCmpIfNecessary(Cond, DAG);
17130 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17132 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17133 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17134 (isNullConstant(Op1) || isNullConstant(Op2))) {
17135 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17136 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17138 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17139 return DAG.getNOT(DL, Res, Res.getValueType());
17144 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate,
17145 // widen the cmov and push the truncate through. This avoids introducing a new
17146 // branch during isel and doesn't add any extensions.
17147 if (Op.getValueType() == MVT::i8 &&
17148 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17149 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17150 if (T1.getValueType() == T2.getValueType() &&
17151 // Blacklist CopyFromReg to avoid partial register stalls.
17152 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode() != ISD::CopyFromReg) {
17153 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17154 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17155 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17159 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if the
17160 // condition is true.
17161 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17162 SDValue Ops[] = { Op2, Op1, CC, Cond };
17163 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17166 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17167 const X86Subtarget &Subtarget,
17168 SelectionDAG &DAG) {
17169 MVT VT = Op->getSimpleValueType(0);
17170 SDValue In = Op->getOperand(0);
17171 MVT InVT = In.getSimpleValueType();
17172 MVT VTElt = VT.getVectorElementType();
17173 MVT InVTElt = InVT.getVectorElementType();
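// When the source is a vXi1 mask and the target has the matching
// mask-to-vector sign extension (BWI for 8/16-bit elements, DQI for
// 32/64-bit elements, plus VLX for sub-512-bit results), one VSEXT suffices.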
17177 if ((InVTElt == MVT::i1) &&
17178 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
17179 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
17181 ((Subtarget.hasBWI() && VT.is512BitVector() &&
17182 VTElt.getSizeInBits() <= 16)) ||
17184 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
17185 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
17187 ((Subtarget.hasDQI() && VT.is512BitVector() &&
17188 VTElt.getSizeInBits() >= 32))))
17189 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17191 unsigned NumElts = VT.getVectorNumElements();
17193 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
17196 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
17197 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17198 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
17199 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17202 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
17203 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17204 SDValue NegOne = DAG.getConstant(
17205 APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
17206 SDValue Zero = DAG.getConstant(
17207 APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
17209 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
17210 if (VT.is512BitVector())
17212 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17215 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17216 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17217 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17218 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17219 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17220 const X86Subtarget &Subtarget,
17221 SelectionDAG &DAG) {
17222 SDValue In = Op->getOperand(0);
17223 MVT VT = Op->getSimpleValueType(0);
17224 MVT InVT = In.getSimpleValueType();
17225 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17227 MVT SVT = VT.getVectorElementType();
17228 MVT InSVT = InVT.getVectorElementType();
17229 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
17231 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17233 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17235 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17236 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17237 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17242 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17243 // For 512-bit vectors, we need 128-bits or 256-bits.
17244 if (VT.getSizeInBits() > 128) {
17245 // Input needs to be at least the same number of elements as output, and
17246 // at least 128-bits.
17247 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17248 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17251 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17252 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17254 // SSE41 targets can use the pmovsx* instructions directly.
17255 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17256 X86ISD::VSEXT : X86ISD::VZEXT;
17257 if (Subtarget.hasSSE41())
17258 return DAG.getNode(ExtOpc, dl, VT, In);
17260 // We should only get here for sign extend.
17261 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17262 "Unexpected opcode!");
17264 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17268 // As SRAI is only available on i16/i32 types, we expand only up to i32
17269 // and handle i64 separately.
17270 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17271 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17272 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17273 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17274 Curr = DAG.getBitcast(CurrVT, Curr);
17277 SDValue SignExt = Curr;
17278 if (CurrVT != InVT) {
17279 unsigned SignExtShift =
17280 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17281 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17282 DAG.getConstant(SignExtShift, dl, MVT::i8));
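// There is no i64 SRAI, so for a v2i64 result build the upper 32 bits of
// each element by replicating the sign bit (arithmetic shift by 31) and
// interleaving those sign words with the sign-extended low words below.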
17288 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17289 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17290 DAG.getConstant(31, dl, MVT::i8));
17291 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17292 return DAG.getBitcast(VT, Ext);
17298 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17299 SelectionDAG &DAG) {
17300 MVT VT = Op->getSimpleValueType(0);
17301 SDValue In = Op->getOperand(0);
17302 MVT InVT = In.getSimpleValueType();
17305 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17306 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17308 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17309 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17310 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17313 if (Subtarget.hasInt256())
17314 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17316 // Optimize vectors in AVX mode:
17317 // sign extend v8i16 to v8i32 and v4i32 to v4i64.
17320 // Divide the input vector into two parts;
17321 // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 }.
17322 // Use the vpmovsx instruction to extend each half (v4i32 -> v2i64, v8i16 -> v4i32),
17323 // then concat the halves back to the original VT.
17325 unsigned NumElems = InVT.getVectorNumElements();
17326 SDValue Undef = DAG.getUNDEF(InVT);
17328 SmallVector<int,8> ShufMask1(NumElems, -1);
17329 for (unsigned i = 0; i != NumElems/2; ++i)
17332 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17334 SmallVector<int,8> ShufMask2(NumElems, -1);
17335 for (unsigned i = 0; i != NumElems/2; ++i)
17336 ShufMask2[i] = i + NumElems/2;
17338 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17340 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17341 VT.getVectorNumElements() / 2);
17343 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
17344 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
17346 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17349 // Lower a truncating store. We need special lowering for vXi1 vectors.
17350 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17351 SelectionDAG &DAG) {
17352 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17354 EVT MemVT = St->getMemoryVT();
17355 assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17356 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17357 "Expected truncstore of i1 vector");
17359 SDValue Op = St->getValue();
17360 MVT OpVT = Op.getValueType().getSimpleVT();
17361 unsigned NumElts = OpVT.getVectorNumElements();
17362 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17364 // Truncate and store - everything is legal
17365 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17366 if (MemVT.getSizeInBits() < 8)
17367 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17368 DAG.getUNDEF(MVT::v8i1), Op,
17369 DAG.getIntPtrConstant(0, dl));
17370 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17371 St->getMemOperand());
17374 // Only a subset of the AVX-512 features is available; assume we have only AVX-512F.
17375 if (NumElts <= 8) {
17377 // Extend to an 8-element vector.
17378 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17379 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17380 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17382 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17383 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17384 St->getMemOperand());
17387 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17388 // Divide the vector into 2 parts and store each part separately
17389 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17390 DAG.getIntPtrConstant(0, dl));
17391 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17392 SDValue BasePtr = St->getBasePtr();
17393 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17394 St->getMemOperand());
17395 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17396 DAG.getIntPtrConstant(16, dl));
17397 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
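// The upper v16i1 half lives 16 bits (2 bytes) past the base pointer.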
17399 SDValue BasePtrHi =
17400 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17401 DAG.getConstant(2, dl, BasePtr.getValueType()));
17403 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17404 BasePtrHi, St->getMemOperand());
17405 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
17408 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17409 const X86Subtarget &Subtarget,
17410 SelectionDAG &DAG) {
17412 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17414 EVT MemVT = Ld->getMemoryVT();
17415 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17416 "Expected i1 vector load");
17417 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17418 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17419 MVT VT = Op.getValueType().getSimpleVT();
17420 unsigned NumElts = VT.getVectorNumElements();
17422 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17424 // Load and extend - everything is legal
17426 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
17428 Ld->getMemOperand());
17429 // Replace chain users with the new chain.
17430 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17431 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17432 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17433 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
17435 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17436 DAG.getIntPtrConstant(0, dl));
17438 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
17440 Ld->getMemOperand());
17441 // Replace chain users with the new chain.
17442 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17443 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17445 // Finally, do a normal sign-extend to the desired register.
17446 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
17449 if (NumElts <= 8) {
17450 // Only a subset of the AVX-512 features is available; assume we have only AVX-512F.
17451 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
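// An i1 vector narrower than 8 elements still occupies a full byte in
// memory, so load at least 8 bits and pick out the lanes we need.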
17452 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
17453 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
17455 Ld->getMemOperand());
17456 // Replace chain users with the new chain.
17457 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17458 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17460 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
17461 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
17464 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
17466 // We need to handle v4i1 and v2i1 separately: extend to a v8i1 result and extract the subvector.
17468 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17469 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
17470 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17471 DAG.getIntPtrConstant(0, dl));
17474 assert(VT == MVT::v32i8 && "Unexpected extload type");
17476 SmallVector<SDValue, 2> Chains;
17478 SDValue BasePtr = Ld->getBasePtr();
17479 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17481 Ld->getMemOperand());
17482 Chains.push_back(LoadLo.getValue(1));
17484 SDValue BasePtrHi =
17485 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17486 DAG.getConstant(2, dl, BasePtr.getValueType()));
17488 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17490 Ld->getMemOperand());
17491 Chains.push_back(LoadHi.getValue(1));
17492 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17493 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
17495 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
17496 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
17497 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
17500 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
17501 // may emit an illegal shuffle but the expansion is still better than scalar
17502 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
17503 // we'll emit a shuffle and an arithmetic shift.
17504 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
17505 // TODO: It is possible to support ZExt by zeroing the undef values during
17506 // the shuffle phase or after the shuffle.
17507 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
17508 SelectionDAG &DAG) {
17509 MVT RegVT = Op.getSimpleValueType();
17510 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
17511 assert(RegVT.isInteger() &&
17512 "We only custom lower integer vector sext loads.");
17514 // Nothing useful we can do without SSE2 shuffles.
17515 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
17517 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17519 EVT MemVT = Ld->getMemoryVT();
17520 if (MemVT.getScalarType() == MVT::i1)
17521 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
17523 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17524 unsigned RegSz = RegVT.getSizeInBits();
17526 ISD::LoadExtType Ext = Ld->getExtensionType();
17528 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
17529 && "Only anyext and sext are currently implemented.");
17530 assert(MemVT != RegVT && "Cannot extend to the same type");
17531 assert(MemVT.isVector() && "Must load a vector from memory");
17533 unsigned NumElems = RegVT.getVectorNumElements();
17534 unsigned MemSz = MemVT.getSizeInBits();
17535 assert(RegSz > MemSz && "Register size must be greater than the mem size");
17537 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
17538 // The only way in which we have a legal 256-bit vector result but not the
17539 // integer 256-bit operations needed to directly lower a sextload is if we
17540 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
17541 // a 128-bit vector and a normal sign_extend to 256-bits that should get
17542 // correctly legalized. We do this late to allow the canonical form of
17543 // sextload to persist throughout the rest of the DAG combiner -- it wants
17544 // to fold together any extensions it can, and so will fuse a sign_extend
17545 // of an sextload into a sextload targeting a wider value.
17547 if (MemSz == 128) {
17548 // Just switch this to a normal load.
17549 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
17550 "it must be a legal 128-bit vector "
17552 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
17553 Ld->getPointerInfo(), Ld->getAlignment(),
17554 Ld->getMemOperand()->getFlags());
17556 assert(MemSz < 128 &&
17557 "Can't extend a type wider than 128 bits to a 256 bit vector!");
17558 // Do an sext load to a 128-bit vector type. We want to use the same
17559 // number of elements, but elements half as wide. This will end up being
17560 // recursively lowered by this routine, but will succeed as we definitely
17561 // have all the necessary features if we're using AVX1.
17563 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
17564 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
17566 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
17567 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
17568 Ld->getMemOperand()->getFlags());
17571 // Replace chain users with the new chain.
17572 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17573 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17575 // Finally, do a normal sign-extend to the desired register.
17576 return DAG.getSExtOrTrunc(Load, dl, RegVT);
17579 // All sizes must be a power of two.
17580 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
17581 "Non-power-of-two elements are not custom lowered!");
17583 // Attempt to load the original value using scalar loads.
17584 // Find the largest scalar type that divides the total loaded size.
17585 MVT SclrLoadTy = MVT::i8;
17586 for (MVT Tp : MVT::integer_valuetypes()) {
17587 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
17592 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
17593 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
17595 SclrLoadTy = MVT::f64;
17597 // Calculate the number of scalar loads that we need to perform
17598 // in order to load our vector from memory.
17599 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
17601 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
17602 "Can only lower sext loads with a single scalar load!");
17604 unsigned loadRegSize = RegSz;
17605 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
17608 // Represent our vector as a sequence of elements which are the
17609 // largest scalar that we can load.
17610 EVT LoadUnitVecVT = EVT::getVectorVT(
17611 *DAG.getContext(), SclrLoadTy, loadRegSize / SclrLoadTy.getSizeInBits());
17613 // Represent the data using the same element type that is stored in
17614 // memory. In practice, we "widen" MemVT.
17616 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
17617 loadRegSize / MemVT.getScalarSizeInBits());
17619 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
17620 "Invalid vector type");
17622 // We can't shuffle using an illegal type.
17623 assert(TLI.isTypeLegal(WideVecVT) &&
17624 "We only lower types that form legal widened vector types");
17626 SmallVector<SDValue, 8> Chains;
17627 SDValue Ptr = Ld->getBasePtr();
17628 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
17629 TLI.getPointerTy(DAG.getDataLayout()));
17630 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
17632 for (unsigned i = 0; i < NumLoads; ++i) {
17633 // Perform a single load.
17634 SDValue ScalarLoad =
17635 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
17636 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
17637 Chains.push_back(ScalarLoad.getValue(1));
17638 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17639 // another round of DAGCombining.
17641 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
17643 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
17644 ScalarLoad, DAG.getIntPtrConstant(i, dl));
17646 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17649 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17651 // Bitcast the loaded value to a vector of the original element type, in
17652 // the size of the target vector type.
17653 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
17654 unsigned SizeRatio = RegSz / MemSz;
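// Each in-register element is SizeRatio times wider than its in-memory
// form; the shuffle below places loaded element i into slot i * SizeRatio
// and leaves the remaining slots undef for the extension to fill.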
17656 if (Ext == ISD::SEXTLOAD) {
17657 // If we have SSE4.1, we can directly emit a VSEXT node.
17658 if (Subtarget.hasSSE41()) {
17659 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
17660 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17664 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lanes.
17666 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
17667 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
17669 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
17670 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17674 // Redistribute the loaded elements into the different locations.
17675 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
17676 for (unsigned i = 0; i != NumElems; ++i)
17677 ShuffleVec[i * SizeRatio] = i;
17679 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17680 DAG.getUNDEF(WideVecVT), ShuffleVec);
17682 // Bitcast to the requested type.
17683 Shuff = DAG.getBitcast(RegVT, Shuff);
17684 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17688 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
17689 /// each of which has no other use apart from the AND / OR.
17690 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
17691 Opc = Op.getOpcode();
17692 if (Opc != ISD::OR && Opc != ISD::AND)
17694 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17695 Op.getOperand(0).hasOneUse() &&
17696 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
17697 Op.getOperand(1).hasOneUse());
17700 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
17701 /// SETCC node has a single use.
17702 static bool isXor1OfSetCC(SDValue Op) {
17703 if (Op.getOpcode() != ISD::XOR)
17705 if (isOneConstant(Op.getOperand(1)))
17706 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17707 Op.getOperand(0).hasOneUse();
17711 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
17712 bool addTest = true;
17713 SDValue Chain = Op.getOperand(0);
17714 SDValue Cond = Op.getOperand(1);
17715 SDValue Dest = Op.getOperand(2);
17718 bool Inverted = false;
17720 if (Cond.getOpcode() == ISD::SETCC) {
17721 // Check for setcc([su]{add,sub,mul}o == 0).
17722 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
17723 isNullConstant(Cond.getOperand(1)) &&
17724 Cond.getOperand(0).getResNo() == 1 &&
17725 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
17726 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
17727 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
17728 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
17729 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
17730 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
17732 Cond = Cond.getOperand(0);
17734 if (SDValue NewCond = LowerSETCC(Cond, DAG))
17739 // FIXME: LowerXALUO doesn't handle these!!
17740 else if (Cond.getOpcode() == X86ISD::ADD ||
17741 Cond.getOpcode() == X86ISD::SUB ||
17742 Cond.getOpcode() == X86ISD::SMUL ||
17743 Cond.getOpcode() == X86ISD::UMUL)
17744 Cond = LowerXALUO(Cond, DAG);
17747 // Look past (and (setcc_carry (cmp ...)), 1).
17748 if (Cond.getOpcode() == ISD::AND &&
17749 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17750 isOneConstant(Cond.getOperand(1)))
17751 Cond = Cond.getOperand(0);
17753 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17754 // setting operand in place of the X86ISD::SETCC.
17755 unsigned CondOpcode = Cond.getOpcode();
17756 if (CondOpcode == X86ISD::SETCC ||
17757 CondOpcode == X86ISD::SETCC_CARRY) {
17758 CC = Cond.getOperand(0);
17760 SDValue Cmp = Cond.getOperand(1);
17761 unsigned Opc = Cmp.getOpcode();
17762 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
17763 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
17767 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
17771 // These can only come from an arithmetic instruction with overflow,
17772 // e.g. SADDO, UADDO.
17773 Cond = Cond.getOperand(1);
17779 CondOpcode = Cond.getOpcode();
17780 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17781 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17782 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17783 Cond.getOperand(0).getValueType() != MVT::i8)) {
17784 SDValue LHS = Cond.getOperand(0);
17785 SDValue RHS = Cond.getOperand(1);
17786 unsigned X86Opcode;
17789 // Keep this in sync with LowerXALUO, otherwise we might create redundant
17790 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and X86ISD::INC).
17792 switch (CondOpcode) {
17793 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17795 if (isOneConstant(RHS)) {
17796 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
17799 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17800 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17802 if (isOneConstant(RHS)) {
17803 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
17806 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17807 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17808 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17809 default: llvm_unreachable("unexpected overflowing operator");
17812 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
17813 if (CondOpcode == ISD::UMULO)
17814 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17817 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17819 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
17821 if (CondOpcode == ISD::UMULO)
17822 Cond = X86Op.getValue(2);
17824 Cond = X86Op.getValue(1);
17826 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17830 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
17831 SDValue Cmp = Cond.getOperand(0).getOperand(1);
17832 if (CondOpc == ISD::OR) {
17833 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
17834 // two branches instead of an explicit OR instruction with a separate test.
17836 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17837 isX86LogicalCmp(Cmp)) {
17838 CC = Cond.getOperand(0).getOperand(0);
17839 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17840 Chain, Dest, CC, Cmp);
17841 CC = Cond.getOperand(1).getOperand(0);
17845 } else { // ISD::AND
17846 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
17847 // two branches instead of an explicit AND instruction with a
17848 // separate test. However, we only do this if this block doesn't
17849 // have a fall-through edge, because this requires an explicit
17850 // jmp when the condition is false.
17851 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17852 isX86LogicalCmp(Cmp) &&
17853 Op.getNode()->hasOneUse()) {
17854 X86::CondCode CCode =
17855 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17856 CCode = X86::GetOppositeBranchCondition(CCode);
17857 CC = DAG.getConstant(CCode, dl, MVT::i8);
17858 SDNode *User = *Op.getNode()->use_begin();
17859 // Look for an unconditional branch following this conditional branch.
17860 // We need this because we need to reverse the successors in order
17861 // to implement FCMP_OEQ.
17862 if (User->getOpcode() == ISD::BR) {
17863 SDValue FalseBB = User->getOperand(1);
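// Retarget the unconditional branch at our destination so the conditional
// branch can take the old fall-through block, effectively swapping the two
// successors; this is what lets FCMP_OEQ be emitted as two branches.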
17865 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17866 assert(NewBR == User);
17870 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17871 Chain, Dest, CC, Cmp);
17872 X86::CondCode CCode =
17873 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
17874 CCode = X86::GetOppositeBranchCondition(CCode);
17875 CC = DAG.getConstant(CCode, dl, MVT::i8);
17881 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
17882 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
17883 // These should be transformed during DAG combining, except when the
17884 // condition is set by an arithmetic-with-overflow node.
17885 X86::CondCode CCode =
17886 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17887 CCode = X86::GetOppositeBranchCondition(CCode);
17888 CC = DAG.getConstant(CCode, dl, MVT::i8);
17889 Cond = Cond.getOperand(0).getOperand(1);
17891 } else if (Cond.getOpcode() == ISD::SETCC &&
17892 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
17893 // For FCMP_OEQ, we can emit
17894 // two branches instead of an explicit AND instruction with a
17895 // separate test. However, we only do this if this block doesn't
17896 // have a fall-through edge, because this requires an explicit
17897 // jmp when the condition is false.
17898 if (Op.getNode()->hasOneUse()) {
17899 SDNode *User = *Op.getNode()->use_begin();
17900 // Look for an unconditional branch following this conditional branch.
17901 // We need this because we need to reverse the successors in order
17902 // to implement FCMP_OEQ.
17903 if (User->getOpcode() == ISD::BR) {
17904 SDValue FalseBB = User->getOperand(1);
17906 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17907 assert(NewBR == User);
17911 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17912 Cond.getOperand(0), Cond.getOperand(1));
17913 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17914 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17915 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17916 Chain, Dest, CC, Cmp);
17917 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
17922 } else if (Cond.getOpcode() == ISD::SETCC &&
17923 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
17924 // For FCMP_UNE, we can emit
17925 // two branches instead of an explicit AND instruction with a
17926 // separate test. However, we only do this if this block doesn't
17927 // have a fall-through edge, because this requires an explicit
17928 // jmp when the condition is false.
17929 if (Op.getNode()->hasOneUse()) {
17930 SDNode *User = *Op.getNode()->use_begin();
17931 // Look for an unconditional branch following this conditional branch.
17932 // We need this because we need to reverse the successors in order
17933 // to implement FCMP_UNE.
17934 if (User->getOpcode() == ISD::BR) {
17935 SDValue FalseBB = User->getOperand(1);
17937 SDNode *NewBR = DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17938 assert(NewBR == User);
17941 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17942 Cond.getOperand(0), Cond.getOperand(1));
17943 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17944 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17945 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17946 Chain, Dest, CC, Cmp);
17947 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
17957 // Look past the truncate if the high bits are known zero.
17958 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17959 Cond = Cond.getOperand(0);
17961 // We know the result is compared against zero. Try to match it to BT.
17962 if (Cond.hasOneUse()) {
17963 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
17964 CC = NewSetCC.getOperand(0);
17965 Cond = NewSetCC.getOperand(1);
17972 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
17973 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17974 Cond = EmitTest(Cond, X86Cond, dl, DAG);
17976 Cond = ConvertCmpIfNecessary(Cond, DAG);
17977 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17978 Chain, Dest, CC, Cond);
17981 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
17982 // Calls to _alloca are needed to probe the stack when allocating more than 4k
17983 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
17984 // that the guard pages used by the OS virtual memory manager are allocated in
17985 // correct sequence.
17987 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17988 SelectionDAG &DAG) const {
17989 MachineFunction &MF = DAG.getMachineFunction();
17990 bool SplitStack = MF.shouldSplitStack();
17991 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
17996 SDNode *Node = Op.getNode();
17997 SDValue Chain = Op.getOperand(0);
17998 SDValue Size = Op.getOperand(1);
17999 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18000 EVT VT = Node->getValueType(0);
18002 // Chain the dynamic stack allocation so that it doesn't modify the stack
18003 // pointer when other instructions are using the stack.
18004 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18006 bool Is64Bit = Subtarget.is64Bit();
18007 MVT SPTy = getPointerTy(DAG.getDataLayout());
18011 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18012 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18013 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18014 " not tell us which reg is the stack pointer!");
18016 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18017 Chain = SP.getValue(1);
18018 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18019 unsigned StackAlign = TFI.getStackAlignment();
18020 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18021 if (Align > StackAlign)
18022 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18023 DAG.getConstant(-(uint64_t)Align, dl, VT));
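// Masking the new stack pointer with -Align rounds it down to the
// requested over-alignment; the SUB above already reserved Size bytes.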
18024 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18025 } else if (SplitStack) {
18026 MachineRegisterInfo &MRI = MF.getRegInfo();
18029 // The 64-bit implementation of segmented stacks needs to clobber both r10
18030 // and r11. This makes it impossible to use it along with nested parameters.
18031 const Function *F = MF.getFunction();
18032 for (const auto &A : F->args()) {
18033 if (A.hasNestAttr())
18034 report_fatal_error("Cannot use segmented stacks with functions that "
18035 "have nested arguments.");
18039 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18040 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18041 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18042 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18043 DAG.getRegister(Vreg, SPTy));
18045 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18046 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18047 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18049 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18050 unsigned SPReg = RegInfo->getStackRegister();
18051 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18052 Chain = SP.getValue(1);
18055 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18056 DAG.getConstant(-(uint64_t)Align, dl, VT));
18057 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18063 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18064 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18066 SDValue Ops[2] = {Result, Chain};
18067 return DAG.getMergeValues(Ops, dl);
18070 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18071 MachineFunction &MF = DAG.getMachineFunction();
18072 auto PtrVT = getPointerTy(MF.getDataLayout());
18073 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18075 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18078 if (!Subtarget.is64Bit() ||
18079 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18080 // vastart just stores the address of the VarArgsFrameIndex slot into the
18081 // memory location argument.
18082 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18083 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18084 MachinePointerInfo(SV));
18088 // __va_list_tag layout: gp_offset (0 - 6 * 8),
18089 // fp_offset (48 - 48 + 8 * 16),
18090 // overflow_arg_area (points to parameters passed in memory), reg_save_area.
18092 SmallVector<SDValue, 8> MemOps;
18093 SDValue FIN = Op.getOperand(1);
18095 SDValue Store = DAG.getStore(
18096 Op.getOperand(0), DL,
18097 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18098 MachinePointerInfo(SV));
18099 MemOps.push_back(Store);
18102 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18103 Store = DAG.getStore(
18104 Op.getOperand(0), DL,
18105 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18106 MachinePointerInfo(SV, 4));
18107 MemOps.push_back(Store);
18109 // Store ptr to overflow_arg_area
18110 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18111 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18113 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18114 MemOps.push_back(Store);
18116 // Store ptr to reg_save_area.
18117 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18118 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18119 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18120 Store = DAG.getStore(
18121 Op.getOperand(0), DL, RSFIN, FIN,
18122 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18123 MemOps.push_back(Store);
18124 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18127 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18128 assert(Subtarget.is64Bit() &&
18129 "LowerVAARG only handles 64-bit va_arg!");
18130 assert(Op.getNumOperands() == 4);
18132 MachineFunction &MF = DAG.getMachineFunction();
18133 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18134 // The Win64 ABI uses char* instead of a structure.
18135 return DAG.expandVAArg(Op.getNode());
18137 SDValue Chain = Op.getOperand(0);
18138 SDValue SrcPtr = Op.getOperand(1);
18139 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18140 unsigned Align = Op.getConstantOperandVal(3);
18143 EVT ArgVT = Op.getNode()->getValueType(0);
18144 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18145 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
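// ArgMode tells the VAARG_64 pseudo which va_list area to read from:
// 1 = gp_offset (integer args from the GPR save area), 2 = fp_offset
// (FP args from the XMM save area).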
18148 // Decide which area this value should be read from.
18149 // TODO: Implement the AMD64 ABI in its entirety. This simple
18150 // selection mechanism works only for the basic types.
18151 if (ArgVT == MVT::f80) {
18152 llvm_unreachable("va_arg for f80 not yet implemented");
18153 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18154 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18155 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18156 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18158 llvm_unreachable("Unhandled argument type in LowerVAARG");
18161 if (ArgMode == 2) {
18162 // Sanity Check: Make sure using fp_offset makes sense.
18163 assert(!Subtarget.useSoftFloat() &&
18164 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18165 Subtarget.hasSSE1());
18168 // Insert VAARG_64 node into the DAG
18169 // VAARG_64 returns two values: Variable Argument Address, Chain
18170 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18171 DAG.getConstant(ArgMode, dl, MVT::i8),
18172 DAG.getConstant(Align, dl, MVT::i32)};
18173 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18174 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18175 VTs, InstOps, MVT::i64,
18176 MachinePointerInfo(SV),
18178 /*Volatile=*/false,
18180 /*WriteMem=*/true);
18181 Chain = VAARG.getValue(1);
18183 // Load the next argument and return it
18184 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18187 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18188 SelectionDAG &DAG) {
18189 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18190 // where a va_list is still an i8*.
18191 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18192 if (Subtarget.isCallingConvWin64(
18193 DAG.getMachineFunction().getFunction()->getCallingConv()))
18194 // Probably a Win64 va_copy.
18195 return DAG.expandVACopy(Op.getNode());
18197 SDValue Chain = Op.getOperand(0);
18198 SDValue DstPtr = Op.getOperand(1);
18199 SDValue SrcPtr = Op.getOperand(2);
18200 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18201 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18204 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18205 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18207 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18210 /// Handle vector element shifts where the shift amount is a constant.
18211 /// Takes immediate version of shift as input.
18212 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18213 SDValue SrcOp, uint64_t ShiftAmt,
18214 SelectionDAG &DAG) {
18215 MVT ElementType = VT.getVectorElementType();
18217 // Fold this packed shift into its first operand if ShiftAmt is 0.
18221 // Check for ShiftAmt >= element width
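// An out-of-range arithmetic shift still just replicates the sign bit, so
// clamp it to width - 1; out-of-range logical shifts produce zero.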
18222 if (ShiftAmt >= ElementType.getSizeInBits()) {
18223 if (Opc == X86ISD::VSRAI)
18224 ShiftAmt = ElementType.getSizeInBits() - 1;
18226 return DAG.getConstant(0, dl, VT);
18229 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18230 && "Unknown target vector shift-by-constant node");
18232 // Fold this packed vector shift into a build vector if SrcOp is a
18233 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
18234 if (VT == SrcOp.getSimpleValueType() &&
18235 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18236 SmallVector<SDValue, 8> Elts;
18237 unsigned NumElts = SrcOp->getNumOperands();
18238 ConstantSDNode *ND;
18241 default: llvm_unreachable("Unknown opcode!");
18242 case X86ISD::VSHLI:
18243 for (unsigned i=0; i!=NumElts; ++i) {
18244 SDValue CurrentOp = SrcOp->getOperand(i);
18245 if (CurrentOp->isUndef()) {
18246 Elts.push_back(CurrentOp);
18249 ND = cast<ConstantSDNode>(CurrentOp);
18250 const APInt &C = ND->getAPIntValue();
18251 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18254 case X86ISD::VSRLI:
18255 for (unsigned i=0; i!=NumElts; ++i) {
18256 SDValue CurrentOp = SrcOp->getOperand(i);
18257 if (CurrentOp->isUndef()) {
18258 Elts.push_back(CurrentOp);
18261 ND = cast<ConstantSDNode>(CurrentOp);
18262 const APInt &C = ND->getAPIntValue();
18263 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18266 case X86ISD::VSRAI:
18267 for (unsigned i=0; i!=NumElts; ++i) {
18268 SDValue CurrentOp = SrcOp->getOperand(i);
18269 if (CurrentOp->isUndef()) {
18270 Elts.push_back(CurrentOp);
18273 ND = cast<ConstantSDNode>(CurrentOp);
18274 const APInt &C = ND->getAPIntValue();
18275 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18280 return DAG.getBuildVector(VT, dl, Elts);
18283 return DAG.getNode(Opc, dl, VT, SrcOp,
18284 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18287 /// Handle vector element shifts where the shift amount may or may not be a
18288 /// constant. Takes immediate version of shift as input.
18289 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18290 SDValue SrcOp, SDValue ShAmt,
18291 const X86Subtarget &Subtarget,
18292 SelectionDAG &DAG) {
18293 MVT SVT = ShAmt.getSimpleValueType();
18294 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18296 // Catch shift-by-constant.
18297 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18298 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18299 CShAmt->getZExtValue(), DAG);
18301 // Change opcode to non-immediate version
18303 default: llvm_unreachable("Unknown target vector shift node");
18304 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18305 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18306 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18309 // Need to build a vector containing shift amount.
18310 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
18311 // +=================+============+=======================================+
18312 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18313 // +=================+============+=======================================+
18314 // | i64 | Yes, No | Use ShAmt as lowest elt |
18315 // | i32 | Yes | zero-extend in-reg |
18316 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18317 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
18318 // +=================+============+=======================================+
18320 if (SVT == MVT::i64)
18321 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18322 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18323 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18324 SDValue Op0 = ShAmt.getOperand(0);
18325 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
18326 ShAmt = DAG.getZeroExtendVectorInReg(Op0, SDLoc(Op0), MVT::v2i64);
18327 } else if (Subtarget.hasSSE41() &&
18328 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18329 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18330 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
18332 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18333 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18334 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
18337 // The return type has to be a 128-bit type with the same element
18338 // type as the input type.
18339 MVT EltVT = VT.getVectorElementType();
18340 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18342 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18343 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18346 /// \brief Return Mask with the necessary casting or extending
18347 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
18348 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18349 const X86Subtarget &Subtarget, SelectionDAG &DAG,
18352 if (isAllOnesConstant(Mask))
18353 return DAG.getTargetConstant(1, dl, MaskVT);
18354 if (X86::isZeroNode(Mask))
18355 return DAG.getTargetConstant(0, dl, MaskVT);
18357 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18358 // Mask should be extended
18359 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18360 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18363 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18364 if (MaskVT == MVT::v64i1) {
18365 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18366 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
18368 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18369 DAG.getConstant(0, dl, MVT::i32));
18370 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18371 DAG.getConstant(1, dl, MVT::i32));
18373 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18374 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18376 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18378 // MaskVT requires < 64 bits; truncate the mask (this should always succeed) and bitcast to MaskVT.
18380 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18381 return DAG.getBitcast(MaskVT,
18382 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
18386 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18387 Mask.getSimpleValueType().getSizeInBits());
18388 // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements are
18389 // extracted with EXTRACT_SUBVECTOR.
18390 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18391 DAG.getBitcast(BitcastVT, Mask),
18392 DAG.getIntPtrConstant(0, dl));
18396 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18397 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18398 /// necessary casting or extending of \p Mask when lowering masking intrinsics.
18399 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18400 SDValue PreservedSrc,
18401 const X86Subtarget &Subtarget,
18402 SelectionDAG &DAG) {
18403 MVT VT = Op.getSimpleValueType();
18404 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18405 unsigned OpcodeSelect = ISD::VSELECT;
18408 if (isAllOnesConstant(Mask))
18411 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18413 switch (Op.getOpcode()) {
18415 case X86ISD::PCMPEQM:
18416 case X86ISD::PCMPGTM:
18418 case X86ISD::CMPMU:
18419 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
18420 case X86ISD::VFPCLASS:
18421 case X86ISD::VFPCLASSS:
18422 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18423 case X86ISD::VTRUNC:
18424 case X86ISD::VTRUNCS:
18425 case X86ISD::VTRUNCUS:
18426 case X86ISD::CVTPS2PH:
18427 // We can't use ISD::VSELECT here because it is not always "Legal"
18428 // for the destination type. For example, vpmovqb requires only AVX512,
18429 // while a vselect that operates on byte elements requires BWI.
18430 OpcodeSelect = X86ISD::SELECT;
18433 if (PreservedSrc.isUndef())
18434 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18435 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
18438 /// \brief Creates an SDNode for a predicated scalar operation.
18439 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
18440 /// The mask comes in as MVT::i8 and should be truncated
18441 /// to MVT::i1 while lowering masking intrinsics.
18442 /// The main difference between ScalarMaskingNode and VectorMaskingNode is that
18443 /// the former uses "X86select" instead of "vselect": we simply can't create a
18444 /// "vselect" node for a scalar instruction.
18445 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
18446 SDValue PreservedSrc,
18447 const X86Subtarget &Subtarget,
18448 SelectionDAG &DAG) {
18449 if (isAllOnesConstant(Mask))
18452 MVT VT = Op.getSimpleValueType();
18454 // The mask should be of type MVT::i1
18455 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
18457 if (Op.getOpcode() == X86ISD::FSETCCM ||
18458 Op.getOpcode() == X86ISD::FSETCCM_RND)
18459 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
18460 if (Op.getOpcode() == X86ISD::VFPCLASS ||
18461 Op.getOpcode() == X86ISD::VFPCLASSS)
18462 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
18464 if (PreservedSrc.isUndef())
18465 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18466 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
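/// Return the size in bytes of the EH registration node that WinEHStatePass
/// allocates on the stack for this function's personality.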
18469 static int getSEHRegistrationNodeSize(const Function *Fn) {
18470 if (!Fn->hasPersonalityFn())
18471 report_fatal_error(
18472 "querying registration node size for function without personality");
18473 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
18474 // WinEHStatePass for the full struct definition.
18475 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
18476 case EHPersonality::MSVC_X86SEH: return 24;
18477 case EHPersonality::MSVC_CXX: return 16;
18480 report_fatal_error(
18481 "can only recover FP for 32-bit MSVC EH personality functions");
18484 /// When the MSVC runtime transfers control to us, either to an outlined
18485 /// function or when returning to a parent frame after catching an exception, we
18486 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
18487 /// Here's the math:
18488 /// RegNodeBase = EntryEBP - RegNodeSize
18489 /// ParentFP = RegNodeBase - ParentFrameOffset
18490 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
18491 /// subtracting the offset (negative on x86) takes us back to the parent FP.
18492 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
18493 SDValue EntryEBP) {
18494 MachineFunction &MF = DAG.getMachineFunction();
18497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18498 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18500 // It's possible that the parent function no longer has a personality function
18501 // if the exceptional code was optimized away, in which case we just return
18502 // the incoming EBP.
18503 if (!Fn->hasPersonalityFn())
18506 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
18507 // registration, or the .set_setframe offset.
18508 MCSymbol *OffsetSym =
18509 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
18510 GlobalValue::getRealLinkageName(Fn->getName()));
18511 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
18512 SDValue ParentFrameOffset =
18513 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
18515 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
18516 // prologue to RBP in the parent function.
18517 const X86Subtarget &Subtarget =
18518 static_cast<const X86Subtarget &>(DAG.getSubtarget());
18519 if (Subtarget.is64Bit())
18520 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
18522 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
18523 // RegNodeBase = EntryEBP - RegNodeSize
18524 // ParentFP = RegNodeBase - ParentFrameOffset
18525 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
18526 DAG.getConstant(RegNodeSize, dl, PtrVT));
18527 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
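/// Lower ISD::INTRINSIC_WO_CHAIN. Intrinsics with an entry in the
/// IntrinsicsInfo table are dispatched on IntrData->Type; the remaining
/// intrinsics are handled by ID in the switch further below.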
18530 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18531 SelectionDAG &DAG) {
18532 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
18533 auto isRoundModeCurDirection = [](SDValue Rnd) {
18534 if (!isa<ConstantSDNode>(Rnd))
18537 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
18538 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
18542 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18543 MVT VT = Op.getSimpleValueType();
18544 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
18546 switch(IntrData->Type) {
18547 case INTR_TYPE_1OP:
18548 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
18549 case INTR_TYPE_2OP:
18550 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18552 case INTR_TYPE_3OP:
18553 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18554 Op.getOperand(2), Op.getOperand(3));
18555 case INTR_TYPE_4OP:
18556 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18557 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
18558 case INTR_TYPE_1OP_MASK_RM: {
18559 SDValue Src = Op.getOperand(1);
18560 SDValue PassThru = Op.getOperand(2);
18561 SDValue Mask = Op.getOperand(3);
18562 SDValue RoundingMode;
18563 // We always add a rounding-mode operand to the node.
18564 // If no rounding mode is specified, we default to
18565 // "current direction".
18566 if (Op.getNumOperands() == 4)
18568 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18570 RoundingMode = Op.getOperand(4);
18571 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
18572 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18574 Mask, PassThru, Subtarget, DAG);
18576 case INTR_TYPE_1OP_MASK: {
18577 SDValue Src = Op.getOperand(1);
18578 SDValue PassThru = Op.getOperand(2);
18579 SDValue Mask = Op.getOperand(3);
18580 // We add a rounding-mode operand to the node only when
18581 // - an RM opcode is specified, and
18582 // - the RM is not "current direction".
18583 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18584 if (IntrWithRoundingModeOpcode != 0) {
18585 SDValue Rnd = Op.getOperand(4);
18586 if (!isRoundModeCurDirection(Rnd)) {
18587 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18588 dl, Op.getValueType(),
18590 Mask, PassThru, Subtarget, DAG);
18593 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
18594 Mask, PassThru, Subtarget, DAG);
18596 case INTR_TYPE_SCALAR_MASK: {
18597 SDValue Src1 = Op.getOperand(1);
18598 SDValue Src2 = Op.getOperand(2);
18599 SDValue passThru = Op.getOperand(3);
18600 SDValue Mask = Op.getOperand(4);
18601 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
18602 Mask, passThru, Subtarget, DAG);
18604 case INTR_TYPE_SCALAR_MASK_RM: {
18605 SDValue Src1 = Op.getOperand(1);
18606 SDValue Src2 = Op.getOperand(2);
18607 SDValue Src0 = Op.getOperand(3);
18608 SDValue Mask = Op.getOperand(4);
18609 // There are 2 kinds of intrinsics in this group:
18610 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
18611 // (2) With both rounding mode and sae - 7 operands.
18612 if (Op.getNumOperands() == 6) {
18613 SDValue Sae = Op.getOperand(5);
18614 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18616 Mask, Src0, Subtarget, DAG);
18618 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
18619 SDValue RoundingMode = Op.getOperand(5);
18620 SDValue Sae = Op.getOperand(6);
18621 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18622 RoundingMode, Sae),
18623 Mask, Src0, Subtarget, DAG);
18625 case INTR_TYPE_2OP_MASK:
18626 case INTR_TYPE_2OP_IMM8_MASK: {
18627 SDValue Src1 = Op.getOperand(1);
18628 SDValue Src2 = Op.getOperand(2);
18629 SDValue PassThru = Op.getOperand(3);
18630 SDValue Mask = Op.getOperand(4);
18632 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
18633 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
18635 // We specify 2 possible opcodes for intrinsics with rounding modes.
18636 // First, we check whether the intrinsic may have a non-default rounding mode
18637 // (IntrData->Opc1 != 0), then we check the rounding-mode operand.
18638 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18639 if (IntrWithRoundingModeOpcode != 0) {
18640 SDValue Rnd = Op.getOperand(5);
18641 if (!isRoundModeCurDirection(Rnd)) {
18642 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18643 dl, Op.getValueType(),
18645 Mask, PassThru, Subtarget, DAG);
18648 // TODO: Intrinsics should have fast-math-flags to propagate.
18649 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
18650 Mask, PassThru, Subtarget, DAG);
18652 case INTR_TYPE_2OP_MASK_RM: {
18653 SDValue Src1 = Op.getOperand(1);
18654 SDValue Src2 = Op.getOperand(2);
18655 SDValue PassThru = Op.getOperand(3);
18656 SDValue Mask = Op.getOperand(4);
18657 // We specify 2 possible modes for intrinsics: with or without a rounding mode.
18659 // First, we check whether the intrinsic has a rounding mode (6 operands);
18660 // if not, we set the rounding mode to CUR_DIRECTION.
18662 if (Op.getNumOperands() == 6)
18663 Rnd = Op.getOperand(5);
18665 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18666 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18668 Mask, PassThru, Subtarget, DAG);
18670 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
18671 SDValue Src1 = Op.getOperand(1);
18672 SDValue Src2 = Op.getOperand(2);
18673 SDValue Src3 = Op.getOperand(3);
18674 SDValue PassThru = Op.getOperand(4);
18675 SDValue Mask = Op.getOperand(5);
18676 SDValue Sae = Op.getOperand(6);
18678 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
18680 Mask, PassThru, Subtarget, DAG);
18682 case INTR_TYPE_3OP_MASK_RM: {
18683 SDValue Src1 = Op.getOperand(1);
18684 SDValue Src2 = Op.getOperand(2);
18685 SDValue Imm = Op.getOperand(3);
18686 SDValue PassThru = Op.getOperand(4);
18687 SDValue Mask = Op.getOperand(5);
18688 // We specify 2 possible modes for intrinsics: with or without a rounding mode.
18690 // First, we check whether the intrinsic has a rounding mode (7 operands);
18691 // if not, we set the rounding mode to CUR_DIRECTION.
18693 if (Op.getNumOperands() == 7)
18694 Rnd = Op.getOperand(6);
18696 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18697 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18698 Src1, Src2, Imm, Rnd),
18699 Mask, PassThru, Subtarget, DAG);
18701 case INTR_TYPE_3OP_IMM8_MASK:
18702 case INTR_TYPE_3OP_MASK: {
18703 SDValue Src1 = Op.getOperand(1);
18704 SDValue Src2 = Op.getOperand(2);
18705 SDValue Src3 = Op.getOperand(3);
18706 SDValue PassThru = Op.getOperand(4);
18707 SDValue Mask = Op.getOperand(5);
18709 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
18710 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
18712 // We specify 2 possible opcodes for intrinsics with rounding modes.
18713 // First, we check whether the intrinsic may have a non-default rounding mode
18714 // (IntrData->Opc1 != 0), then we check the rounding-mode operand.
18715 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18716 if (IntrWithRoundingModeOpcode != 0) {
18717 SDValue Rnd = Op.getOperand(6);
18718 if (!isRoundModeCurDirection(Rnd)) {
18719 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18720 dl, Op.getValueType(),
18721 Src1, Src2, Src3, Rnd),
18722 Mask, PassThru, Subtarget, DAG);
18725 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18727 Mask, PassThru, Subtarget, DAG);
18729 case VPERM_2OP_MASK : {
18730 SDValue Src1 = Op.getOperand(1);
18731 SDValue Src2 = Op.getOperand(2);
18732 SDValue PassThru = Op.getOperand(3);
18733 SDValue Mask = Op.getOperand(4);
18735 // Swap Src1 and Src2 in the node creation
18736 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
18737 Mask, PassThru, Subtarget, DAG);
18739 case VPERM_3OP_MASKZ:
18740 case VPERM_3OP_MASK:{
18741 MVT VT = Op.getSimpleValueType();
18742 // Src2 is the PassThru
18743 SDValue Src1 = Op.getOperand(1);
18744 // PassThru needs to be the same type as the destination in order
18745 // to pattern match correctly.
18746 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
18747 SDValue Src3 = Op.getOperand(3);
18748 SDValue Mask = Op.getOperand(4);
18749 SDValue PassThru = SDValue();
18751 // set PassThru element
18752 if (IntrData->Type == VPERM_3OP_MASKZ)
18753 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18757 // Swap Src1 and Src2 in the node creation
18758 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18759 dl, Op.getValueType(),
18761 Mask, PassThru, Subtarget, DAG);
18765 case FMA_OP_MASK: {
18766 SDValue Src1 = Op.getOperand(1);
18767 SDValue Src2 = Op.getOperand(2);
18768 SDValue Src3 = Op.getOperand(3);
18769 SDValue Mask = Op.getOperand(4);
18770 MVT VT = Op.getSimpleValueType();
18771 SDValue PassThru = SDValue();
18773 // set PassThru element
18774 if (IntrData->Type == FMA_OP_MASKZ)
18775 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18776 else if (IntrData->Type == FMA_OP_MASK3)
18781 // We specify 2 possible opcodes for intrinsics with rounding modes.
18782 // First, we check whether the intrinsic may have a non-default rounding mode
18783 // (IntrData->Opc1 != 0), then we check the rounding-mode operand.
18784 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18785 if (IntrWithRoundingModeOpcode != 0) {
18786 SDValue Rnd = Op.getOperand(5);
18787 if (!isRoundModeCurDirection(Rnd))
18788 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18789 dl, Op.getValueType(),
18790 Src1, Src2, Src3, Rnd),
18791 Mask, PassThru, Subtarget, DAG);
18793 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18794 dl, Op.getValueType(),
18796 Mask, PassThru, Subtarget, DAG);
18798 case FMA_OP_SCALAR_MASK:
18799 case FMA_OP_SCALAR_MASK3:
18800 case FMA_OP_SCALAR_MASKZ: {
18801 SDValue Src1 = Op.getOperand(1);
18802 SDValue Src2 = Op.getOperand(2);
18803 SDValue Src3 = Op.getOperand(3);
18804 SDValue Mask = Op.getOperand(4);
18805 MVT VT = Op.getSimpleValueType();
18806 SDValue PassThru = SDValue();
18808 // set PassThru element
18809 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
18810 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18811 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
18816 SDValue Rnd = Op.getOperand(5);
18817 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
18818 Op.getValueType(), Src1, Src2,
18820 Mask, PassThru, Subtarget, DAG);
18822 case TERLOG_OP_MASK:
18823 case TERLOG_OP_MASKZ: {
18824 SDValue Src1 = Op.getOperand(1);
18825 SDValue Src2 = Op.getOperand(2);
18826 SDValue Src3 = Op.getOperand(3);
18827 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
18828 SDValue Mask = Op.getOperand(5);
18829 MVT VT = Op.getSimpleValueType();
18830 SDValue PassThru = Src1;
18831 // Set PassThru element.
18832 if (IntrData->Type == TERLOG_OP_MASKZ)
18833 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18835 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18836 Src1, Src2, Src3, Src4),
18837 Mask, PassThru, Subtarget, DAG);
18840 // ISD::FP_ROUND has a second argument that indicates if the truncation
18841 // does not change the value. Set it to 0 since it can change.
18842 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
18843 DAG.getIntPtrConstant(0, dl));
18844 case CVTPD2PS_MASK: {
18845 SDValue Src = Op.getOperand(1);
18846 SDValue PassThru = Op.getOperand(2);
18847 SDValue Mask = Op.getOperand(3);
18848 // We add a rounding-mode operand to the node only when
18849 // - an RM opcode is specified, and
18850 // - the RM is not "current direction".
18851 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18852 if (IntrWithRoundingModeOpcode != 0) {
18853 SDValue Rnd = Op.getOperand(4);
18854 if (!isRoundModeCurDirection(Rnd)) {
18855 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18856 dl, Op.getValueType(),
18858 Mask, PassThru, Subtarget, DAG);
18861 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
18862 // ISD::FP_ROUND has a second argument that indicates if the truncation
18863 // does not change the value. Set it to 0 since it can change.
18864 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18865 DAG.getIntPtrConstant(0, dl)),
18866 Mask, PassThru, Subtarget, DAG);
18869 // FPclass intrinsics with mask
18870 SDValue Src1 = Op.getOperand(1);
18871 MVT VT = Src1.getSimpleValueType();
18872 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18873 SDValue Imm = Op.getOperand(2);
18874 SDValue Mask = Op.getOperand(3);
18875 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18876 Mask.getSimpleValueType().getSizeInBits());
18877 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
18878 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
18879 DAG.getTargetConstant(0, dl, MaskVT),
18881 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
18882 DAG.getUNDEF(BitcastVT), FPclassMask,
18883 DAG.getIntPtrConstant(0, dl));
18884 return DAG.getBitcast(Op.getValueType(), Res);
18887 SDValue Src1 = Op.getOperand(1);
18888 SDValue Imm = Op.getOperand(2);
18889 SDValue Mask = Op.getOperand(3);
18890 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
18891 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
18892 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
18893 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
18896 case CMP_MASK_CC: {
18897 // Comparison intrinsics with masks.
18898 // Example of transformation:
18899 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
18900 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
18902 // (v8i1 (insert_subvector undef,
18903 // (v2i1 (and (PCMPEQM %a, %b),
18904 // (extract_subvector
18905 // (v8i1 (bitcast %mask)), 0))), 0))))
18906 MVT VT = Op.getOperand(1).getSimpleValueType();
18907 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18908 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
18909 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18910 Mask.getSimpleValueType().getSizeInBits());
18912 if (IntrData->Type == CMP_MASK_CC) {
18913 SDValue CC = Op.getOperand(3);
18914 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
18915 // We specify 2 possible opcodes for intrinsics with rounding modes.
18916 // First, we check whether the intrinsic may have a non-default rounding mode
18917 // (IntrData->Opc1 != 0), then we check the rounding-mode operand.
18918 if (IntrData->Opc1 != 0) {
18919 SDValue Rnd = Op.getOperand(5);
18920 if (!isRoundModeCurDirection(Rnd))
18921 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
18922 Op.getOperand(2), CC, Rnd);
18924 // Default rounding mode.
18926 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
18927 Op.getOperand(2), CC);
18930 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
18931 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
18934 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
18935 DAG.getTargetConstant(0, dl,
18938 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
18939 DAG.getUNDEF(BitcastVT), CmpMask,
18940 DAG.getIntPtrConstant(0, dl));
18941 return DAG.getBitcast(Op.getValueType(), Res);
18943 case CMP_MASK_SCALAR_CC: {
18944 SDValue Src1 = Op.getOperand(1);
18945 SDValue Src2 = Op.getOperand(2);
18946 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
18947 SDValue Mask = Op.getOperand(4);
18950 if (IntrData->Opc1 != 0) {
18951 SDValue Rnd = Op.getOperand(5);
18952 if (!isRoundModeCurDirection(Rnd))
18953 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
18955 // Default rounding mode.
18957 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
18959 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
18960 DAG.getTargetConstant(0, dl,
18964 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
18966 case COMI: { // Comparison intrinsics
18967 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
18968 SDValue LHS = Op.getOperand(1);
18969 SDValue RHS = Op.getOperand(2);
18970 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
18971 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
18974 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
18975 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
18976 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
18977 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
18980 case ISD::SETNE: { // (ZF = 1 or PF = 1)
18981 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
18982 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
18983 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
18986 case ISD::SETGT: // (CF = 0 and ZF = 0)
18987 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
18989 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
18990 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
18993 case ISD::SETGE: // CF = 0
18994 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
18996 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
18997 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19000 llvm_unreachable("Unexpected illegal condition!");
19002 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19004 case COMI_RM: { // Comparison intrinsics with Sae
19005 SDValue LHS = Op.getOperand(1);
19006 SDValue RHS = Op.getOperand(2);
19007 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19008 SDValue Sae = Op.getOperand(4);
19011 if (isRoundModeCurDirection(Sae))
19012 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19013 DAG.getConstant(CondVal, dl, MVT::i8));
19015 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19016 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19017 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19018 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19021 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19022 Op.getOperand(1), Op.getOperand(2), Subtarget,
19024 case COMPRESS_EXPAND_IN_REG: {
19025 SDValue Mask = Op.getOperand(3);
19026 SDValue DataToCompress = Op.getOperand(1);
19027 SDValue PassThru = Op.getOperand(2);
19028 if (isAllOnesConstant(Mask)) // return data as is
19029 return Op.getOperand(1);
19031 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19033 Mask, PassThru, Subtarget, DAG);
19036 SDValue Mask = Op.getOperand(1);
19037 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19038 Mask.getSimpleValueType().getSizeInBits());
19039 Mask = DAG.getBitcast(MaskVT, Mask);
19040 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19043 MVT VT = Op.getSimpleValueType();
19044 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19046 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19047 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19048 // Arguments should be swapped.
19049 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19050 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19052 return DAG.getBitcast(VT, Res);
19055 case FIXUPIMMS_MASKZ:
19057 case FIXUPIMM_MASKZ:{
19058 SDValue Src1 = Op.getOperand(1);
19059 SDValue Src2 = Op.getOperand(2);
19060 SDValue Src3 = Op.getOperand(3);
19061 SDValue Imm = Op.getOperand(4);
19062 SDValue Mask = Op.getOperand(5);
19063 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19064 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19065 // We specify 2 possible modes for intrinsics: with or without a rounding mode.
19067 // First, we check whether the intrinsic has a rounding mode (7 operands);
19068 // if not, we set the rounding mode to CUR_DIRECTION.
19070 if (Op.getNumOperands() == 7)
19071 Rnd = Op.getOperand(6);
19073 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19074 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19075 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19076 Src1, Src2, Src3, Imm, Rnd),
19077 Mask, Passthru, Subtarget, DAG);
19078 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19079 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19080 Src1, Src2, Src3, Imm, Rnd),
19081 Mask, Passthru, Subtarget, DAG);
19083 case CONVERT_TO_MASK: {
19084 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19085 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19086 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19088 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19090 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19091 DAG.getUNDEF(BitcastVT), CvtMask,
19092 DAG.getIntPtrConstant(0, dl));
19093 return DAG.getBitcast(Op.getValueType(), Res);
19095 case CONVERT_MASK_TO_VEC: {
19096 SDValue Mask = Op.getOperand(1);
19097 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19098 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19099 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19101 case BRCST_SUBVEC_TO_VEC: {
19102 SDValue Src = Op.getOperand(1);
19103 SDValue Passthru = Op.getOperand(2);
19104 SDValue Mask = Op.getOperand(3);
19105 EVT resVT = Passthru.getValueType();
19106 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19107 DAG.getUNDEF(resVT), Src,
19108 DAG.getIntPtrConstant(0, dl));
19110 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19111 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19113 immVal = DAG.getConstant(0, dl, MVT::i8);
19114 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19115 subVec, subVec, immVal),
19116 Mask, Passthru, Subtarget, DAG);
19118 case BRCST32x2_TO_VEC: {
19119 SDValue Src = Op.getOperand(1);
19120 SDValue PassThru = Op.getOperand(2);
19121 SDValue Mask = Op.getOperand(3);
19123 assert((VT.getScalarType() == MVT::i32 ||
19124 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19125 // Bitcast Src to a vector of packed 64-bit elements.
19126 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19127 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19128 Src = DAG.getBitcast(BitcastVT, Src);
19130 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19131 Mask, PassThru, Subtarget, DAG);
19139 default: return SDValue(); // Don't custom lower most intrinsics.
19141 case Intrinsic::x86_avx2_permd:
19142 case Intrinsic::x86_avx2_permps:
19143 // Operands intentionally swapped. Mask is last operand to intrinsic,
19144 // but second operand for node/instruction.
19145 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19146 Op.getOperand(2), Op.getOperand(1));
19148 // ptest and testp intrinsics. The intrinsics these come from are designed to
19149 // return an integer value, not just an instruction, so lower them to the ptest
19150 // or testp pattern plus a setcc for the result.
19151 case Intrinsic::x86_sse41_ptestz:
19152 case Intrinsic::x86_sse41_ptestc:
19153 case Intrinsic::x86_sse41_ptestnzc:
19154 case Intrinsic::x86_avx_ptestz_256:
19155 case Intrinsic::x86_avx_ptestc_256:
19156 case Intrinsic::x86_avx_ptestnzc_256:
19157 case Intrinsic::x86_avx_vtestz_ps:
19158 case Intrinsic::x86_avx_vtestc_ps:
19159 case Intrinsic::x86_avx_vtestnzc_ps:
19160 case Intrinsic::x86_avx_vtestz_pd:
19161 case Intrinsic::x86_avx_vtestc_pd:
19162 case Intrinsic::x86_avx_vtestnzc_pd:
19163 case Intrinsic::x86_avx_vtestz_ps_256:
19164 case Intrinsic::x86_avx_vtestc_ps_256:
19165 case Intrinsic::x86_avx_vtestnzc_ps_256:
19166 case Intrinsic::x86_avx_vtestz_pd_256:
19167 case Intrinsic::x86_avx_vtestc_pd_256:
19168 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19169 bool IsTestPacked = false;
19170 X86::CondCode X86CC;
19172 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19173 case Intrinsic::x86_avx_vtestz_ps:
19174 case Intrinsic::x86_avx_vtestz_pd:
19175 case Intrinsic::x86_avx_vtestz_ps_256:
19176 case Intrinsic::x86_avx_vtestz_pd_256:
19177 IsTestPacked = true;
19179 case Intrinsic::x86_sse41_ptestz:
19180 case Intrinsic::x86_avx_ptestz_256:
19182 X86CC = X86::COND_E;
19184 case Intrinsic::x86_avx_vtestc_ps:
19185 case Intrinsic::x86_avx_vtestc_pd:
19186 case Intrinsic::x86_avx_vtestc_ps_256:
19187 case Intrinsic::x86_avx_vtestc_pd_256:
19188 IsTestPacked = true;
19190 case Intrinsic::x86_sse41_ptestc:
19191 case Intrinsic::x86_avx_ptestc_256:
19193 X86CC = X86::COND_B;
19195 case Intrinsic::x86_avx_vtestnzc_ps:
19196 case Intrinsic::x86_avx_vtestnzc_pd:
19197 case Intrinsic::x86_avx_vtestnzc_ps_256:
19198 case Intrinsic::x86_avx_vtestnzc_pd_256:
19199 IsTestPacked = true;
19201 case Intrinsic::x86_sse41_ptestnzc:
19202 case Intrinsic::x86_avx_ptestnzc_256:
19204 X86CC = X86::COND_A;
19208 SDValue LHS = Op.getOperand(1);
19209 SDValue RHS = Op.getOperand(2);
19210 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19211 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19212 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19213 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19215 case Intrinsic::x86_avx512_kortestz_w:
19216 case Intrinsic::x86_avx512_kortestc_w: {
19217 X86::CondCode X86CC =
19218 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19219 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19220 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19221 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19222 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19223 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19226 case Intrinsic::x86_sse42_pcmpistria128:
19227 case Intrinsic::x86_sse42_pcmpestria128:
19228 case Intrinsic::x86_sse42_pcmpistric128:
19229 case Intrinsic::x86_sse42_pcmpestric128:
19230 case Intrinsic::x86_sse42_pcmpistrio128:
19231 case Intrinsic::x86_sse42_pcmpestrio128:
19232 case Intrinsic::x86_sse42_pcmpistris128:
19233 case Intrinsic::x86_sse42_pcmpestris128:
19234 case Intrinsic::x86_sse42_pcmpistriz128:
19235 case Intrinsic::x86_sse42_pcmpestriz128: {
19237 X86::CondCode X86CC;
19239 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19240 case Intrinsic::x86_sse42_pcmpistria128:
19241 Opcode = X86ISD::PCMPISTRI;
19242 X86CC = X86::COND_A;
19244 case Intrinsic::x86_sse42_pcmpestria128:
19245 Opcode = X86ISD::PCMPESTRI;
19246 X86CC = X86::COND_A;
19248 case Intrinsic::x86_sse42_pcmpistric128:
19249 Opcode = X86ISD::PCMPISTRI;
19250 X86CC = X86::COND_B;
19252 case Intrinsic::x86_sse42_pcmpestric128:
19253 Opcode = X86ISD::PCMPESTRI;
19254 X86CC = X86::COND_B;
19256 case Intrinsic::x86_sse42_pcmpistrio128:
19257 Opcode = X86ISD::PCMPISTRI;
19258 X86CC = X86::COND_O;
19260 case Intrinsic::x86_sse42_pcmpestrio128:
19261 Opcode = X86ISD::PCMPESTRI;
19262 X86CC = X86::COND_O;
19264 case Intrinsic::x86_sse42_pcmpistris128:
19265 Opcode = X86ISD::PCMPISTRI;
19266 X86CC = X86::COND_S;
19268 case Intrinsic::x86_sse42_pcmpestris128:
19269 Opcode = X86ISD::PCMPESTRI;
19270 X86CC = X86::COND_S;
19272 case Intrinsic::x86_sse42_pcmpistriz128:
19273 Opcode = X86ISD::PCMPISTRI;
19274 X86CC = X86::COND_E;
19276 case Intrinsic::x86_sse42_pcmpestriz128:
19277 Opcode = X86ISD::PCMPESTRI;
19278 X86CC = X86::COND_E;
19281 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19282 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19283 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19284 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19285 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19288 case Intrinsic::x86_sse42_pcmpistri128:
19289 case Intrinsic::x86_sse42_pcmpestri128: {
19291 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19292 Opcode = X86ISD::PCMPISTRI;
19294 Opcode = X86ISD::PCMPESTRI;
19296 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19297 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19298 return DAG.getNode(Opcode, dl, VTs, NewOps);
19301 case Intrinsic::eh_sjlj_lsda: {
19302 MachineFunction &MF = DAG.getMachineFunction();
19303 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19304 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19305 auto &Context = MF.getMMI().getContext();
19306 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19307 Twine(MF.getFunctionNumber()));
19308 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19311 case Intrinsic::x86_seh_lsda: {
19312 // Compute the symbol for the LSDA. We know it'll get emitted later.
19313 MachineFunction &MF = DAG.getMachineFunction();
19314 SDValue Op1 = Op.getOperand(1);
19315 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19316 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19317 GlobalValue::getRealLinkageName(Fn->getName()));
19319 // Generate a simple absolute symbol reference. This intrinsic is only
19320 // supported on 32-bit Windows, which isn't PIC.
19321 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19322 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19325 case Intrinsic::x86_seh_recoverfp: {
19326 SDValue FnOp = Op.getOperand(1);
19327 SDValue IncomingFPOp = Op.getOperand(2);
19328 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19329 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19331 report_fatal_error(
19332 "llvm.x86.seh.recoverfp must take a function as the first argument");
19333 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19336 case Intrinsic::localaddress: {
19337 // Returns one of the stack, base, or frame pointer registers, depending on
19338 // which is used to reference local variables.
19339 MachineFunction &MF = DAG.getMachineFunction();
19340 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19342 if (RegInfo->hasBasePointer(MF))
19343 Reg = RegInfo->getBaseRegister();
19344 else // This function handles the SP or FP case.
19345 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19346 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
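/// Build the machine node for a masked gather intrinsic. The operands are laid
/// out as {Src, VMask, Base, Scale, Index, Disp, Segment, Chain}, and both the
/// gathered value and the output chain are returned.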
19351 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19352 SDValue Src, SDValue Mask, SDValue Base,
19353 SDValue Index, SDValue ScaleOp, SDValue Chain,
19354 const X86Subtarget &Subtarget) {
19356 auto *C = cast<ConstantSDNode>(ScaleOp);
19357 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19358 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19359 Index.getSimpleValueType().getVectorNumElements());
19361 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19362 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
19363 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19364 SDValue Segment = DAG.getRegister(0, MVT::i32);
19366 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
19367 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
19368 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19369 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
19370 return DAG.getMergeValues(RetOps, dl);
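/// Build the machine node for a masked scatter intrinsic. A scatter produces
/// no value, so only the output chain is returned.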
19373 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19374 SDValue Src, SDValue Mask, SDValue Base,
19375 SDValue Index, SDValue ScaleOp, SDValue Chain,
19376 const X86Subtarget &Subtarget) {
19378 auto *C = cast<ConstantSDNode>(ScaleOp);
19379 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19380 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19381 SDValue Segment = DAG.getRegister(0, MVT::i32);
19382 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19383 Index.getSimpleValueType().getVectorNumElements());
19385 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19386 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
19387 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
19388 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19389 return SDValue(Res, 1);
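/// Build the machine node for a masked gather/scatter prefetch intrinsic. The
/// node is chain-only; it produces no value.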
19392 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19393 SDValue Mask, SDValue Base, SDValue Index,
19394 SDValue ScaleOp, SDValue Chain,
19395 const X86Subtarget &Subtarget) {
19397 auto *C = cast<ConstantSDNode>(ScaleOp);
19398 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19399 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19400 SDValue Segment = DAG.getRegister(0, MVT::i32);
19402 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
19403 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19404 //SDVTList VTs = DAG.getVTList(MVT::Other);
19405 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
19406 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
19407 return SDValue(Res, 0);
19410 /// Handles the lowering of builtin intrinsics that return the value
19411 /// of an extended control register.
19412 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
19414 const X86Subtarget &Subtarget,
19415 SmallVectorImpl<SDValue> &Results) {
19416 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19417 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19420 // The ECX register is used to select the index of the XCR register to read.
19423 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
19424 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
19425 Chain = SDValue(N1, 0);
19427 // Reads the content of XCR and returns it in registers EDX:EAX.
19428 if (Subtarget.is64Bit()) {
19429 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
19430 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19433 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
19434 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19437 Chain = HI.getValue(1);
19439 if (Subtarget.is64Bit()) {
19440 // Merge the two 32-bit values into a 64-bit one.
19441 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19442 DAG.getConstant(32, DL, MVT::i8));
19443 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19444 Results.push_back(Chain);
19448 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19449 SDValue Ops[] = { LO, HI };
19450 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19451 Results.push_back(Pair);
19452 Results.push_back(Chain);
19455 /// Handles the lowering of builtin intrinsics that read performance monitor
19456 /// counters (x86_rdpmc).
19457 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
19459 const X86Subtarget &Subtarget,
19460 SmallVectorImpl<SDValue> &Results) {
19461 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19462 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19465 // The ECX register is used to select the index of the performance counter to read.
19467 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
19469 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
19471 // Reads the content of a 64-bit performance counter and returns it in the
19472 // registers EDX:EAX.
19473 if (Subtarget.is64Bit()) {
19474 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19475 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19478 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19479 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19482 Chain = HI.getValue(1);
19484 if (Subtarget.is64Bit()) {
19485 // The EAX register is loaded with the low-order 32 bits. The EDX register
19486 // is loaded with the supported high-order bits of the counter.
19487 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19488 DAG.getConstant(32, DL, MVT::i8));
19489 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19490 Results.push_back(Chain);
19494 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19495 SDValue Ops[] = { LO, HI };
19496 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19497 Results.push_back(Pair);
19498 Results.push_back(Chain);
19501 /// Handles the lowering of builtin intrinsics that read the time stamp counter
19502 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
19503 /// READCYCLECOUNTER nodes.
19504 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
19506 const X86Subtarget &Subtarget,
19507 SmallVectorImpl<SDValue> &Results) {
19508 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19509 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
19512 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
19513 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
19514 // and the EAX register is loaded with the low-order 32 bits.
19515 if (Subtarget.is64Bit()) {
19516 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19517 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19520 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19521 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19524 SDValue Chain = HI.getValue(1);
19526 if (Opcode == X86ISD::RDTSCP_DAG) {
19527 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19529 // The RDTSCP instruction loads the IA32_TSC_AUX MSR (address C000_0103H) into
19530 // the ECX register. Add 'ecx' explicitly to the chain.
19531 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
19533 // Explicitly store the content of ECX at the location passed as input
19534 // to the 'rdtscp' intrinsic.
19535 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
19536 MachinePointerInfo());
19539 if (Subtarget.is64Bit()) {
19540 // The EDX register is loaded with the high-order 32 bits of the MSR, and
19541 // the EAX register is loaded with the low-order 32 bits.
19542 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19543 DAG.getConstant(32, DL, MVT::i8));
19544 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19545 Results.push_back(Chain);
19549 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19550 SDValue Ops[] = { LO, HI };
19551 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19552 Results.push_back(Pair);
19553 Results.push_back(Chain);
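/// Custom lower READCYCLECOUNTER by emitting an RDTSC node and merging the
/// EDX:EAX result in getReadTimeStampCounter.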
19556 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
19557 SelectionDAG &DAG) {
19558 SmallVector<SDValue, 2> Results;
19560 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
19562 return DAG.getMergeValues(Results, DL);
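/// Lower llvm.x86.seh.ehregnode: record the frame index of the registration
/// node alloca in WinEHFuncInfo; no DAG nodes are emitted.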
19565 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
19566 MachineFunction &MF = DAG.getMachineFunction();
19567 SDValue Chain = Op.getOperand(0);
19568 SDValue RegNode = Op.getOperand(2);
19569 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19571 report_fatal_error("EH registrations only live in functions using WinEH");
19573 // Cast the operand to an alloca, and remember the frame index.
19574 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
19576 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
19577 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
19579 // Return the chain operand without making any DAG nodes.
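/// Lower llvm.x86.seh.ehguard: record the frame index of the EH guard slot in
/// WinEHFuncInfo; no DAG nodes are emitted.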
19583 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
19584 MachineFunction &MF = DAG.getMachineFunction();
19585 SDValue Chain = Op.getOperand(0);
19586 SDValue EHGuard = Op.getOperand(2);
19587 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19589 report_fatal_error("EHGuard only live in functions using WinEH");
19591 // Cast the operand to an alloca, and remember the frame index.
19592 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
19594 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
19595 EHInfo->EHGuardFrameIndex = FINode->getIndex();
19597 // Return the chain operand without making any DAG nodes.
19601 /// Emit Truncating Store with signed or unsigned saturation.
19603 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
19604 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
19605 SelectionDAG &DAG) {
19607 SDVTList VTs = DAG.getVTList(MVT::Other);
19608 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
19609 SDValue Ops[] = { Chain, Val, Ptr, Undef };
19611 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19612 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19615 /// Emit Masked Truncating Store with signed or unsigned saturation.
19617 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
19618 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
19619 MachineMemOperand *MMO, SelectionDAG &DAG) {
19621 SDVTList VTs = DAG.getVTList(MVT::Other);
19622 SDValue Ops[] = { Chain, Ptr, Mask, Val };
19624 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19625 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
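/// Lower ISD::INTRINSIC_W_CHAIN. A few SEH and EFLAGS intrinsics are
/// special-cased first; the rest are dispatched through the IntrinsicsInfo
/// table on IntrData->Type.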
19628 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19629 SelectionDAG &DAG) {
19630 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
19632 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
19634 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
19635 return MarkEHRegistrationNode(Op, DAG);
19636 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
19637 return MarkEHGuard(Op, DAG);
19638 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
19639 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
19640 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
19641 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
19642 // We need a frame pointer because this will get lowered to a PUSH/POP sequence.
19644 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19645 MFI.setHasCopyImplyingStackAdjustment(true);
19646 // Don't do anything here; we will expand these intrinsics out later
19647 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
19654 switch(IntrData->Type) {
19655 default: llvm_unreachable("Unknown Intrinsic Type");
19658 // Emit the node with the right value type.
19659 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
19660 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19662 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
19663 // Otherwise return the value from Rand, which is always 0, cast to i32.
19664 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
19665 DAG.getConstant(1, dl, Op->getValueType(1)),
19666 DAG.getConstant(X86::COND_B, dl, MVT::i32),
19667 SDValue(Result.getNode(), 1) };
19668 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
19669 DAG.getVTList(Op->getValueType(1), MVT::Glue),
19672 // Return { result, isValid, chain }.
19673 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
19674 SDValue(Result.getNode(), 2));
19677 // gather(v1, mask, index, base, scale);
19678 SDValue Chain = Op.getOperand(0);
19679 SDValue Src = Op.getOperand(2);
19680 SDValue Base = Op.getOperand(3);
19681 SDValue Index = Op.getOperand(4);
19682 SDValue Mask = Op.getOperand(5);
19683 SDValue Scale = Op.getOperand(6);
19684 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
19688 // scatter(base, mask, index, v1, scale);
19689 SDValue Chain = Op.getOperand(0);
19690 SDValue Base = Op.getOperand(2);
19691 SDValue Mask = Op.getOperand(3);
19692 SDValue Index = Op.getOperand(4);
19693 SDValue Src = Op.getOperand(5);
19694 SDValue Scale = Op.getOperand(6);
19695 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
19696 Scale, Chain, Subtarget);
19699 SDValue Hint = Op.getOperand(6);
19700 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
19701 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
19702 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
19703 SDValue Chain = Op.getOperand(0);
19704 SDValue Mask = Op.getOperand(2);
19705 SDValue Index = Op.getOperand(3);
19706 SDValue Base = Op.getOperand(4);
19707 SDValue Scale = Op.getOperand(5);
19708 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
19711 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
19713 SmallVector<SDValue, 2> Results;
19714 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
19716 return DAG.getMergeValues(Results, dl);
19718 // Read Performance Monitoring Counters.
19720 SmallVector<SDValue, 2> Results;
19721 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
19722 return DAG.getMergeValues(Results, dl);
19724 // Get Extended Control Register.
19726 SmallVector<SDValue, 2> Results;
19727 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
19728 return DAG.getMergeValues(Results, dl);
19730 // XTEST intrinsics.
19732 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19733 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19735 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
19736 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
19737 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
19738 Ret, SDValue(InTrans.getNode(), 1));
19742 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19743 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
19744 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
19745 DAG.getConstant(-1, dl, MVT::i8));
19746 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
19747 Op.getOperand(4), GenCF.getValue(1));
19748 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
19749 Op.getOperand(5), MachinePointerInfo());
19750 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
19751 SDValue Results[] = { SetCC, Store };
19752 return DAG.getMergeValues(Results, dl);
19754 case COMPRESS_TO_MEM: {
19755 SDValue Mask = Op.getOperand(4);
19756 SDValue DataToCompress = Op.getOperand(3);
19757 SDValue Addr = Op.getOperand(2);
19758 SDValue Chain = Op.getOperand(0);
19759 MVT VT = DataToCompress.getSimpleValueType();
19761 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19762 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19764 if (isAllOnesConstant(Mask)) // return just a store
19765 return DAG.getStore(Chain, dl, DataToCompress, Addr,
19766 MemIntr->getMemOperand());
19768 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19769 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19771 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
19772 MemIntr->getMemOperand(),
19773 false /* truncating */, true /* compressing */);
19775 case TRUNCATE_TO_MEM_VI8:
19776 case TRUNCATE_TO_MEM_VI16:
19777 case TRUNCATE_TO_MEM_VI32: {
19778 SDValue Mask = Op.getOperand(4);
19779 SDValue DataToTruncate = Op.getOperand(3);
19780 SDValue Addr = Op.getOperand(2);
19781 SDValue Chain = Op.getOperand(0);
19783 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19784 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19786 EVT MemVT = MemIntr->getMemoryVT();
19788 uint16_t TruncationOp = IntrData->Opc0;
19789 switch (TruncationOp) {
19790 case X86ISD::VTRUNC: {
19791 if (isAllOnesConstant(Mask)) // return just a truncate store
19792 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
19793 MemIntr->getMemOperand());
19795 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19796 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19798 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
19799 MemIntr->getMemOperand(), true /* truncating */);
19801 case X86ISD::VTRUNCUS:
19802 case X86ISD::VTRUNCS: {
19803 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
19804 if (isAllOnesConstant(Mask))
19805 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
19806 MemIntr->getMemOperand(), DAG);
19808 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19809 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19811 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
19812 VMask, MemVT, MemIntr->getMemOperand(), DAG);
19815 llvm_unreachable("Unsupported truncstore intrinsic");
19819 case EXPAND_FROM_MEM: {
19820 SDValue Mask = Op.getOperand(4);
19821 SDValue PassThru = Op.getOperand(3);
19822 SDValue Addr = Op.getOperand(2);
19823 SDValue Chain = Op.getOperand(0);
19824 MVT VT = Op.getSimpleValueType();
19826 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19827 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19829 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
19830 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
19831 if (X86::isZeroNode(Mask))
19832 return DAG.getUNDEF(VT);
19834 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19835 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19836 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
19837 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
19838 true /* expanding */);
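/// Lower RETURNADDR. The depth operand must be a constant; depth 0 loads the
/// return address slot directly, while deeper frames are reached through the
/// saved frame pointer chain.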
19843 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
19844 SelectionDAG &DAG) const {
19845 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19846 MFI.setReturnAddressIsTaken(true);
19848 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
19851 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19853 EVT PtrVT = getPointerTy(DAG.getDataLayout());
19856 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
19857 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19858 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
19859 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
19860 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
19861 MachinePointerInfo());
19864 // Just load the return address.
19865 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
19866 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
19867 MachinePointerInfo());
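/// Lower ADDROFRETURNADDR by returning the frame index of the return address
/// slot.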
19870 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
19871 SelectionDAG &DAG) const {
19872 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
19873 return getReturnAddressFrameIndex(DAG);
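/// Lower FRAMEADDR. On targets using Windows CFI a fixed frame object is
/// returned instead of walking the frame chain; otherwise the frame pointer is
/// copied out of EBP/RBP and loaded through once per requested depth.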
19876 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
19877 MachineFunction &MF = DAG.getMachineFunction();
19878 MachineFrameInfo &MFI = MF.getFrameInfo();
19879 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19880 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19881 EVT VT = Op.getValueType();
19883 MFI.setFrameAddressIsTaken(true);
19885 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
19886 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
19887 // is not possible to crawl up the stack without looking at the unwind codes
19888 // simultaneously.
19889 int FrameAddrIndex = FuncInfo->getFAIndex();
19890 if (!FrameAddrIndex) {
19891 // Set up a frame object for the return address.
19892 unsigned SlotSize = RegInfo->getSlotSize();
19893 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
19894 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
19895 FuncInfo->setFAIndex(FrameAddrIndex);
19897 return DAG.getFrameIndex(FrameAddrIndex, VT);
19900 unsigned FrameReg =
19901 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19902 SDLoc dl(Op); // FIXME probably not meaningful
19903 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19904 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
19905 (FrameReg == X86::EBP && VT == MVT::i32)) &&
19906 "Invalid Frame Register!");
19907 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
19908 while (Depth--)
19909 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
19910 MachinePointerInfo());
19911 return FrameAddr;
19914 // FIXME? Maybe this could be a TableGen attribute on some registers and
19915 // this table could be generated automatically from RegInfo.
19916 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
19917 SelectionDAG &DAG) const {
19918 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19919 const MachineFunction &MF = DAG.getMachineFunction();
19921 unsigned Reg = StringSwitch<unsigned>(RegName)
19922 .Case("esp", X86::ESP)
19923 .Case("rsp", X86::RSP)
19924 .Case("ebp", X86::EBP)
19925 .Case("rbp", X86::RBP)
19926 .Default(0);
19928 if (Reg == X86::EBP || Reg == X86::RBP) {
19929 if (!TFI.hasFP(MF))
19930 report_fatal_error("register " + StringRef(RegName) +
19931 " is allocatable: function has no frame pointer");
19934 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19935 unsigned FrameReg =
19936 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19937 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
19938 "Invalid Frame Register!");
19943 if (Reg)
19944 return Reg;
19946 report_fatal_error("Invalid register name global variable");
19949 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
19950 SelectionDAG &DAG) const {
19951 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19952 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
19955 unsigned X86TargetLowering::getExceptionPointerRegister(
19956 const Constant *PersonalityFn) const {
19957 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
19958 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
19960 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
19963 unsigned X86TargetLowering::getExceptionSelectorRegister(
19964 const Constant *PersonalityFn) const {
19965 // Funclet personalities don't use selectors (the runtime does the selection).
19966 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
19967 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
19970 bool X86TargetLowering::needsFixedCatchObjects() const {
19971 return Subtarget.isTargetWin64();
19974 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
19975 SDValue Chain = Op.getOperand(0);
19976 SDValue Offset = Op.getOperand(1);
19977 SDValue Handler = Op.getOperand(2);
19978 SDLoc dl(Op);
19980 EVT PtrVT = getPointerTy(DAG.getDataLayout());
19981 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19982 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
19983 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
19984 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
19985 "Invalid Frame Register!");
19986 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
19987 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
19989 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
19990 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
19991 dl));
19992 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
19993 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
19994 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
19996 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
19997 DAG.getRegister(StoreAddrReg, PtrVT));
20000 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20001 SelectionDAG &DAG) const {
20002 SDLoc DL(Op);
20003 // If the subtarget is not 64-bit, we may need the global base reg
20004 // after isel expand pseudo, i.e., after CGBR pass ran.
20005 // Therefore, ask for the GlobalBaseReg now, so that the pass
20006 // inserts the code for us in case we need it.
20007 // Otherwise, we will end up in a situation where we will
20008 // reference a virtual register that is not defined!
20009 if (!Subtarget.is64Bit()) {
20010 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20011 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20013 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20014 DAG.getVTList(MVT::i32, MVT::Other),
20015 Op.getOperand(0), Op.getOperand(1));
20018 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20019 SelectionDAG &DAG) const {
20020 SDLoc DL(Op);
20021 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20022 Op.getOperand(0), Op.getOperand(1));
20025 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20026 SelectionDAG &DAG) const {
20027 SDLoc DL(Op);
20028 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20029 Op.getOperand(0));
20032 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20033 return Op.getOperand(0);
20036 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20037 SelectionDAG &DAG) const {
20038 SDValue Root = Op.getOperand(0);
20039 SDValue Trmp = Op.getOperand(1); // trampoline
20040 SDValue FPtr = Op.getOperand(2); // nested function
20041 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20043 SDLoc dl(Op);
20044 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20045 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20047 if (Subtarget.is64Bit()) {
20048 SDValue OutChains[6];
20050 // Large code-model.
20051 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20052 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20054 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20055 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20057 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
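// Roughly, the trampoline written below has this layout (byte offsets into
// Trmp on the left; the two movabsq opcodes are stored as i16 values, so
// their REX prefix ends up as the first byte):
//   0:  49 BB <FPtr, 8 bytes>    movabsq $FPtr, %r11
//   10: 49 BA <Nest, 8 bytes>    movabsq $Nest, %r10
//   20: 49 FF E3                 jmpq   *%r11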
20059 // Load the pointer to the nested function into R11.
20060 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20061 SDValue Addr = Trmp;
20062 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20063 Addr, MachinePointerInfo(TrmpAddr));
20065 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20066 DAG.getConstant(2, dl, MVT::i64));
20067 OutChains[1] =
20068 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20069 /* Alignment = */ 2);
20071 // Load the 'nest' parameter value into R10.
20072 // R10 is specified in X86CallingConv.td
20073 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20074 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20075 DAG.getConstant(10, dl, MVT::i64));
20076 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20077 Addr, MachinePointerInfo(TrmpAddr, 10));
20079 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20080 DAG.getConstant(12, dl, MVT::i64));
20081 OutChains[3] =
20082 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20083 /* Alignment = */ 2);
20085 // Jump to the nested function.
20086 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20087 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20088 DAG.getConstant(20, dl, MVT::i64));
20089 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20090 Addr, MachinePointerInfo(TrmpAddr, 20));
20092 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20093 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20094 DAG.getConstant(22, dl, MVT::i64));
20095 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20096 Addr, MachinePointerInfo(TrmpAddr, 22));
20098 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20100 const Function *Func =
20101 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20102 CallingConv::ID CC = Func->getCallingConv();
20103 unsigned NestReg;
20105 switch (CC) {
20106 default:
20107 llvm_unreachable("Unsupported calling convention");
20108 case CallingConv::C:
20109 case CallingConv::X86_StdCall: {
20110 // Pass 'nest' parameter in ECX.
20111 // Must be kept in sync with X86CallingConv.td
20112 NestReg = X86::ECX;
20114 // Check that ECX wasn't needed by an 'inreg' parameter.
20115 FunctionType *FTy = Func->getFunctionType();
20116 const AttributeSet &Attrs = Func->getAttributes();
20118 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20119 unsigned InRegCount = 0;
20121 unsigned Idx = 1;
20122 for (FunctionType::param_iterator I = FTy->param_begin(),
20123 E = FTy->param_end(); I != E; ++I, ++Idx)
20124 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20125 auto &DL = DAG.getDataLayout();
20126 // FIXME: should only count parameters that are lowered to integers.
20127 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20130 if (InRegCount > 2) {
20131 report_fatal_error("Nest register in use - reduce number of inreg"
20132 " parameters!");
20137 case CallingConv::X86_FastCall:
20138 case CallingConv::X86_ThisCall:
20139 case CallingConv::Fast:
20140 // Pass 'nest' parameter in EAX.
20141 // Must be kept in sync with X86CallingConv.td
20142 NestReg = X86::EAX;
20146 SDValue OutChains[4];
20147 SDValue Addr, Disp;
20149 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20150 DAG.getConstant(10, dl, MVT::i32));
20151 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20153 // This is storing the opcode for MOV32ri.
20154 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20155 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20156 OutChains[0] =
20157 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20158 Trmp, MachinePointerInfo(TrmpAddr));
20160 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20161 DAG.getConstant(1, dl, MVT::i32));
20162 OutChains[1] =
20163 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20164 /* Alignment = */ 1);
20166 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20167 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20168 DAG.getConstant(5, dl, MVT::i32));
20169 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20170 Addr, MachinePointerInfo(TrmpAddr, 5),
20171 /* Alignment = */ 1);
20173 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20174 DAG.getConstant(6, dl, MVT::i32));
20175 OutChains[3] =
20176 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20177 /* Alignment = */ 1);
20179 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20183 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20184 SelectionDAG &DAG) const {
20185 /*
20186 The rounding mode is in bits 11:10 of FPSR, and has the following
20187 settings:
20188 00 Round to nearest
20189 01 Round to -inf
20190 10 Round to +inf
20191 11 Round to 0
20193 FLT_ROUNDS, on the other hand, expects the following:
20194 -1 Undefined
20195 0 Round to 0
20196 1 Round to nearest
20197 2 Round to +inf
20198 3 Round to -inf
20200 To perform the conversion, we do:
20201 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20202 */
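// For reference, plugging the four rounding-control settings into that
// expression gives:
//   RC = 00 (nearest)  -> ((0|0)+1)&3 = 1
//   RC = 01 (-inf)     -> ((0|2)+1)&3 = 3
//   RC = 10 (+inf)     -> ((1|0)+1)&3 = 2
//   RC = 11 (to zero)  -> ((1|2)+1)&3 = 0
// which matches the FLT_ROUNDS encoding listed above.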
20204 MachineFunction &MF = DAG.getMachineFunction();
20205 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20206 unsigned StackAlignment = TFI.getStackAlignment();
20207 MVT VT = Op.getSimpleValueType();
20209 SDLoc DL(Op);
20210 // Save FP Control Word to stack slot
20211 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20212 SDValue StackSlot =
20213 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20215 MachineMemOperand *MMO =
20216 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20217 MachineMemOperand::MOStore, 2, 2);
20219 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20220 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20221 DAG.getVTList(MVT::Other),
20222 Ops, MVT::i16, MMO);
20224 // Load FP Control Word from stack slot
20225 SDValue CWD =
20226 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20228 // Transform as necessary
20229 SDValue CWD1 =
20230 DAG.getNode(ISD::SRL, DL, MVT::i16,
20231 DAG.getNode(ISD::AND, DL, MVT::i16,
20232 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20233 DAG.getConstant(11, DL, MVT::i8));
20234 SDValue CWD2 =
20235 DAG.getNode(ISD::SRL, DL, MVT::i16,
20236 DAG.getNode(ISD::AND, DL, MVT::i16,
20237 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20238 DAG.getConstant(9, DL, MVT::i8));
20240 SDValue RetVal =
20241 DAG.getNode(ISD::AND, DL, MVT::i16,
20242 DAG.getNode(ISD::ADD, DL, MVT::i16,
20243 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20244 DAG.getConstant(1, DL, MVT::i16)),
20245 DAG.getConstant(3, DL, MVT::i16));
20247 return DAG.getNode((VT.getSizeInBits() < 16 ?
20248 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20251 /// \brief Lower a vector CTLZ using a natively supported vector CTLZ instruction.
20253 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded
20254 // to a 512-bit vector.
20255 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
20256 // ( sub(trunc(lzcnt(zext32(x))), 32 - bitwidth) ). In case zext32(x) is
20257 // illegal, split the vector, perform the operation on its Lo and Hi parts and
20258 // concatenate the results.
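// For example, for an i8 element x = 0x10: zext32(x) = 0x00000010,
// lzcnt32 = 27, and 27 - (32 - 8) = 3, which is indeed ctlz(0x10) on i8.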
20259 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
20260 assert(Op.getOpcode() == ISD::CTLZ);
20261 SDLoc dl(Op);
20262 MVT VT = Op.getSimpleValueType();
20263 MVT EltVT = VT.getVectorElementType();
20264 unsigned NumElems = VT.getVectorNumElements();
20266 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
20267 // Extend to 512 bit vector.
20268 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20269 "Unsupported value type for operation");
20271 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
20272 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
20273 DAG.getUNDEF(NewVT),
20274 Op.getOperand(0),
20275 DAG.getIntPtrConstant(0, dl));
20276 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
20278 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
20279 DAG.getIntPtrConstant(0, dl));
20282 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20283 "Unsupported element type");
20285 if (16 < NumElems) {
20286 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
20287 SDValue Lo, Hi;
20288 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
20289 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
20291 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
20292 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
20294 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
20297 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20299 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20300 "Unsupported value type for operation");
20302 // Use native supported vector instruction vplzcntd.
20303 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20304 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20305 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
20306 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
20308 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20311 // Lower CTLZ using a PSHUFB lookup table implementation.
20312 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20313 const X86Subtarget &Subtarget,
20314 SelectionDAG &DAG) {
20315 MVT VT = Op.getSimpleValueType();
20316 int NumElts = VT.getVectorNumElements();
20317 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20318 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20320 // Per-nibble leading zero PSHUFB lookup table.
20321 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20322 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20323 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
20324 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
20326 SmallVector<SDValue, 64> LUTVec;
20327 for (int i = 0; i < NumBytes; ++i)
20328 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20329 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
20331 // Begin by bitcasting the input to a byte vector, then split those bytes
20332 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
20333 // If the hi input nibble is zero then we add both results together, otherwise
20334 // we just take the hi result (by masking the lo result to zero before the
20335 // add).
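// As a scalar sketch of the same idea (illustrative only, not part of the
// lowering; the helper name is made up):
//   static unsigned CTLZByteLUT(uint8_t X) {
//     static const unsigned LUT[16] = {4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0};
//     unsigned Hi = X >> 4, Lo = X & 0xF;
//     return Hi ? LUT[Hi] : LUT[Hi] + LUT[Lo]; // LUT[0] == 4 covers the hi nibble.
//   }
// e.g. CTLZByteLUT(0x1C) == 3 and CTLZByteLUT(0x05) == 5.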
20336 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
20337 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
20339 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
20340 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
20341 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
20342 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
20343 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
20345 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
20346 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
20347 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
20348 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
20350 // Merge result back from vXi8 back to VT, working on the lo/hi halves
20351 // of the current vector width in the same way we did for the nibbles.
20352 // If the upper half of the input element is zero then add the halves'
20353 // leading zero counts together, otherwise just use the upper half's.
20354 // Double the width of the result until we are at target width.
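// For instance, when merging two i8 counts into one i16 count: if the upper
// byte of the element is zero the result is 8 + ctlz8(lower byte), otherwise
// it is just ctlz8(upper byte), which is what the mask-and-add below computes.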
20355 while (CurrVT != VT) {
20356 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
20357 int CurrNumElts = CurrVT.getVectorNumElements();
20358 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
20359 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
20360 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
20362 // Check if the upper half of the input element is zero.
20363 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
20364 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
20365 HiZ = DAG.getBitcast(NextVT, HiZ);
20367 // Move the upper/lower halves to the lower bits as we'll be extending to
20368 // NextVT. Mask the lower result to zero if HiZ is true and add the results
20369 // together.
20370 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
20371 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
20372 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
20373 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
20374 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
20375 CurrVT = NextVT;
20378 return Res;
20381 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
20382 const X86Subtarget &Subtarget,
20383 SelectionDAG &DAG) {
20384 MVT VT = Op.getSimpleValueType();
20385 SDValue Op0 = Op.getOperand(0);
20387 if (Subtarget.hasAVX512())
20388 return LowerVectorCTLZ_AVX512(Op, DAG);
20390 // Decompose 256-bit ops into smaller 128-bit ops.
20391 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20392 unsigned NumElems = VT.getVectorNumElements();
20394 // Extract each 128-bit vector, perform ctlz and concat the result.
20395 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
20396 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
20398 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20399 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
20400 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
20403 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
20404 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
20407 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
20408 SelectionDAG &DAG) {
20409 MVT VT = Op.getSimpleValueType();
20410 MVT OpVT = VT;
20411 unsigned NumBits = VT.getSizeInBits();
20412 SDLoc dl(Op);
20413 unsigned Opc = Op.getOpcode();
20415 if (VT.isVector())
20416 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
20418 Op = Op.getOperand(0);
20419 if (VT == MVT::i8) {
20420 // Zero extend to i32 since there is not an i8 bsr.
20421 OpVT = MVT::i32;
20422 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
20425 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
20426 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
20427 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
20429 if (Opc == ISD::CTLZ) {
20430 // If src is zero (i.e. bsr sets ZF), returns NumBits.
20431 SDValue Ops[] = {
20432 Op,
20433 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
20434 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20435 Op.getValue(1)
20436 };
20437 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
20440 // Finally xor with NumBits-1.
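// BSR returns the index of the highest set bit, so CTLZ is (NumBits - 1) -
// index; because the index is always < NumBits, that subtraction can be done
// with a single XOR against NumBits - 1. The zero case above picks
// 2*NumBits - 1 precisely so that the same XOR yields NumBits.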
20441 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
20442 DAG.getConstant(NumBits - 1, dl, OpVT));
20445 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
20449 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
20450 MVT VT = Op.getSimpleValueType();
20451 unsigned NumBits = VT.getScalarSizeInBits();
20453 SDLoc dl(Op);
20454 if (VT.isVector()) {
20455 SDValue N0 = Op.getOperand(0);
20456 SDValue Zero = DAG.getConstant(0, dl, VT);
20458 // lsb(x) = (x & -x)
20459 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
20460 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
20462 // cttz_undef(x) = (width - 1) - ctlz(lsb)
20463 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
20464 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
20465 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
20466 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
20469 // cttz(x) = ctpop(lsb - 1)
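// (lsb - 1) has exactly cttz(x) low bits set, so its population count is the
// trailing zero count; for x == 0, lsb is 0 and lsb - 1 is all ones, giving
// the full bit width as required.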
20470 SDValue One = DAG.getConstant(1, dl, VT);
20471 return DAG.getNode(ISD::CTPOP, dl, VT,
20472 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
20475 assert(Op.getOpcode() == ISD::CTTZ &&
20476 "Only scalar CTTZ requires custom lowering");
20478 // Issue a bsf (scan bits forward) which also sets EFLAGS.
20479 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
20480 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
20482 // If src is zero (i.e. bsf sets ZF), returns NumBits.
20483 SDValue Ops[] = {
20484 Op,
20485 DAG.getConstant(NumBits, dl, VT),
20486 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20487 Op.getValue(1)
20488 };
20489 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
20492 /// Break a 256-bit integer operation into two new 128-bit ones and then
20493 /// concatenate the result back.
20494 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
20495 MVT VT = Op.getSimpleValueType();
20497 assert(VT.is256BitVector() && VT.isInteger() &&
20498 "Unsupported value type for operation");
20500 unsigned NumElems = VT.getVectorNumElements();
20503 // Extract the LHS vectors
20504 SDValue LHS = Op.getOperand(0);
20505 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20506 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20508 // Extract the RHS vectors
20509 SDValue RHS = Op.getOperand(1);
20510 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20511 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
20513 MVT EltVT = VT.getVectorElementType();
20514 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20516 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20517 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20518 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20521 /// Break a 512-bit integer operation into two new 256-bit ones and then
20522 /// concatenate the result back.
20523 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
20524 MVT VT = Op.getSimpleValueType();
20526 assert(VT.is512BitVector() && VT.isInteger() &&
20527 "Unsupported value type for operation");
20529 unsigned NumElems = VT.getVectorNumElements();
20532 // Extract the LHS vectors
20533 SDValue LHS = Op.getOperand(0);
20534 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
20535 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
20537 // Extract the RHS vectors
20538 SDValue RHS = Op.getOperand(1);
20539 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
20540 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
20542 MVT EltVT = VT.getVectorElementType();
20543 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20545 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20546 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20547 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20550 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
20551 if (Op.getValueType() == MVT::i1)
20552 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20553 Op.getOperand(0), Op.getOperand(1));
20554 assert(Op.getSimpleValueType().is256BitVector() &&
20555 Op.getSimpleValueType().isInteger() &&
20556 "Only handle AVX 256-bit vector integer operation");
20557 return Lower256IntArith(Op, DAG);
20560 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
20561 if (Op.getValueType() == MVT::i1)
20562 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20563 Op.getOperand(0), Op.getOperand(1));
20564 assert(Op.getSimpleValueType().is256BitVector() &&
20565 Op.getSimpleValueType().isInteger() &&
20566 "Only handle AVX 256-bit vector integer operation");
20567 return Lower256IntArith(Op, DAG);
20570 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
20571 assert(Op.getSimpleValueType().is256BitVector() &&
20572 Op.getSimpleValueType().isInteger() &&
20573 "Only handle AVX 256-bit vector integer operation");
20574 return Lower256IntArith(Op, DAG);
20577 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
20578 SelectionDAG &DAG) {
20579 SDLoc dl(Op);
20580 MVT VT = Op.getSimpleValueType();
20582 if (VT == MVT::i1)
20583 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
20585 // Decompose 256-bit ops into smaller 128-bit ops.
20586 if (VT.is256BitVector() && !Subtarget.hasInt256())
20587 return Lower256IntArith(Op, DAG);
20589 SDValue A = Op.getOperand(0);
20590 SDValue B = Op.getOperand(1);
20592 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
20593 // vector pairs, multiply and truncate.
20594 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
20595 if (Subtarget.hasInt256()) {
20596 // For 512-bit vectors, split into 256-bit vectors to allow the
20597 // sign-extension to occur.
20598 if (VT == MVT::v64i8)
20599 return Lower512IntArith(Op, DAG);
20601 // For 256-bit vectors, split into 128-bit vectors to allow the
20602 // sign-extension to occur. We don't need this on AVX512BW as we can
20603 // safely sign-extend to v32i16.
20604 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
20605 return Lower256IntArith(Op, DAG);
20607 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
20608 return DAG.getNode(
20609 ISD::TRUNCATE, dl, VT,
20610 DAG.getNode(ISD::MUL, dl, ExVT,
20611 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
20612 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
20615 assert(VT == MVT::v16i8 &&
20616 "Pre-AVX2 support only supports v16i8 multiplication");
20617 MVT ExVT = MVT::v8i16;
20619 // Extract the lo parts and sign extend to i16
20620 SDValue ALo, BLo;
20621 if (Subtarget.hasSSE41()) {
20622 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
20623 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
20625 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20626 -1, 4, -1, 5, -1, 6, -1, 7};
20627 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20628 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20629 ALo = DAG.getBitcast(ExVT, ALo);
20630 BLo = DAG.getBitcast(ExVT, BLo);
20631 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20632 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20635 // Extract the hi parts and sign extend to i16
20636 SDValue AHi, BHi;
20637 if (Subtarget.hasSSE41()) {
20638 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20639 -1, -1, -1, -1, -1, -1, -1, -1};
20640 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20641 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20642 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
20643 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
20645 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20646 -1, 12, -1, 13, -1, 14, -1, 15};
20647 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20648 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20649 AHi = DAG.getBitcast(ExVT, AHi);
20650 BHi = DAG.getBitcast(ExVT, BHi);
20651 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20652 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20655 // Multiply, mask the lower 8 bits of the lo/hi results and pack.
20656 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20657 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20658 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
20659 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
20660 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20663 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
20664 if (VT == MVT::v4i32) {
20665 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
20666 "Should not custom lower when pmuldq is available!");
20668 // Extract the odd parts.
20669 static const int UnpackMask[] = { 1, -1, 3, -1 };
20670 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
20671 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
20673 // Multiply the even parts.
20674 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
20675 // Now multiply odd parts.
20676 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
20678 Evens = DAG.getBitcast(VT, Evens);
20679 Odds = DAG.getBitcast(VT, Odds);
20681 // Merge the two vectors back together with a shuffle. This expands into 2
20682 // instructions during lowering.
20683 static const int ShufMask[] = { 0, 4, 2, 6 };
20684 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
20687 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
20688 "Only know how to lower V2I64/V4I64/V8I64 multiply");
20690 // 32-bit vector types used for MULDQ/MULUDQ.
20691 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20693 // MULDQ returns the 64-bit result of the signed multiplication of the lower
20694 // 32-bits. We can lower with this if the sign bits stretch that far.
20695 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
20696 DAG.ComputeNumSignBits(B) > 32) {
20697 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
20698 DAG.getBitcast(MulVT, B));
20701 // Ahi = psrlqi(a, 32);
20702 // Bhi = psrlqi(b, 32);
20704 // AloBlo = pmuludq(a, b);
20705 // AloBhi = pmuludq(a, Bhi);
20706 // AhiBlo = pmuludq(Ahi, b);
20708 // Hi = psllqi(AloBhi + AhiBlo, 32);
20709 // return AloBlo + Hi;
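// This follows from the scalar identity (mod 2^64):
//   a * b = (Alo + (Ahi << 32)) * (Blo + (Bhi << 32))
//         = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32),
// the Ahi*Bhi term being shifted entirely out of the low 64 bits.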
20710 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
20711 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
20712 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
20714 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
20715 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
20716 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
20718 // Bit cast to 32-bit vectors for MULUDQ.
20719 SDValue Alo = DAG.getBitcast(MulVT, A);
20720 SDValue Blo = DAG.getBitcast(MulVT, B);
20722 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20724 // Only multiply lo/hi halves that aren't known to be zero.
20725 SDValue AloBlo = Zero;
20726 if (!ALoIsZero && !BLoIsZero)
20727 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
20729 SDValue AloBhi = Zero;
20730 if (!ALoIsZero && !BHiIsZero) {
20731 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
20732 Bhi = DAG.getBitcast(MulVT, Bhi);
20733 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
20736 SDValue AhiBlo = Zero;
20737 if (!AHiIsZero && !BLoIsZero) {
20738 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
20739 Ahi = DAG.getBitcast(MulVT, Ahi);
20740 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
20743 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
20744 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
20746 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
20749 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
20750 SelectionDAG &DAG) {
20752 MVT VT = Op.getSimpleValueType();
20754 // Decompose 256-bit ops into smaller 128-bit ops.
20755 if (VT.is256BitVector() && !Subtarget.hasInt256())
20756 return Lower256IntArith(Op, DAG);
20758 // Only i8 vectors should need custom lowering after this.
20759 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
20760 "Unsupported vector type");
20762 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
20763 // logical shift down the upper half and pack back to i8.
20764 SDValue A = Op.getOperand(0);
20765 SDValue B = Op.getOperand(1);
20767 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
20768 // and then ashr/lshr the upper bits down to the lower bits before multiply.
20769 unsigned Opcode = Op.getOpcode();
20770 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
20771 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
20773 // AVX2 implementations - extend xmm subvectors to ymm.
20774 if (Subtarget.hasInt256()) {
20775 SDValue Lo = DAG.getIntPtrConstant(0, dl);
20776 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
20778 if (VT == MVT::v32i8) {
20779 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
20780 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
20781 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
20782 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
20783 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
20784 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
20785 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
20786 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
20787 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20788 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
20789 DAG.getConstant(8, dl, MVT::v16i16));
20790 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20791 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
20792 DAG.getConstant(8, dl, MVT::v16i16));
20793 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
20794 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
20795 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
20796 16, 17, 18, 19, 20, 21, 22, 23};
20797 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20798 24, 25, 26, 27, 28, 29, 30, 31};
20799 return DAG.getNode(X86ISD::PACKUS, dl, VT,
20800 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
20801 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
20804 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
20805 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
20806 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
20807 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
20808 DAG.getConstant(8, dl, MVT::v16i16));
20809 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
20810 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
20811 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20814 assert(VT == MVT::v16i8 &&
20815 "Pre-AVX2 support only supports v16i8 multiplication");
20816 MVT ExVT = MVT::v8i16;
20818 // Extract the lo parts and zero/sign extend to i16.
20819 SDValue ALo, BLo;
20820 if (Subtarget.hasSSE41()) {
20821 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
20822 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
20824 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20825 -1, 4, -1, 5, -1, 6, -1, 7};
20826 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20827 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20828 ALo = DAG.getBitcast(ExVT, ALo);
20829 BLo = DAG.getBitcast(ExVT, BLo);
20830 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20831 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20834 // Extract the hi parts and zero/sign extend to i16.
20835 SDValue AHi, BHi;
20836 if (Subtarget.hasSSE41()) {
20837 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20838 -1, -1, -1, -1, -1, -1, -1, -1};
20839 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20840 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20841 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
20842 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
20844 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20845 -1, 12, -1, 13, -1, 14, -1, 15};
20846 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20847 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20848 AHi = DAG.getBitcast(ExVT, AHi);
20849 BHi = DAG.getBitcast(ExVT, BHi);
20850 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20851 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20854 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results
20855 // and pack back to v16i8.
20856 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20857 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20858 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
20859 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
20860 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20863 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
20864 assert(Subtarget.isTargetWin64() && "Unexpected target");
20865 EVT VT = Op.getValueType();
20866 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
20867 "Unexpected return type for lowering");
20869 RTLIB::Libcall LC;
20870 bool isSigned;
20871 switch (Op->getOpcode()) {
20872 default: llvm_unreachable("Unexpected request for libcall!");
20873 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
20874 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
20875 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
20876 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
20877 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
20878 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
20881 SDLoc dl(Op);
20882 SDValue InChain = DAG.getEntryNode();
20884 TargetLowering::ArgListTy Args;
20885 TargetLowering::ArgListEntry Entry;
20886 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
20887 EVT ArgVT = Op->getOperand(i).getValueType();
20888 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
20889 "Unexpected argument type for lowering");
20890 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
20891 Entry.Node = StackPtr;
20892 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
20893 MachinePointerInfo(), /* Alignment = */ 16);
20894 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20895 Entry.Ty = PointerType::get(ArgTy,0);
20896 Entry.isSExt = false;
20897 Entry.isZExt = false;
20898 Args.push_back(Entry);
20901 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20902 getPointerTy(DAG.getDataLayout()));
20904 TargetLowering::CallLoweringInfo CLI(DAG);
20905 CLI.setDebugLoc(dl).setChain(InChain)
20906 .setCallee(getLibcallCallingConv(LC),
20907 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
20908 Callee, std::move(Args))
20909 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
20911 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20912 return DAG.getBitcast(VT, CallInfo.first);
20915 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
20916 SelectionDAG &DAG) {
20917 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
20918 MVT VT = Op0.getSimpleValueType();
20921 // Decompose 256-bit ops into smaller 128-bit ops.
20922 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20923 unsigned Opcode = Op.getOpcode();
20924 unsigned NumElems = VT.getVectorNumElements();
20925 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
20926 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
20927 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
20928 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
20929 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
20930 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
20931 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
20932 SDValue Ops[] = {
20933 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
20934 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
20935 };
20936 return DAG.getMergeValues(Ops, dl);
20939 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
20940 (VT == MVT::v8i32 && Subtarget.hasInt256()));
20942 // PMULxD operations multiply each even value (starting at 0) of LHS with
20943 // the related value of RHS and produce a widened result.
20944 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20945 // => <2 x i64> <ae|cg>
20947 // In other words, to have all the results, we need to perform two PMULxD:
20948 // 1. one with the even values.
20949 // 2. one with the odd values.
20950 // To achieve #2, we need to place the odd values at an even position.
20952 // Place the odd value at an even position (basically, shift all values 1
20953 // step to the left):
20954 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
20955 // <a|b|c|d> => <b|undef|d|undef>
20956 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
20957 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20958 // <e|f|g|h> => <f|undef|h|undef>
20959 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
20960 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20962 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
20964 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
20965 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
20966 unsigned Opcode =
20967 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
20968 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20969 // => <2 x i64> <ae|cg>
20970 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
20971 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
20972 // => <2 x i64> <bf|dh>
20973 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
20975 // Shuffle it back into the right order.
20976 SDValue Highs, Lows;
20977 if (VT == MVT::v8i32) {
20978 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
20979 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20980 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
20981 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20983 const int HighMask[] = {1, 5, 3, 7};
20984 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20985 const int LowMask[] = {0, 4, 2, 6};
20986 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20989 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
20990 // unsigned multiply.
20991 if (IsSigned && !Subtarget.hasSSE41()) {
20992 SDValue ShAmt = DAG.getConstant(
20993 31, dl,
20994 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
20995 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
20996 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
20997 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
20998 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21000 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21001 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21004 // The first result of MUL_LOHI is actually the low value, followed by the
21005 // high one.
21006 SDValue Ops[] = {Lows, Highs};
21007 return DAG.getMergeValues(Ops, dl);
21010 // Return true if the required (according to Opcode) shift-imm form is natively
21011 // supported by the Subtarget
21012 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21013 unsigned Opcode) {
21014 if (VT.getScalarSizeInBits() < 16)
21015 return false;
21017 if (VT.is512BitVector() &&
21018 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21019 return true;
21021 bool LShift = VT.is128BitVector() ||
21022 (VT.is256BitVector() && Subtarget.hasInt256());
21024 bool AShift = LShift && (Subtarget.hasVLX() ||
21025 (VT != MVT::v2i64 && VT != MVT::v4i64));
21026 return (Opcode == ISD::SRA) ? AShift : LShift;
21029 // The shift amount is a variable, but it is the same for all vector lanes.
21030 // These instructions are defined together with shift-immediate.
21031 static
21032 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21033 unsigned Opcode) {
21034 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21037 // Return true if the required (according to Opcode) variable-shift form is
21038 // natively supported by the Subtarget
21039 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21040 unsigned Opcode) {
21042 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21043 return false;
21045 // vXi16 supported only on AVX-512, BWI
21046 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21047 return false;
21049 if (VT.is512BitVector() || Subtarget.hasVLX())
21050 return true;
21052 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21053 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21054 return (Opcode == ISD::SRA) ? AShift : LShift;
21057 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21058 const X86Subtarget &Subtarget) {
21059 MVT VT = Op.getSimpleValueType();
21061 SDValue R = Op.getOperand(0);
21062 SDValue Amt = Op.getOperand(1);
21064 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21065 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21067 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21068 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21069 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21070 SDValue Ex = DAG.getBitcast(ExVT, R);
21072 if (ShiftAmt >= 32) {
21073 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21074 SDValue Upper =
21075 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21076 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21077 ShiftAmt - 32, DAG);
21078 if (VT == MVT::v2i64)
21079 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21080 if (VT == MVT::v4i64)
21081 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21082 {9, 1, 11, 3, 13, 5, 15, 7});
21084 // SRA upper i32, SHL whole i64 and select lower i32.
21085 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21086 ShiftAmt, DAG);
21087 SDValue Lower =
21088 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21089 Lower = DAG.getBitcast(ExVT, Lower);
21090 if (VT == MVT::v2i64)
21091 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21092 if (VT == MVT::v4i64)
21093 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21094 {8, 1, 10, 3, 12, 5, 14, 7});
21096 return DAG.getBitcast(VT, Ex);
21099 // Optimize shl/srl/sra with constant shift amount.
21100 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21101 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21102 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21104 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21105 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21107 // i64 SRA needs to be performed as partial shifts.
21108 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21109 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21110 return ArithmeticShiftRight64(ShiftAmt);
21112 if (VT == MVT::v16i8 ||
21113 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21114 VT == MVT::v64i8) {
21115 unsigned NumElts = VT.getVectorNumElements();
21116 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21118 // Simple i8 add case
21119 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21120 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21122 // ashr(R, 7) === cmp_slt(R, 0)
21123 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21124 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21125 if (VT.is512BitVector()) {
21126 assert(VT == MVT::v64i8 && "Unexpected element type!");
21127 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21128 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21130 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21133 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21134 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21135 return SDValue();
21137 if (Op.getOpcode() == ISD::SHL) {
21138 // Make a large shift.
21139 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21140 R, ShiftAmt, DAG);
21141 SHL = DAG.getBitcast(VT, SHL);
21142 // Zero out the rightmost bits.
21143 return DAG.getNode(ISD::AND, dl, VT, SHL,
21144 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21146 if (Op.getOpcode() == ISD::SRL) {
21147 // Make a large shift.
21148 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21149 R, ShiftAmt, DAG);
21150 SRL = DAG.getBitcast(VT, SRL);
21151 // Zero out the leftmost bits.
21152 return DAG.getNode(ISD::AND, dl, VT, SRL,
21153 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21155 if (Op.getOpcode() == ISD::SRA) {
21156 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21157 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21159 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21160 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21161 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21162 return Res;
21164 llvm_unreachable("Unknown shift opcode.");
21169 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21170 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21171 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21172 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21174 // Peek through any splat that was introduced for i64 shift vectorization.
21175 int SplatIndex = -1;
21176 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21177 if (SVN->isSplat()) {
21178 SplatIndex = SVN->getSplatIndex();
21179 Amt = Amt.getOperand(0);
21180 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21181 "Splat shuffle referencing second operand");
21184 if (Amt.getOpcode() != ISD::BITCAST ||
21185 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21186 return SDValue();
21188 Amt = Amt.getOperand(0);
21189 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21190 VT.getVectorNumElements();
21191 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21192 uint64_t ShiftAmt = 0;
21193 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21194 for (unsigned i = 0; i != Ratio; ++i) {
21195 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21196 if (!C)
21197 return SDValue();
21199 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21202 // Check remaining shift amounts (if not a splat).
21203 if (SplatIndex < 0) {
21204 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21205 uint64_t ShAmt = 0;
21206 for (unsigned j = 0; j != Ratio; ++j) {
21207 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21208 if (!C)
21209 return SDValue();
21211 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21213 if (ShAmt != ShiftAmt)
21214 return SDValue();
21218 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21219 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21221 if (Op.getOpcode() == ISD::SRA)
21222 return ArithmeticShiftRight64(ShiftAmt);
21228 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21229 const X86Subtarget &Subtarget) {
21230 MVT VT = Op.getSimpleValueType();
21232 SDValue R = Op.getOperand(0);
21233 SDValue Amt = Op.getOperand(1);
21235 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21236 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21238 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21239 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21241 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21242 SDValue BaseShAmt;
21243 MVT EltVT = VT.getVectorElementType();
21245 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21246 // Check if this build_vector node is doing a splat.
21247 // If so, then set BaseShAmt equal to the splat value.
21248 BaseShAmt = BV->getSplatValue();
21249 if (BaseShAmt && BaseShAmt.isUndef())
21250 BaseShAmt = SDValue();
21252 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21253 Amt = Amt.getOperand(0);
21255 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21256 if (SVN && SVN->isSplat()) {
21257 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21258 SDValue InVec = Amt.getOperand(0);
21259 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21260 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21261 "Unexpected shuffle index found!");
21262 BaseShAmt = InVec.getOperand(SplatIdx);
21263 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21264 if (ConstantSDNode *C =
21265 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21266 if (C->getZExtValue() == SplatIdx)
21267 BaseShAmt = InVec.getOperand(1);
21271 if (!BaseShAmt)
21272 // Avoid introducing an extract element from a shuffle.
21273 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21274 DAG.getIntPtrConstant(SplatIdx, dl));
21278 if (BaseShAmt.getNode()) {
21279 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
21280 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21281 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21282 else if (EltVT.bitsLT(MVT::i32))
21283 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21285 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
21289 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21290 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21291 Amt.getOpcode() == ISD::BITCAST &&
21292 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21293 Amt = Amt.getOperand(0);
21294 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21295 VT.getVectorNumElements();
21296 std::vector<SDValue> Vals(Ratio);
21297 for (unsigned i = 0; i != Ratio; ++i)
21298 Vals[i] = Amt.getOperand(i);
21299 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21300 for (unsigned j = 0; j != Ratio; ++j)
21301 if (Vals[j] != Amt.getOperand(i + j))
21302 return SDValue();
21305 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21306 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21311 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21312 SelectionDAG &DAG) {
21313 MVT VT = Op.getSimpleValueType();
21315 SDValue R = Op.getOperand(0);
21316 SDValue Amt = Op.getOperand(1);
21317 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21319 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21320 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21322 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21323 return V;
21325 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
21326 return V;
21328 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
21329 return Op;
21331 // XOP has 128-bit variable logical/arithmetic shifts.
21332 // +ve/-ve Amt = shift left/right.
21333 if (Subtarget.hasXOP() &&
21334 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
21335 VT == MVT::v8i16 || VT == MVT::v16i8)) {
21336 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
21337 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21338 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
21340 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
21341 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
21342 if (Op.getOpcode() == ISD::SRA)
21343 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
21346 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
21347 // shifts per-lane and then shuffle the partial results back together.
21348 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
21349 // Splat the shift amounts so the scalar shifts above will catch it.
21350 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
21351 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
21352 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
21353 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
21354 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
21357 // i64 vector arithmetic shift can be emulated with the transform:
21358 // M = lshr(SIGN_BIT, Amt)
21359 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
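// This works because the logical shift leaves the original sign bit at
// position 63 - Amt, exactly where M has its single set bit: for a
// non-negative input the XOR and SUB cancel out, and for a negative input
// they amount to subtracting 2*M, which sets all bits above 63 - Amt,
// i.e. the missing sign extension.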
21360 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
21361 Op.getOpcode() == ISD::SRA) {
21362 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
21363 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
21364 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21365 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
21366 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
21367 return R;
21370 // If possible, lower this packed shift into a vector multiply instead of
21371 // expanding it into a sequence of scalar shifts.
21372 // Do this only if the vector shift count is a constant build_vector.
21373 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
21374 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
21375 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
21376 SmallVector<SDValue, 8> Elts;
21377 MVT SVT = VT.getVectorElementType();
21378 unsigned SVTBits = SVT.getSizeInBits();
21379 APInt One(SVTBits, 1);
21380 unsigned NumElems = VT.getVectorNumElements();
21382 for (unsigned i = 0; i != NumElems; ++i) {
21383 SDValue Op = Amt->getOperand(i);
21384 if (Op->isUndef()) {
21385 Elts.push_back(Op);
21389 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
21390 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
21391 uint64_t ShAmt = C.getZExtValue();
21392 if (ShAmt >= SVTBits) {
21393 Elts.push_back(DAG.getUNDEF(SVT));
21396 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
21398 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
21399 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
21402 // Lower SHL with variable shift amount.
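// The sequence below builds 2^Amt per lane with float math: shifting Amt into the
// exponent field (bit 23) and adding 0x3f800000 (the bit pattern of 1.0f) yields
// the single-precision value 2^Amt for in-range shift amounts; FP_TO_SINT turns it
// back into the integer 2^Amt, and the final MUL performs the variable shift.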
21403 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
21404 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
21406 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
21407 DAG.getConstant(0x3f800000U, dl, VT));
21408 Op = DAG.getBitcast(MVT::v4f32, Op);
21409 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
21410 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
21413 // If possible, lower this shift as a sequence of two shifts by
21414 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
21416 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
21418 // Could be rewritten as:
21419 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
21421 // The advantage is that the two shifts from the example would be
21422 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
21423 // the vector shift into four scalar shifts plus four pairs of vector
21424 // insert/extract.
21425 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
21426 unsigned TargetOpcode = X86ISD::MOVSS;
21427 bool CanBeSimplified;
21428 // The splat value for the first packed shift (the 'X' from the example).
21429 SDValue Amt1 = Amt->getOperand(0);
21430 // The splat value for the second packed shift (the 'Y' from the example).
21431 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
21433 // See if it is possible to replace this node with a sequence of
21434 // two shifts followed by a MOVSS/MOVSD/PBLEND.
21435 if (VT == MVT::v4i32) {
21436 // Check if it is legal to use a MOVSS.
21437 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
21438 Amt2 == Amt->getOperand(3);
21439 if (!CanBeSimplified) {
21440 // Otherwise, check if we can still simplify this node using a MOVSD.
21441 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
21442 Amt->getOperand(2) == Amt->getOperand(3);
21443 TargetOpcode = X86ISD::MOVSD;
21444 Amt2 = Amt->getOperand(2);
21447 // Do similar checks for the case where the machine value type
21448 // is MVT::v8i16.
21449 CanBeSimplified = Amt1 == Amt->getOperand(1);
21450 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
21451 CanBeSimplified = Amt2 == Amt->getOperand(i);
21453 if (!CanBeSimplified) {
21454 TargetOpcode = X86ISD::MOVSD;
21455 CanBeSimplified = true;
21456 Amt2 = Amt->getOperand(4);
21457 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
21458 CanBeSimplified = Amt1 == Amt->getOperand(i);
21459 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
21460 CanBeSimplified = Amt2 == Amt->getOperand(j);
21464 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
21465 isa<ConstantSDNode>(Amt2)) {
21466 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
21467 MVT CastVT = MVT::v4i32;
21468 SDValue Splat1 =
21469 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
21470 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
21471 SDValue Splat2 =
21472 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
21473 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
21474 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
21475 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
21476 if (TargetOpcode == X86ISD::MOVSD)
21477 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21478 BitCast2, {0, 1, 6, 7}));
21479 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21480 BitCast2, {0, 5, 6, 7}));
21484 // v4i32 Non Uniform Shifts.
21485 // If the shift amount is constant we can shift each lane using the SSE2
21486 // immediate shifts, else we need to zero-extend each lane to the lower i64
21487 // and shift using the SSE2 variable shifts.
21488 // The separate results can then be blended together.
21489 if (VT == MVT::v4i32) {
21490 unsigned Opc = Op.getOpcode();
21491 SDValue Amt0, Amt1, Amt2, Amt3;
21492 if (ConstantAmt) {
21493 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
21494 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
21495 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
21496 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
21497 } else {
21498 // ISD::SHL is handled above but we include it here for completeness.
21499 switch (Opc) {
21500 default:
21501 llvm_unreachable("Unknown target vector shift node");
21502 case ISD::SHL:
21503 Opc = X86ISD::VSHL;
21504 break;
21505 case ISD::SRL:
21506 Opc = X86ISD::VSRL;
21507 break;
21508 case ISD::SRA:
21509 Opc = X86ISD::VSRA;
21510 break;
21511 }
21512 // The SSE2 shifts use the lower i64 as the same shift amount for
21513 // all lanes and the upper i64 is ignored. These shuffle masks
21514 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
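// For example, mask {0, 4, -1, -1} builds <Amt[0], 0, undef, undef>, so the low
// i64 of Amt0 is the zero-extended 32-bit shift amount for lane 0.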
21515 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21516 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
21517 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
21518 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
21519 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
21520 }
21522 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
21523 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
21524 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
21525 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
21526 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
21527 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
21528 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
21531 if (VT == MVT::v16i8 ||
21532 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
21533 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
21534 unsigned ShiftOpcode = Op->getOpcode();
21536 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
21537 // On SSE41 targets we make use of the fact that VSELECT lowers
21538 // to PBLENDVB which selects bytes based just on the sign bit.
21539 if (Subtarget.hasSSE41()) {
21540 V0 = DAG.getBitcast(VT, V0);
21541 V1 = DAG.getBitcast(VT, V1);
21542 Sel = DAG.getBitcast(VT, Sel);
21543 return DAG.getBitcast(SelVT,
21544 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
21546 // On pre-SSE41 targets we test for the sign bit by comparing to
21547 // zero - a negative value will set all bits of the lanes to true
21548 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
21549 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
21550 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
21551 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
21554 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
21555 // We can safely do this using i16 shifts as we're only interested in
21556 // the 3 lower bits of each byte.
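// Only the low 3 bits of each byte amount matter for an i8 shift; shifting them
// up by 5 places the current amount bit in the byte's sign bit where SignBitSelect
// can test it, and the Amt = Amt + Amt steps below move the next lower bit into
// position for the following round (shift by 4, then 2, then 1).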
21557 Amt = DAG.getBitcast(ExtVT, Amt);
21558 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
21559 Amt = DAG.getBitcast(VT, Amt);
21561 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
21562 // r = VSELECT(r, shift(r, 4), a);
21563 SDValue M =
21564 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21565 R = SignBitSelect(VT, Amt, M, R);
21568 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21570 // r = VSELECT(r, shift(r, 2), a);
21571 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21572 R = SignBitSelect(VT, Amt, M, R);
21575 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21577 // return VSELECT(r, shift(r, 1), a);
21578 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21579 R = SignBitSelect(VT, Amt, M, R);
21583 if (Op->getOpcode() == ISD::SRA) {
21584 // For SRA we need to unpack each byte to the higher byte of a i16 vector
21585 // so we can correctly sign extend. We don't care what happens to the
21586 // lower byte.
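// Placing each input byte in the high byte of an i16 lane means the i16 arithmetic
// shift pulls in copies of that byte's sign bit; whatever ends up in the low byte
// is discarded by the SRL + PACKUS at the end.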
21587 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
21588 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
21589 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
21590 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
21591 ALo = DAG.getBitcast(ExtVT, ALo);
21592 AHi = DAG.getBitcast(ExtVT, AHi);
21593 RLo = DAG.getBitcast(ExtVT, RLo);
21594 RHi = DAG.getBitcast(ExtVT, RHi);
21596 // r = VSELECT(r, shift(r, 4), a);
21597 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21598 DAG.getConstant(4, dl, ExtVT));
21599 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21600 DAG.getConstant(4, dl, ExtVT));
21601 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21602 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21605 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21606 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21608 // r = VSELECT(r, shift(r, 2), a);
21609 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21610 DAG.getConstant(2, dl, ExtVT));
21611 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21612 DAG.getConstant(2, dl, ExtVT));
21613 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21614 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21617 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21618 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21620 // r = VSELECT(r, shift(r, 1), a);
21621 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21622 DAG.getConstant(1, dl, ExtVT));
21623 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21624 DAG.getConstant(1, dl, ExtVT));
21625 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21626 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21628 // Logical shift the result back to the lower byte, leaving a zero upper
21630 // byte, meaning that we can safely pack with PACKUSWB.
21631 RLo =
21632 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
21633 RHi =
21634 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
21635 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21639 // For 16-bit types it's worth extending once to v8i32 and using the v8i32
21640 // variable shifts, but for v16i8 the extra overhead of getting up to 32-bit
21641 // elements makes the existing SSE solution better.
21642 if (Subtarget.hasInt256() && VT == MVT::v8i16) {
21643 MVT ExtVT = MVT::v8i32;
21644 unsigned ExtOpc =
21645 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21646 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
21647 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
21648 return DAG.getNode(ISD::TRUNCATE, dl, VT,
21649 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
21652 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
21653 MVT ExtVT = MVT::v8i32;
21654 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21655 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
21656 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
21657 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
21658 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
21659 ALo = DAG.getBitcast(ExtVT, ALo);
21660 AHi = DAG.getBitcast(ExtVT, AHi);
21661 RLo = DAG.getBitcast(ExtVT, RLo);
21662 RHi = DAG.getBitcast(ExtVT, RHi);
21663 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
21664 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
21665 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
21666 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
21667 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21670 if (VT == MVT::v8i16) {
21671 unsigned ShiftOpcode = Op->getOpcode();
21673 // If we have a constant shift amount, the non-SSE41 path is best as
21674 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
21675 bool UseSSE41 = Subtarget.hasSSE41() &&
21676 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21678 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
21679 // On SSE41 targets we make use of the fact that VSELECT lowers
21680 // to PBLENDVB which selects bytes based just on the sign bit.
21682 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
21683 V0 = DAG.getBitcast(ExtVT, V0);
21684 V1 = DAG.getBitcast(ExtVT, V1);
21685 Sel = DAG.getBitcast(ExtVT, Sel);
21686 return DAG.getBitcast(
21687 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
21689 // On pre-SSE41 targets we splat the sign bit - a negative value will
21690 // set all bits of the lanes to true and VSELECT uses that in
21691 // its OR(AND(V0,C),AND(V1,~C)) lowering.
21692 SDValue C =
21693 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
21694 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
21697 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
21698 if (UseSSE41) {
21699 // On SSE41 targets we need to replicate the shift mask in both
21700 // bytes for PBLENDVB.
21701 Amt = DAG.getNode(
21702 ISD::OR, dl, VT,
21703 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
21704 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
21705 } else {
21706 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
21707 }
21709 // r = VSELECT(r, shift(r, 8), a);
21710 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
21711 R = SignBitSelect(Amt, M, R);
21714 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21716 // r = VSELECT(r, shift(r, 4), a);
21717 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21718 R = SignBitSelect(Amt, M, R);
21721 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21723 // r = VSELECT(r, shift(r, 2), a);
21724 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21725 R = SignBitSelect(Amt, M, R);
21728 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21730 // return VSELECT(r, shift(r, 1), a);
21731 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21732 R = SignBitSelect(Amt, M, R);
21736 // Decompose 256-bit shifts into smaller 128-bit shifts.
21737 if (VT.is256BitVector())
21738 return Lower256IntArith(Op, DAG);
21743 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
21744 SelectionDAG &DAG) {
21745 MVT VT = Op.getSimpleValueType();
21747 SDValue R = Op.getOperand(0);
21748 SDValue Amt = Op.getOperand(1);
21750 assert(VT.isVector() && "Custom lowering only for vector rotates!");
21751 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
21752 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
21754 // XOP has 128-bit vector variable + immediate rotates.
21755 // +ve/-ve Amt = rotate left/right.
21757 // Split 256-bit integers.
21758 if (VT.is256BitVector())
21759 return Lower256IntArith(Op, DAG);
21761 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
21763 // Attempt to rotate by immediate.
21764 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21765 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
21766 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
21767 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
21768 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
21769 DAG.getConstant(RotateAmt, DL, MVT::i8));
21773 // Use general rotate by variable (per-element).
21774 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
21777 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
21778 // Lower the "add/sub/mul with overflow" instruction into a regular instruction plus
21779 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
21780 // looks for this combo and may remove the "setcc" instruction if the "setcc"
21781 // has only one use.
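// For example, (i32 res, i1 ovf) = saddo a, b becomes an X86ISD::ADD that also
// produces EFLAGS, followed by a SETCC of X86::COND_O; if the only user of the
// SETCC is a branch, the brcond lowering can test EFLAGS directly and drop it.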
21782 SDNode *N = Op.getNode();
21783 SDValue LHS = N->getOperand(0);
21784 SDValue RHS = N->getOperand(1);
21785 unsigned BaseOp = 0;
21786 X86::CondCode Cond;
21788 switch (Op.getOpcode()) {
21789 default: llvm_unreachable("Unknown ovf instruction!");
21790 case ISD::SADDO:
21791 // An add of one will be selected as an INC. Note that INC doesn't
21792 // set CF, so we can't do this for UADDO.
21793 if (isOneConstant(RHS)) {
21794 BaseOp = X86ISD::INC;
21795 Cond = X86::COND_O;
21796 break;
21797 }
21798 BaseOp = X86ISD::ADD;
21799 Cond = X86::COND_O;
21800 break;
21801 case ISD::UADDO:
21802 BaseOp = X86ISD::ADD;
21803 Cond = X86::COND_B;
21804 break;
21805 case ISD::SSUBO:
21806 // A subtract of one will be selected as a DEC. Note that DEC doesn't
21807 // set CF, so we can't do this for USUBO.
21808 if (isOneConstant(RHS)) {
21809 BaseOp = X86ISD::DEC;
21810 Cond = X86::COND_O;
21811 break;
21812 }
21813 BaseOp = X86ISD::SUB;
21814 Cond = X86::COND_O;
21815 break;
21816 case ISD::USUBO:
21817 BaseOp = X86ISD::SUB;
21818 Cond = X86::COND_B;
21819 break;
21820 case ISD::SMULO:
21821 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
21822 Cond = X86::COND_O;
21823 break;
21824 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
21825 if (N->getValueType(0) == MVT::i8) {
21826 BaseOp = X86ISD::UMUL8;
21827 Cond = X86::COND_O;
21830 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
21831 MVT::i32);
21832 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
21834 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
21836 if (N->getValueType(1) == MVT::i1)
21837 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21839 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
21843 // Also sets EFLAGS.
21844 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
21845 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
21847 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
21849 if (N->getValueType(1) == MVT::i1)
21850 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21852 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
21855 /// Returns true if the operand type is exactly twice the native width, and
21856 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
21857 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
21858 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
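/// For example, an i64 atomic on a 32-bit target needs cmpxchg8b, and an i128
/// atomic on a 64-bit target needs cmpxchg16b (when available).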
21859 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
21860 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
21862 if (OpWidth == 64)
21863 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
21864 else if (OpWidth == 128)
21865 return Subtarget.hasCmpxchg16b();
21870 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21871 return needsCmpXchgNb(SI->getValueOperand()->getType());
21874 // Note: this turns large loads into lock cmpxchg8b/16b.
21875 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
21876 TargetLowering::AtomicExpansionKind
21877 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
21878 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
21879 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
21880 : AtomicExpansionKind::None;
21883 TargetLowering::AtomicExpansionKind
21884 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21885 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
21886 Type *MemType = AI->getType();
21888 // If the operand is too big, we must see if cmpxchg8/16b is available
21889 // and default to library calls otherwise.
21890 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
21891 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
21892 : AtomicExpansionKind::None;
21895 AtomicRMWInst::BinOp Op = AI->getOperation();
21898 llvm_unreachable("Unknown atomic operation");
21899 case AtomicRMWInst::Xchg:
21900 case AtomicRMWInst::Add:
21901 case AtomicRMWInst::Sub:
21902 // It's better to use xadd, xsub or xchg for these in all cases.
21903 return AtomicExpansionKind::None;
21904 case AtomicRMWInst::Or:
21905 case AtomicRMWInst::And:
21906 case AtomicRMWInst::Xor:
21907 // If the atomicrmw's result isn't actually used, we can just add a "lock"
21908 // prefix to a normal instruction for these operations.
21909 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
21910 : AtomicExpansionKind::None;
21911 case AtomicRMWInst::Nand:
21912 case AtomicRMWInst::Max:
21913 case AtomicRMWInst::Min:
21914 case AtomicRMWInst::UMax:
21915 case AtomicRMWInst::UMin:
21916 // These always require a non-trivial set of data operations on x86. We must
21917 // use a cmpxchg loop.
21918 return AtomicExpansionKind::CmpXChg;
21923 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
21924 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
21925 Type *MemType = AI->getType();
21926 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
21927 // there is no benefit in turning such RMWs into loads, and it is actually
21928 // harmful as it introduces a mfence.
21929 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
21932 auto Builder = IRBuilder<>(AI);
21933 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21934 auto SynchScope = AI->getSynchScope();
21935 // We must restrict the ordering to avoid generating loads with Release or
21936 // ReleaseAcquire orderings.
21937 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
21938 auto Ptr = AI->getPointerOperand();
21940 // Before the load we need a fence. Here is an example lifted from
21941 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
21944 // x.store(1, relaxed);
21945 // r1 = y.fetch_add(0, release);
21947 // y.fetch_add(42, acquire);
21948 // r2 = x.load(relaxed);
21949 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
21950 // lowered to just a load without a fence. A mfence flushes the store buffer,
21951 // making the optimization clearly correct.
21952 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
21953 // otherwise, we might be able to be more aggressive on relaxed idempotent
21954 // rmw. In practice, they do not look useful, so we don't try to be
21955 // especially clever.
21956 if (SynchScope == SingleThread)
21957 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
21958 // the IR level, so we must wrap it in an intrinsic.
21959 return nullptr;
21961 if (!Subtarget.hasMFence())
21962 // FIXME: it might make sense to use a locked operation here but on a
21963 // different cache-line to prevent cache-line bouncing. In practice it
21964 // is probably a small win, and x86 processors without mfence are rare
21965 // enough that we do not bother.
21966 return nullptr;
21968 Function *MFence =
21969 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
21970 Builder.CreateCall(MFence, {});
21972 // Finally we can emit the atomic load.
21973 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
21974 AI->getType()->getPrimitiveSizeInBits());
21975 Loaded->setAtomic(Order, SynchScope);
21976 AI->replaceAllUsesWith(Loaded);
21977 AI->eraseFromParent();
21981 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
21982 SelectionDAG &DAG) {
21984 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
21985 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
21986 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
21987 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
21989 // The only fence that needs an instruction is a sequentially-consistent
21990 // cross-thread fence.
21991 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
21992 FenceScope == CrossThread) {
21993 if (Subtarget.hasMFence())
21994 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
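// When MFENCE is unavailable, the fallback below emits a locked OR of zero into
// the top of the stack (lock or dword ptr [esp], 0), which also acts as a full
// memory barrier on any x86 processor.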
21996 SDValue Chain = Op.getOperand(0);
21997 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
21998 SDValue Ops[] = {
21999 DAG.getRegister(X86::ESP, MVT::i32), // Base
22000 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22001 DAG.getRegister(0, MVT::i32), // Index
22002 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22003 DAG.getRegister(0, MVT::i32), // Segment.
22004 Zero,
22005 Chain
22006 };
22007 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22008 return SDValue(Res, 0);
22011 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22012 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22015 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22016 SelectionDAG &DAG) {
22017 MVT T = Op.getSimpleValueType();
22021 switch(T.SimpleTy) {
22022 default: llvm_unreachable("Invalid value type!");
22023 case MVT::i8: Reg = X86::AL; size = 1; break;
22024 case MVT::i16: Reg = X86::AX; size = 2; break;
22025 case MVT::i32: Reg = X86::EAX; size = 4; break;
22027 assert(Subtarget.is64Bit() && "Node not type legal!");
22028 Reg = X86::RAX; size = 8;
22031 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22032 Op.getOperand(2), SDValue());
22033 SDValue Ops[] = { cpIn.getValue(0),
22036 DAG.getTargetConstant(size, DL, MVT::i8),
22037 cpIn.getValue(1) };
22038 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22039 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22040 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22041 Ops, T, MMO);
22043 SDValue cpOut =
22044 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22045 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22046 MVT::i32, cpOut.getValue(2));
22047 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22049 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22050 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22051 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22055 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22056 SelectionDAG &DAG) {
22057 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22058 MVT DstVT = Op.getSimpleValueType();
22060 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22061 SrcVT == MVT::i64) {
22062 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22063 if (DstVT != MVT::f64)
22064 // This conversion needs to be expanded.
22067 SDValue Op0 = Op->getOperand(0);
22068 SmallVector<SDValue, 16> Elts;
22072 if (SrcVT.isVector()) {
22073 NumElts = SrcVT.getVectorNumElements();
22074 SVT = SrcVT.getVectorElementType();
22076 // Widen the input vector in the case of MVT::v2i32.
22077 // Example: from MVT::v2i32 to MVT::v4i32.
22078 for (unsigned i = 0, e = NumElts; i != e; ++i)
22079 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22080 DAG.getIntPtrConstant(i, dl)));
22082 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22083 "Unexpected source type in LowerBITCAST");
22084 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22085 DAG.getIntPtrConstant(0, dl)));
22086 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22087 DAG.getIntPtrConstant(1, dl)));
22091 // Explicitly mark the extra elements as Undef.
22092 Elts.append(NumElts, DAG.getUNDEF(SVT));
22094 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22095 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22096 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22097 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22098 DAG.getIntPtrConstant(0, dl));
22101 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22102 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22103 assert((DstVT == MVT::i64 ||
22104 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22105 "Unexpected custom BITCAST");
22106 // i64 <=> MMX conversions are Legal.
22107 if (SrcVT == MVT::i64 && DstVT.isVector())
22108 return Op;
22109 if (DstVT == MVT::i64 && SrcVT.isVector())
22110 return Op;
22111 // MMX <=> MMX conversions are Legal.
22112 if (SrcVT.isVector() && DstVT.isVector())
22113 return Op;
22114 // All other conversions need to be expanded.
22115 return SDValue();
22118 /// Compute the horizontal sum of bytes in V for the elements of VT.
22120 /// Requires V to be a byte vector and VT to be an integer vector type with
22121 /// wider elements than V's type. The width of the elements of VT determines
22122 /// how many bytes of V are summed horizontally to produce each element of the
22123 /// result.
22124 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22125 const X86Subtarget &Subtarget,
22126 SelectionDAG &DAG) {
22128 MVT ByteVecVT = V.getSimpleValueType();
22129 MVT EltVT = VT.getVectorElementType();
22130 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22131 "Expected value to have byte element type.");
22132 assert(EltVT != MVT::i8 &&
22133 "Horizontal byte sum only makes sense for wider elements!");
22134 unsigned VecSize = VT.getSizeInBits();
22135 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22137 // The PSADBW instruction horizontally adds all bytes and leaves the result in
22138 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
22139 if (EltVT == MVT::i64) {
22140 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22141 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22142 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22143 return DAG.getBitcast(VT, V);
22146 if (EltVT == MVT::i32) {
22147 // We unpack the low half and high half into i32s interleaved with zeros so
22148 // that we can use PSADBW to horizontally sum them. The most useful part of
22149 // this is that it lines up the results of two PSADBW instructions to be
22150 // two v2i64 vectors which concatenated are the 4 population counts. We can
22151 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22152 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22153 SDValue V32 = DAG.getBitcast(VT, V);
22154 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22155 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22157 // Do the horizontal sums into two v2i64s.
22158 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22159 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22160 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22161 DAG.getBitcast(ByteVecVT, Low), Zeros);
22162 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22163 DAG.getBitcast(ByteVecVT, High), Zeros);
22165 // Merge them together.
22166 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22167 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22168 DAG.getBitcast(ShortVecVT, Low),
22169 DAG.getBitcast(ShortVecVT, High));
22171 return DAG.getBitcast(VT, V);
22174 // The only element type left is i16.
22175 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22177 // To obtain pop count for each i16 element starting from the pop count for
22178 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22179 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22180 // directly supported.
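// Concretely: each i16 lane holds the two per-byte counts as [hi_cnt:lo_cnt].
// Shifting left by 8 gives [lo_cnt:0], adding as i8 lanes puts hi_cnt + lo_cnt
// into the high byte without any cross-lane carries, and the final i16 SRL by 8
// moves that sum down into the low byte.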
22181 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22182 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22183 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22184 DAG.getBitcast(ByteVecVT, V));
22185 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22188 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22189 const X86Subtarget &Subtarget,
22190 SelectionDAG &DAG) {
22191 MVT VT = Op.getSimpleValueType();
22192 MVT EltVT = VT.getVectorElementType();
22193 unsigned VecSize = VT.getSizeInBits();
22195 // Implement a lookup table in register by using an algorithm based on:
22196 // http://wm.ite.pl/articles/sse-popcount.html
22198 // The general idea is that every lower byte nibble in the input vector is an
22199 // index into an in-register pre-computed pop count table. We then split up the
22200 // input vector into two new ones: (1) a vector with only the shifted-right
22201 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22202 // masked-out higher ones) for each byte. PSHUFB is used separately with both
22203 // to index the in-register table. Next, both are added and the result is an
22204 // i8 vector where each element contains the pop count for its input byte.
22206 // To obtain the pop count for elements != i8, we follow up with the same
22207 // approach and use additional tricks as described below.
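// For example, the byte 0xE3 is split into nibbles 0xE and 0x3, so its pop count
// is LUT[0xE] + LUT[0x3] = 3 + 2 = 5.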
22209 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22210 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22211 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22212 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22214 int NumByteElts = VecSize / 8;
22215 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22216 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22217 SmallVector<SDValue, 64> LUTVec;
22218 for (int i = 0; i < NumByteElts; ++i)
22219 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22220 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22221 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22224 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22225 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22228 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22230 // The input vector is used as the shuffle mask that index elements into the
22231 // LUT. After counting low and high nibbles, add the vector to obtain the
22232 // final pop count per i8 element.
22233 SDValue HighPopCnt =
22234 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22235 SDValue LowPopCnt =
22236 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22237 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22239 if (EltVT == MVT::i8)
22242 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22245 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22246 const X86Subtarget &Subtarget,
22247 SelectionDAG &DAG) {
22248 MVT VT = Op.getSimpleValueType();
22249 assert(VT.is128BitVector() &&
22250 "Only 128-bit vector bitmath lowering supported.");
22252 int VecSize = VT.getSizeInBits();
22253 MVT EltVT = VT.getVectorElementType();
22254 int Len = EltVT.getSizeInBits();
22256 // This is the vectorized version of the "best" algorithm from
22257 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22258 // with a minor tweak to use a series of adds + shifts instead of vector
22259 // multiplications. Implemented for all integer vector types. We only use
22260 // this when we don't have SSSE3 which allows a LUT-based lowering that is
22261 // much faster, even faster than using native popcnt instructions.
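// As a sanity check, for the byte v = 0xD6 (0b11010110, 5 bits set):
//   v - ((v >> 1) & 0x55)          = 0x95  (2-bit field sums 2,1,1,1)
//   (v & 0x33) + ((v >> 2) & 0x33) = 0x32  (4-bit field sums 3,2)
//   ((v + (v >> 4)) & 0x0F)        = 0x05  (the final count)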
22263 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
22264 MVT VT = V.getSimpleValueType();
22265 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
22266 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
22268 auto GetMask = [&](SDValue V, APInt Mask) {
22269 MVT VT = V.getSimpleValueType();
22270 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
22271 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
22274 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
22275 // x86, so set the SRL type to have elements at least i16 wide. This is
22276 // correct because all of our SRLs are followed immediately by a mask anyways
22277 // that handles any bits that sneak into the high bits of the byte elements.
22278 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
22282 // v = v - ((v >> 1) & 0x55555555...)
22284 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
22285 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
22286 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
22288 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
22289 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
22290 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
22291 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
22292 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
22294 // v = (v + (v >> 4)) & 0x0F0F0F0F...
22295 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
22296 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
22297 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
22299 // At this point, V contains the byte-wise population count, and we are
22300 // merely doing a horizontal sum if necessary to get the wider element
22302 if (EltVT == MVT::i8)
22305 return LowerHorizontalByteSum(
22306 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
22310 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
22311 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
22312 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22313 SelectionDAG &DAG) {
22314 MVT VT = Op.getSimpleValueType();
22315 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
22316 "Unknown CTPOP type to handle");
22317 SDLoc DL(Op.getNode());
22318 SDValue Op0 = Op.getOperand(0);
22320 if (!Subtarget.hasSSSE3()) {
22321 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
22322 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
22323 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
22326 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22327 unsigned NumElems = VT.getVectorNumElements();
22329 // Extract each 128-bit vector, compute pop count and concat the result.
22330 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
22331 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
22333 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22334 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22335 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22338 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
22339 unsigned NumElems = VT.getVectorNumElements();
22341 // Extract each 256-bit vector, compute pop count and concat the result.
22342 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
22343 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
22345 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22346 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22347 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22350 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
22353 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22354 SelectionDAG &DAG) {
22355 assert(Op.getSimpleValueType().isVector() &&
22356 "We only do custom lowering for vector population count.");
22357 return LowerVectorCTPOP(Op, Subtarget, DAG);
22360 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
22361 MVT VT = Op.getSimpleValueType();
22362 SDValue In = Op.getOperand(0);
22365 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
22366 // perform the BITREVERSE.
22367 if (!VT.isVector()) {
22368 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
22369 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
22370 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
22371 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
22372 DAG.getIntPtrConstant(0, DL));
22375 MVT SVT = VT.getVectorElementType();
22376 int NumElts = VT.getVectorNumElements();
22377 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
22379 // Decompose 256-bit ops into smaller 128-bit ops.
22380 if (VT.is256BitVector()) {
22381 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22382 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22384 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
22385 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22386 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
22387 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
22390 assert(VT.is128BitVector() &&
22391 "Only 128-bit vector bitreverse lowering supported.");
22393 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
22394 // perform the BSWAP in the shuffle.
22395 // It's best to shuffle using the second operand as this will implicitly allow
22396 // memory folding for multiple vectors.
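// Roughly: each VPPERM selector byte picks a source byte in its low bits (values
// of 16 and above select from the second source) and an operation in bits 7:5;
// op 2 reverses the bits of the selected byte, so a single shuffle performs both
// the byte reordering and the per-byte bit reversal.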
22397 SmallVector<SDValue, 16> MaskElts;
22398 for (int i = 0; i != NumElts; ++i) {
22399 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
22400 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
22401 int PermuteByte = SourceByte | (2 << 5);
22402 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
22406 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
22407 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
22408 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
22410 return DAG.getBitcast(VT, Res);
22413 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
22414 SelectionDAG &DAG) {
22415 if (Subtarget.hasXOP())
22416 return LowerBITREVERSE_XOP(Op, DAG);
22418 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
22420 MVT VT = Op.getSimpleValueType();
22421 SDValue In = Op.getOperand(0);
22424 unsigned NumElts = VT.getVectorNumElements();
22425 assert(VT.getScalarType() == MVT::i8 &&
22426 "Only byte vector BITREVERSE supported");
22428 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
22429 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22430 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
22431 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22432 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22433 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
22434 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
22435 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22438 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
22439 // two nibbles and a PSHUFB lookup to find the bitreverse of each
22440 // 0-15 value (moved to the other nibble).
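// For example, the byte 0x1F: Lo = 0xF, Hi = 0x1, LoLUT[0xF] = 0xF0,
// HiLUT[0x1] = 0x08, and 0xF0 | 0x08 = 0xF8, the bit reversal of 0x1F.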
22441 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
22442 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
22443 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
22445 const int LoLUT[16] = {
22446 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
22447 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
22448 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
22449 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
22450 const int HiLUT[16] = {
22451 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
22452 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
22453 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
22454 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
22456 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
22457 for (unsigned i = 0; i < NumElts; ++i) {
22458 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
22459 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
22462 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
22463 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
22464 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
22465 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
22466 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
22469 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
22470 unsigned NewOpc = 0;
22471 switch (N->getOpcode()) {
22472 case ISD::ATOMIC_LOAD_ADD:
22473 NewOpc = X86ISD::LADD;
22475 case ISD::ATOMIC_LOAD_SUB:
22476 NewOpc = X86ISD::LSUB;
22478 case ISD::ATOMIC_LOAD_OR:
22479 NewOpc = X86ISD::LOR;
22481 case ISD::ATOMIC_LOAD_XOR:
22482 NewOpc = X86ISD::LXOR;
22484 case ISD::ATOMIC_LOAD_AND:
22485 NewOpc = X86ISD::LAND;
22488 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
22491 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
22492 return DAG.getMemIntrinsicNode(
22493 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
22494 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
22495 /*MemVT=*/N->getSimpleValueType(0), MMO);
22498 /// Lower atomic_load_ops into LOCK-prefixed operations.
22499 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
22500 const X86Subtarget &Subtarget) {
22501 SDValue Chain = N->getOperand(0);
22502 SDValue LHS = N->getOperand(1);
22503 SDValue RHS = N->getOperand(2);
22504 unsigned Opc = N->getOpcode();
22505 MVT VT = N->getSimpleValueType(0);
22508 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
22509 // can only be lowered when the result is unused. They should have already
22510 // been transformed into a cmpxchg loop in AtomicExpand.
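// For example, "lock xadd" (LXADD) is used when the old value is needed, while an
// atomicrmw add whose result is unused is emitted below as a plain LOCK-prefixed
// ADD to memory.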
22511 if (N->hasAnyUseOfValue(0)) {
22512 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
22513 // select LXADD if LOCK_SUB can't be selected.
22514 if (Opc == ISD::ATOMIC_LOAD_SUB) {
22515 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
22516 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
22517 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
22518 RHS, AN->getMemOperand());
22520 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
22521 "Used AtomicRMW ops other than Add should have been expanded!");
22525 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
22526 // RAUW the chain, but don't worry about the result, as it's unused.
22527 assert(!N->hasAnyUseOfValue(0));
22528 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
22532 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
22533 SDNode *Node = Op.getNode();
22535 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
22537 // Convert seq_cst store -> xchg
22538 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
22539 // FIXME: On 32-bit, store -> fist or movq would be more efficient
22540 // (The only way to get a 16-byte store is cmpxchg16b)
22541 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
22542 if (cast<AtomicSDNode>(Node)->getOrdering() ==
22543 AtomicOrdering::SequentiallyConsistent ||
22544 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
22545 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
22546 cast<AtomicSDNode>(Node)->getMemoryVT(),
22547 Node->getOperand(0),
22548 Node->getOperand(1), Node->getOperand(2),
22549 cast<AtomicSDNode>(Node)->getMemOperand());
22550 return Swap.getValue(1);
22552 // Other atomic stores have a simple pattern.
22553 return Op;
22556 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
22557 MVT VT = Op.getNode()->getSimpleValueType(0);
22559 // Let legalize expand this if it isn't a legal type yet.
22560 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
22563 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22566 bool ExtraOp = false;
22567 switch (Op.getOpcode()) {
22568 default: llvm_unreachable("Invalid code");
22569 case ISD::ADDC: Opc = X86ISD::ADD; break;
22570 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
22571 case ISD::SUBC: Opc = X86ISD::SUB; break;
22572 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
22575 if (!ExtraOp)
22576 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22577 Op.getOperand(1));
22578 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22579 Op.getOperand(1), Op.getOperand(2));
22582 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
22583 SelectionDAG &DAG) {
22584 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
22586 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
22587 // which returns the values as { float, float } (in XMM0) or
22588 // { double, double } (which is returned in XMM0, XMM1).
22590 SDValue Arg = Op.getOperand(0);
22591 EVT ArgVT = Arg.getValueType();
22592 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22594 TargetLowering::ArgListTy Args;
22595 TargetLowering::ArgListEntry Entry;
22599 Entry.isSExt = false;
22600 Entry.isZExt = false;
22601 Args.push_back(Entry);
22603 bool isF64 = ArgVT == MVT::f64;
22604 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
22605 // the small struct {f32, f32} is returned in (eax, edx). For f64,
22606 // the results are returned via SRet in memory.
22607 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
22608 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22610 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
22612 Type *RetTy = isF64
22613 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
22614 : (Type*)VectorType::get(ArgTy, 4);
22616 TargetLowering::CallLoweringInfo CLI(DAG);
22617 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
22618 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
22620 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
22622 if (isF64)
22623 // Returned in xmm0 and xmm1.
22624 return CallResult.first;
22626 // Returned in bits 0:31 and 32:63 of xmm0.
22627 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22628 CallResult.first, DAG.getIntPtrConstant(0, dl));
22629 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22630 CallResult.first, DAG.getIntPtrConstant(1, dl));
22631 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
22632 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
22635 /// Widen a vector input to a vector of NVT. The
22636 /// input vector must have the same element type as NVT.
22637 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
22638 bool FillWithZeroes = false) {
22639 // Check if InOp already has the right width.
22640 MVT InVT = InOp.getSimpleValueType();
22641 if (InVT == NVT)
22642 return InOp;
22644 if (InOp.isUndef())
22645 return DAG.getUNDEF(NVT);
22647 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
22648 "input and widen element type must match");
22650 unsigned InNumElts = InVT.getVectorNumElements();
22651 unsigned WidenNumElts = NVT.getVectorNumElements();
22652 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
22653 "Unexpected request for vector widening");
22655 EVT EltVT = NVT.getVectorElementType();
22658 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
22659 InOp.getNumOperands() == 2) {
22660 SDValue N1 = InOp.getOperand(1);
22661 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
22662 N1.isUndef()) {
22663 InOp = InOp.getOperand(0);
22664 InVT = InOp.getSimpleValueType();
22665 InNumElts = InVT.getVectorNumElements();
22668 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
22669 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
22670 SmallVector<SDValue, 16> Ops;
22671 for (unsigned i = 0; i < InNumElts; ++i)
22672 Ops.push_back(InOp.getOperand(i));
22674 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
22675 DAG.getUNDEF(EltVT);
22676 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
22677 Ops.push_back(FillVal);
22678 return DAG.getBuildVector(NVT, dl, Ops);
22680 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
22682 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
22683 InOp, DAG.getIntPtrConstant(0, dl));
22686 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
22687 SelectionDAG &DAG) {
22688 assert(Subtarget.hasAVX512() &&
22689 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22691 // X86 scatter kills mask register, so its type should be added to
22692 // the list of return values.
22693 // If the "scatter" has 2 return values, it is already handled.
22694 if (Op.getNode()->getNumValues() == 2)
22697 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
22698 SDValue Src = N->getValue();
22699 MVT VT = Src.getSimpleValueType();
22700 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
22703 SDValue NewScatter;
22704 SDValue Index = N->getIndex();
22705 SDValue Mask = N->getMask();
22706 SDValue Chain = N->getChain();
22707 SDValue BasePtr = N->getBasePtr();
22708 MVT MemVT = N->getMemoryVT().getSimpleVT();
22709 MVT IndexVT = Index.getSimpleValueType();
22710 MVT MaskVT = Mask.getSimpleValueType();
22712 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
22713 // The v2i32 value was promoted to v2i64.
22714 // Now we "redo" the type legalizer's work and widen the original
22715 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
22717 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
22718 "Unexpected memory type");
22719 int ShuffleMask[] = {0, 2, -1, -1};
22720 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
22721 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
22722 // Now we have 4 elements instead of 2.
22723 // Expand the index.
22724 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
22725 Index = ExtendToType(Index, NewIndexVT, DAG);
22727 // Expand the mask with zeroes
22728 // Mask may be <2 x i64> or <2 x i1> at this moment
22729 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
22730 "Unexpected mask type");
22731 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
22732 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22736 unsigned NumElts = VT.getVectorNumElements();
22737 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22738 !Index.getSimpleValueType().is512BitVector()) {
22739 // AVX512F supports only 512-bit vectors. Either the data or the index
22740 // should be 512 bits wide. If both the index and the data are 256-bit
22741 // but the vector contains 8 elements, we just sign-extend the index.
22742 if (IndexVT == MVT::v8i32)
22743 // Just extend index
22744 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22746 // The minimal number of elts in scatter is 8
22749 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22750 // Use original index here, do not modify the index twice
22751 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
22752 if (IndexVT.getScalarType() == MVT::i32)
22753 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22756 // At this point we have promoted mask operand
22757 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22758 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
22759 // Use the original mask here, do not modify the mask twice
22760 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
22762 // The value that should be stored
22763 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22764 Src = ExtendToType(Src, NewVT, DAG);
22767 // If the mask is "wide" at this point - truncate it to i1 vector
22768 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
22769 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
22771 // The mask is killed by scatter, add it to the values
22772 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
22773 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
22774 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
22775 N->getMemOperand());
22776 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
22777 return SDValue(NewScatter.getNode(), 1);
22780 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
22781 SelectionDAG &DAG) {
22783 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
22784 MVT VT = Op.getSimpleValueType();
22785 MVT ScalarVT = VT.getScalarType();
22786 SDValue Mask = N->getMask();
22789 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
22790 "Expanding masked load is supported on AVX-512 target only!");
22792 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
22793 "Expanding masked load is supported for 32 and 64-bit types only!");
22795 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
22796 // VLX. Expanding loads of these types are handled further below.
22797 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
22800 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22801 "Cannot lower masked load op.");
22803 assert((ScalarVT.getSizeInBits() >= 32 ||
22804 (Subtarget.hasBWI() &&
22805 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22806 "Unsupported masked load op.");
22808 // This operation is legal for targets with VLX, but without
22809 // VLX the vector should be widened to 512 bit
22810 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
22811 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
22812 SDValue Src0 = N->getSrc0();
22813 Src0 = ExtendToType(Src0, WideDataVT, DAG);
22815 // Mask element has to be i1.
22816 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22817 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22818 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22820 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22822 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22823 if (MaskEltTy != MVT::i1)
22824 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22825 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22826 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
22827 N->getBasePtr(), Mask, Src0,
22828 N->getMemoryVT(), N->getMemOperand(),
22829 N->getExtensionType(),
22830 N->isExpandingLoad());
22832 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22833 NewLoad.getValue(0),
22834 DAG.getIntPtrConstant(0, dl));
22835 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
22836 return DAG.getMergeValues(RetOps, dl);
22839 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
22840 SelectionDAG &DAG) {
22841 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
22842 SDValue DataToStore = N->getValue();
22843 MVT VT = DataToStore.getSimpleValueType();
22844 MVT ScalarVT = VT.getScalarType();
22845 SDValue Mask = N->getMask();
22848 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
22849 "Compressing store is supported on AVX-512 target only!");
22851 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
22852 "Compressing store is supported for 32 and 64-bit types only!");
22854 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
22855 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
22858 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22859 "Cannot lower masked store op.");
22861 assert((ScalarVT.getSizeInBits() >= 32 ||
22862 (Subtarget.hasBWI() &&
22863 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22864 "Unsupported masked store op.");
22866 // This operation is legal for targets with VLX, but without
22867 // VLX the vector should be widened to 512 bits.
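// As an illustration (assuming AVX512F without VLX): a v8f32 masked store is
// widened to a v16f32 masked store whose extra mask lanes are zero, so only
// the original eight elements are actually written.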
22868 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
22869 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
22871 // Mask element has to be i1.
22872 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22873 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22874 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22876 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22878 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
22879 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22880 if (MaskEltTy != MVT::i1)
22881 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22882 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22883 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
22884 Mask, N->getMemoryVT(), N->getMemOperand(),
22885 N->isTruncatingStore(), N->isCompressingStore());
22888 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
22889 SelectionDAG &DAG) {
22890 assert(Subtarget.hasAVX512() &&
22891 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22893 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
22895 MVT VT = Op.getSimpleValueType();
22896 SDValue Index = N->getIndex();
22897 SDValue Mask = N->getMask();
22898 SDValue Src0 = N->getValue();
22899 MVT IndexVT = Index.getSimpleValueType();
22900 MVT MaskVT = Mask.getSimpleValueType();
22902 unsigned NumElts = VT.getVectorNumElements();
22903 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
22905 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22906 !Index.getSimpleValueType().is512BitVector()) {
22907 // AVX512F supports only 512-bit vectors; either the data or the index
22908 // should be 512 bits wide. If both the index and the data are 256-bit
22909 // here but the vector has 8 elements, just sign-extend the index.
22910 if (NumElts == 8) {
22911 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22912 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
22913 N->getOperand(3), Index };
22914 DAG.UpdateNodeOperands(N, Ops);
22918 // The minimum number of elements in a gather is 8.
22921 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22922 Index = ExtendToType(Index, NewIndexVT, DAG);
22923 if (IndexVT.getScalarType() == MVT::i32)
22924 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22927 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
22928 // At this point the mask operand has already been promoted.
22929 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22930 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
22931 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22932 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
22934 // The pass-thru value
22935 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22936 Src0 = ExtendToType(Src0, NewVT, DAG);
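// The lanes added by the widening carry zero mask bits, so they simply keep
// the (undef-extended) pass-thru value; the original VT is extracted from the
// low lanes of the wide gather result below.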
22938 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
22939 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
22940 N->getMemoryVT(), dl, Ops,
22941 N->getMemOperand());
22942 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22943 NewGather.getValue(0),
22944 DAG.getIntPtrConstant(0, dl));
22945 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
22946 return DAG.getMergeValues(RetOps, dl);
22951 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
22952 SelectionDAG &DAG) const {
22953 // TODO: Eventually, the lowering of these nodes should be informed by or
22954 // deferred to the GC strategy for the function in which they appear. For
22955 // now, however, they must be lowered to something. Since they are logically
22956 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22957 // require special handling for these nodes), lower them as literal NOOPs for the time being.
22959 SmallVector<SDValue, 2> Ops;
22961 Ops.push_back(Op.getOperand(0));
22962 if (Op->getGluedNode())
22963 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22966 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22967 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22972 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
22973 SelectionDAG &DAG) const {
22974 // TODO: Eventually, the lowering of these nodes should be informed by or
22975 // deferred to the GC strategy for the function in which they appear. For
22976 // now, however, they must be lowered to something. Since they are logically
22977 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22978 // require special handling for these nodes), lower them as literal NOOPs for the time being.
22980 SmallVector<SDValue, 2> Ops;
22982 Ops.push_back(Op.getOperand(0));
22983 if (Op->getGluedNode())
22984 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22987 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22988 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22993 /// Provide custom lowering hooks for some operations.
22994 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
22995 switch (Op.getOpcode()) {
22996 default: llvm_unreachable("Should not custom lower this!");
22997 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
22998 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
22999 return LowerCMP_SWAP(Op, Subtarget, DAG);
23000 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23001 case ISD::ATOMIC_LOAD_ADD:
23002 case ISD::ATOMIC_LOAD_SUB:
23003 case ISD::ATOMIC_LOAD_OR:
23004 case ISD::ATOMIC_LOAD_XOR:
23005 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23006 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23007 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23008 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23009 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23010 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23011 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23012 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23013 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23014 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23015 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23016 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
23017 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23018 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23019 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23020 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23021 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23022 case ISD::SHL_PARTS:
23023 case ISD::SRA_PARTS:
23024 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23025 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23026 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23027 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23028 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23029 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23030 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23031 case ISD::ZERO_EXTEND_VECTOR_INREG:
23032 case ISD::SIGN_EXTEND_VECTOR_INREG:
23033 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23034 case ISD::FP_TO_SINT:
23035 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
23036 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23037 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23039 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23040 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23041 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23042 case ISD::SETCC: return LowerSETCC(Op, DAG);
23043 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23044 case ISD::SELECT: return LowerSELECT(Op, DAG);
23045 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23046 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23047 case ISD::VASTART: return LowerVASTART(Op, DAG);
23048 case ISD::VAARG: return LowerVAARG(Op, DAG);
23049 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23050 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23051 case ISD::INTRINSIC_VOID:
23052 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23053 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23054 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23055 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23056 case ISD::FRAME_TO_ARGS_OFFSET:
23057 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23058 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23059 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23060 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23061 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23062 case ISD::EH_SJLJ_SETUP_DISPATCH:
23063 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23064 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23065 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23066 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23068 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23070 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23071 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23073 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23074 case ISD::UMUL_LOHI:
23075 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23076 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23079 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23085 case ISD::UMULO: return LowerXALUO(Op, DAG);
23086 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23087 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23091 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23092 case ISD::ADD: return LowerADD(Op, DAG);
23093 case ISD::SUB: return LowerSUB(Op, DAG);
23097 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23098 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23099 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23100 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23101 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23102 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23103 case ISD::GC_TRANSITION_START:
23104 return LowerGC_TRANSITION_START(Op, DAG);
23105 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23106 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23110 /// Places new result values for the node in Results (their number
23111 /// and types must exactly match those of the original return values of
23112 /// the node), or leaves Results empty, which indicates that the node is not
23113 /// to be custom lowered after all.
23114 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23115 SmallVectorImpl<SDValue> &Results,
23116 SelectionDAG &DAG) const {
23117 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23119 if (!Res.getNode())
23122 assert((N->getNumValues() <= Res->getNumValues()) &&
23123 "Lowering returned the wrong number of results!");
23125 // Place new result values based on the result number of N. In some cases
23126 // (LowerSINT_TO_FP, for example) Res has more result values than the
23127 // original node; the chain (the last value) should be dropped.
23128 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23129 Results.push_back(Res.getValue(I));
23132 /// Replace a node with an illegal result type with a new node built out of custom code.
23134 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23135 SmallVectorImpl<SDValue>&Results,
23136 SelectionDAG &DAG) const {
23138 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23139 switch (N->getOpcode()) {
23141 llvm_unreachable("Do not know how to custom type legalize this operation!");
23142 case X86ISD::AVG: {
23143 // Legalize types for X86ISD::AVG by expanding vectors.
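// Sketch of the approach: concatenate each narrow operand with undef up to
// the smallest legal register size, perform the AVG on the wide type, and
// extract the original-width subvector as the result. For example, a v8i8
// AVG is performed as a v16i8 AVG whose low half is then extracted.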
23144 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23146 auto InVT = N->getValueType(0);
23147 auto InVTSize = InVT.getSizeInBits();
23148 const unsigned RegSize =
23149 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23150 assert((Subtarget.hasBWI() || RegSize < 512) &&
23151 "512-bit vector requires AVX512BW");
23152 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23153 "256-bit vector requires AVX2");
23155 auto ElemVT = InVT.getVectorElementType();
23156 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23157 RegSize / ElemVT.getSizeInBits());
23158 assert(RegSize % InVT.getSizeInBits() == 0);
23159 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23161 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23162 Ops[0] = N->getOperand(0);
23163 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23164 Ops[0] = N->getOperand(1);
23165 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23167 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23168 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23169 DAG.getIntPtrConstant(0, dl)));
23172 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
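// The extra (high) lanes of the widened operands are undef, so only the low
// two lanes of the v4f32 result correspond to the original v2f32 value.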
23173 case X86ISD::FMINC:
23175 case X86ISD::FMAXC:
23176 case X86ISD::FMAX: {
23177 EVT VT = N->getValueType(0);
23178 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23179 SDValue UNDEF = DAG.getUNDEF(VT);
23180 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23181 N->getOperand(0), UNDEF);
23182 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23183 N->getOperand(1), UNDEF);
23184 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23192 case ISD::UDIVREM: {
23193 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23194 Results.push_back(V);
23197 case ISD::FP_TO_SINT:
23198 case ISD::FP_TO_UINT: {
23199 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23201 if (N->getValueType(0) == MVT::v2i32) {
23202 assert((IsSigned || Subtarget.hasAVX512()) &&
23203 "Can only handle signed conversion without AVX512");
23204 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23205 SDValue Src = N->getOperand(0);
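// For a v2f64 source we use the target-specific CVTTP2SI/CVTTP2UI nodes,
// which produce a full v4i32; only the low two lanes are meaningful and are
// extracted as the v2i32 result below.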
23206 if (Src.getValueType() == MVT::v2f64) {
23207 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23208 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23209 : X86ISD::CVTTP2UI,
23210 dl, MVT::v4i32, Src);
23211 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23212 Results.push_back(Res);
23215 if (Src.getValueType() == MVT::v2f32) {
23216 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23217 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23218 DAG.getUNDEF(MVT::v2f32));
23219 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23220 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23221 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23222 Results.push_back(Res);
23226 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23227 // so early out here.
23231 std::pair<SDValue,SDValue> Vals =
23232 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23233 SDValue FIST = Vals.first, StackSlot = Vals.second;
23234 if (FIST.getNode()) {
23235 EVT VT = N->getValueType(0);
23236 // Return a load from the stack slot.
23237 if (StackSlot.getNode())
23239 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23241 Results.push_back(FIST);
23245 case ISD::SINT_TO_FP: {
23246 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23247 SDValue Src = N->getOperand(0);
23248 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23250 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23253 case ISD::UINT_TO_FP: {
23254 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23255 EVT VT = N->getValueType(0);
23256 if (VT != MVT::v2f32)
23258 SDValue Src = N->getOperand(0);
23259 EVT SrcVT = Src.getValueType();
23260 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23261 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23264 if (SrcVT != MVT::v2i32)
23266 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
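// This is the usual unsigned->double bias trick: for a 32-bit value x,
// OR-ing x into the low mantissa bits of the double 2^52
// (bit pattern 0x4330000000000000) yields exactly the double 2^52 + x;
// subtracting 2^52 then recovers x as an exact double, which is finally
// narrowed to f32 by VFPROUND.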
23268 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
23269 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23270 DAG.getBitcast(MVT::v2i64, VBias));
23271 Or = DAG.getBitcast(MVT::v2f64, Or);
23272 // TODO: Are there any fast-math-flags to propagate here?
23273 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23274 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23277 case ISD::FP_ROUND: {
23278 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23280 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23281 Results.push_back(V);
23284 case ISD::FP_EXTEND: {
23285 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23286 // No other ValueType for FP_EXTEND should reach this point.
23287 assert(N->getValueType(0) == MVT::v2f32 &&
23288 "Do not know how to legalize this Node");
23291 case ISD::INTRINSIC_W_CHAIN: {
23292 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23294 default : llvm_unreachable("Do not know how to custom type "
23295 "legalize this intrinsic operation!");
23296 case Intrinsic::x86_rdtsc:
23297 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23299 case Intrinsic::x86_rdtscp:
23300 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23302 case Intrinsic::x86_rdpmc:
23303 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23305 case Intrinsic::x86_xgetbv:
23306 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
23309 case ISD::INTRINSIC_WO_CHAIN: {
23310 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
23311 Results.push_back(V);
23314 case ISD::READCYCLECOUNTER: {
23315 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23318 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
23319 EVT T = N->getValueType(0);
23320 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
23321 bool Regs64bit = T == MVT::i128;
23322 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
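// CMPXCHG8B/CMPXCHG16B expect the compare value in EDX:EAX (RDX:RAX) and the
// replacement value in ECX:EBX (RCX:RBX), and return the loaded value in
// EDX:EAX (RDX:RAX), so the wide operands are split into halves and copied
// into those fixed register pairs below.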
23323 SDValue cpInL, cpInH;
23324 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23325 DAG.getConstant(0, dl, HalfT));
23326 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23327 DAG.getConstant(1, dl, HalfT));
23328 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
23329 Regs64bit ? X86::RAX : X86::EAX,
23331 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
23332 Regs64bit ? X86::RDX : X86::EDX,
23333 cpInH, cpInL.getValue(1));
23334 SDValue swapInL, swapInH;
23335 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23336 DAG.getConstant(0, dl, HalfT));
23337 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23338 DAG.getConstant(1, dl, HalfT));
23340 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
23341 swapInH, cpInH.getValue(1));
23342 // If the current function needs the base pointer, RBX,
23343 // we shouldn't use cmpxchg directly. The lowering of that
23344 // instruction will clobber that register, and since RBX is
23345 // a reserved register in that case, the register allocator
23346 // will not make sure its value is properly saved and
23347 // restored around this live-range.
23348 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
23350 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23351 unsigned BasePtr = TRI->getBaseRegister();
23352 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
23353 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
23354 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
23355 // ISel prefers the LCMPXCHG64 variant.
23356 // If that assert breaks, it means that is no longer the case, and we need
23357 // to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX, not just EBX. This is a
23358 // matter of accepting an i64 input for that pseudo and restoring into a
23359 // register of the right width in the expand pseudo. Everything else should
23360 // just work.
23361 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
23362 "Saving only half of the RBX");
23363 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
23364 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
23365 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
23366 Regs64bit ? X86::RBX : X86::EBX,
23367 HalfT, swapInH.getValue(1));
23368 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
23370 /*Glue*/ RBXSave.getValue(2)};
23371 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23374 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
23375 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
23376 Regs64bit ? X86::RBX : X86::EBX, swapInL,
23377 swapInH.getValue(1));
23378 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
23379 swapInL.getValue(1)};
23380 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23382 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
23383 Regs64bit ? X86::RAX : X86::EAX,
23384 HalfT, Result.getValue(1));
23385 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
23386 Regs64bit ? X86::RDX : X86::EDX,
23387 HalfT, cpOutL.getValue(2));
23388 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
23390 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
23391 MVT::i32, cpOutH.getValue(2));
23392 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
23393 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
23395 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
23396 Results.push_back(Success);
23397 Results.push_back(EFLAGS.getValue(1));
23400 case ISD::ATOMIC_SWAP:
23401 case ISD::ATOMIC_LOAD_ADD:
23402 case ISD::ATOMIC_LOAD_SUB:
23403 case ISD::ATOMIC_LOAD_AND:
23404 case ISD::ATOMIC_LOAD_OR:
23405 case ISD::ATOMIC_LOAD_XOR:
23406 case ISD::ATOMIC_LOAD_NAND:
23407 case ISD::ATOMIC_LOAD_MIN:
23408 case ISD::ATOMIC_LOAD_MAX:
23409 case ISD::ATOMIC_LOAD_UMIN:
23410 case ISD::ATOMIC_LOAD_UMAX:
23411 case ISD::ATOMIC_LOAD: {
23412 // Delegate to generic TypeLegalization. Situations we can really handle
23413 // should have already been dealt with by AtomicExpandPass.cpp.
23416 case ISD::BITCAST: {
23417 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23418 EVT DstVT = N->getValueType(0);
23419 EVT SrcVT = N->getOperand(0)->getValueType(0);
23421 if (SrcVT != MVT::f64 ||
23422 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
23425 unsigned NumElts = DstVT.getVectorNumElements();
23426 EVT SVT = DstVT.getVectorElementType();
23427 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
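// The f64 is placed into the low lane of a v2f64 and that vector is bitcast
// to a vector with twice as many elements of the destination element type.
// With widening legalization the wide vector is returned directly; otherwise
// the low NumElts elements are extracted and rebuilt as a BUILD_VECTOR below.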
23428 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
23429 MVT::v2f64, N->getOperand(0));
23430 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
23432 if (ExperimentalVectorWideningLegalization) {
23433 // If we are legalizing vectors by widening, we already have the desired
23434 // legal vector type, just return it.
23435 Results.push_back(ToVecInt);
23439 SmallVector<SDValue, 8> Elts;
23440 for (unsigned i = 0, e = NumElts; i != e; ++i)
23441 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
23442 ToVecInt, DAG.getIntPtrConstant(i, dl)));
23444 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
23449 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
23450 switch ((X86ISD::NodeType)Opcode) {
23451 case X86ISD::FIRST_NUMBER: break;
23452 case X86ISD::BSF: return "X86ISD::BSF";
23453 case X86ISD::BSR: return "X86ISD::BSR";
23454 case X86ISD::SHLD: return "X86ISD::SHLD";
23455 case X86ISD::SHRD: return "X86ISD::SHRD";
23456 case X86ISD::FAND: return "X86ISD::FAND";
23457 case X86ISD::FANDN: return "X86ISD::FANDN";
23458 case X86ISD::FOR: return "X86ISD::FOR";
23459 case X86ISD::FXOR: return "X86ISD::FXOR";
23460 case X86ISD::FILD: return "X86ISD::FILD";
23461 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
23462 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
23463 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
23464 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
23465 case X86ISD::FLD: return "X86ISD::FLD";
23466 case X86ISD::FST: return "X86ISD::FST";
23467 case X86ISD::CALL: return "X86ISD::CALL";
23468 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
23469 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
23470 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
23471 case X86ISD::BT: return "X86ISD::BT";
23472 case X86ISD::CMP: return "X86ISD::CMP";
23473 case X86ISD::COMI: return "X86ISD::COMI";
23474 case X86ISD::UCOMI: return "X86ISD::UCOMI";
23475 case X86ISD::CMPM: return "X86ISD::CMPM";
23476 case X86ISD::CMPMU: return "X86ISD::CMPMU";
23477 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
23478 case X86ISD::SETCC: return "X86ISD::SETCC";
23479 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
23480 case X86ISD::FSETCC: return "X86ISD::FSETCC";
23481 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
23482 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
23483 case X86ISD::CMOV: return "X86ISD::CMOV";
23484 case X86ISD::BRCOND: return "X86ISD::BRCOND";
23485 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
23486 case X86ISD::IRET: return "X86ISD::IRET";
23487 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
23488 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
23489 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
23490 case X86ISD::Wrapper: return "X86ISD::Wrapper";
23491 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
23492 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
23493 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
23494 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
23495 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
23496 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
23497 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
23498 case X86ISD::PINSRB: return "X86ISD::PINSRB";
23499 case X86ISD::PINSRW: return "X86ISD::PINSRW";
23500 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
23501 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
23502 case X86ISD::ANDNP: return "X86ISD::ANDNP";
23503 case X86ISD::BLENDI: return "X86ISD::BLENDI";
23504 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
23505 case X86ISD::ADDUS: return "X86ISD::ADDUS";
23506 case X86ISD::SUBUS: return "X86ISD::SUBUS";
23507 case X86ISD::HADD: return "X86ISD::HADD";
23508 case X86ISD::HSUB: return "X86ISD::HSUB";
23509 case X86ISD::FHADD: return "X86ISD::FHADD";
23510 case X86ISD::FHSUB: return "X86ISD::FHSUB";
23511 case X86ISD::ABS: return "X86ISD::ABS";
23512 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
23513 case X86ISD::FMAX: return "X86ISD::FMAX";
23514 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
23515 case X86ISD::FMIN: return "X86ISD::FMIN";
23516 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
23517 case X86ISD::FMAXC: return "X86ISD::FMAXC";
23518 case X86ISD::FMINC: return "X86ISD::FMINC";
23519 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
23520 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
23521 case X86ISD::FRCP: return "X86ISD::FRCP";
23522 case X86ISD::FRCPS: return "X86ISD::FRCPS";
23523 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
23524 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
23525 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
23526 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
23527 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
23528 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
23529 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
23530 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
23531 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
23532 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
23533 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
23534 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
23535 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
23536 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
23537 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
23538 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
23539 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
23540 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
23541 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
23542 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
23543 case X86ISD::LADD: return "X86ISD::LADD";
23544 case X86ISD::LSUB: return "X86ISD::LSUB";
23545 case X86ISD::LOR: return "X86ISD::LOR";
23546 case X86ISD::LXOR: return "X86ISD::LXOR";
23547 case X86ISD::LAND: return "X86ISD::LAND";
23548 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
23549 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
23550 case X86ISD::VZEXT: return "X86ISD::VZEXT";
23551 case X86ISD::VSEXT: return "X86ISD::VSEXT";
23552 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
23553 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
23554 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
23555 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
23556 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
23557 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
23558 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
23559 case X86ISD::VINSERT: return "X86ISD::VINSERT";
23560 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
23561 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
23562 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
23563 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
23564 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
23565 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
23566 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
23567 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
23568 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
23569 case X86ISD::VSHL: return "X86ISD::VSHL";
23570 case X86ISD::VSRL: return "X86ISD::VSRL";
23571 case X86ISD::VSRA: return "X86ISD::VSRA";
23572 case X86ISD::VSHLI: return "X86ISD::VSHLI";
23573 case X86ISD::VSRLI: return "X86ISD::VSRLI";
23574 case X86ISD::VSRAI: return "X86ISD::VSRAI";
23575 case X86ISD::VSRAV: return "X86ISD::VSRAV";
23576 case X86ISD::VROTLI: return "X86ISD::VROTLI";
23577 case X86ISD::VROTRI: return "X86ISD::VROTRI";
23578 case X86ISD::VPPERM: return "X86ISD::VPPERM";
23579 case X86ISD::CMPP: return "X86ISD::CMPP";
23580 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
23581 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
23582 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
23583 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
23584 case X86ISD::ADD: return "X86ISD::ADD";
23585 case X86ISD::SUB: return "X86ISD::SUB";
23586 case X86ISD::ADC: return "X86ISD::ADC";
23587 case X86ISD::SBB: return "X86ISD::SBB";
23588 case X86ISD::SMUL: return "X86ISD::SMUL";
23589 case X86ISD::UMUL: return "X86ISD::UMUL";
23590 case X86ISD::SMUL8: return "X86ISD::SMUL8";
23591 case X86ISD::UMUL8: return "X86ISD::UMUL8";
23592 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
23593 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
23594 case X86ISD::INC: return "X86ISD::INC";
23595 case X86ISD::DEC: return "X86ISD::DEC";
23596 case X86ISD::OR: return "X86ISD::OR";
23597 case X86ISD::XOR: return "X86ISD::XOR";
23598 case X86ISD::AND: return "X86ISD::AND";
23599 case X86ISD::BEXTR: return "X86ISD::BEXTR";
23600 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
23601 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
23602 case X86ISD::PTEST: return "X86ISD::PTEST";
23603 case X86ISD::TESTP: return "X86ISD::TESTP";
23604 case X86ISD::TESTM: return "X86ISD::TESTM";
23605 case X86ISD::TESTNM: return "X86ISD::TESTNM";
23606 case X86ISD::KORTEST: return "X86ISD::KORTEST";
23607 case X86ISD::KTEST: return "X86ISD::KTEST";
23608 case X86ISD::PACKSS: return "X86ISD::PACKSS";
23609 case X86ISD::PACKUS: return "X86ISD::PACKUS";
23610 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
23611 case X86ISD::VALIGN: return "X86ISD::VALIGN";
23612 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
23613 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
23614 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
23615 case X86ISD::SHUFP: return "X86ISD::SHUFP";
23616 case X86ISD::SHUF128: return "X86ISD::SHUF128";
23617 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
23618 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
23619 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
23620 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
23621 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
23622 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
23623 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
23624 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
23625 case X86ISD::MOVSD: return "X86ISD::MOVSD";
23626 case X86ISD::MOVSS: return "X86ISD::MOVSS";
23627 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
23628 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
23629 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
23630 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
23631 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
23632 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
23633 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
23634 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
23635 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
23636 case X86ISD::VPERMV: return "X86ISD::VPERMV";
23637 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
23638 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
23639 case X86ISD::VPERMI: return "X86ISD::VPERMI";
23640 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
23641 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
23642 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
23643 case X86ISD::VRANGE: return "X86ISD::VRANGE";
23644 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
23645 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
23646 case X86ISD::PSADBW: return "X86ISD::PSADBW";
23647 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
23648 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
23649 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
23650 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
23651 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
23652 case X86ISD::MFENCE: return "X86ISD::MFENCE";
23653 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
23654 case X86ISD::SAHF: return "X86ISD::SAHF";
23655 case X86ISD::RDRAND: return "X86ISD::RDRAND";
23656 case X86ISD::RDSEED: return "X86ISD::RDSEED";
23657 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
23658 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
23659 case X86ISD::VPROT: return "X86ISD::VPROT";
23660 case X86ISD::VPROTI: return "X86ISD::VPROTI";
23661 case X86ISD::VPSHA: return "X86ISD::VPSHA";
23662 case X86ISD::VPSHL: return "X86ISD::VPSHL";
23663 case X86ISD::VPCOM: return "X86ISD::VPCOM";
23664 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
23665 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
23666 case X86ISD::FMADD: return "X86ISD::FMADD";
23667 case X86ISD::FMSUB: return "X86ISD::FMSUB";
23668 case X86ISD::FNMADD: return "X86ISD::FNMADD";
23669 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
23670 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
23671 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
23672 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
23673 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
23674 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
23675 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
23676 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
23677 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
23678 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
23679 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
23680 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
23681 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
23682 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
23683 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
23684 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
23685 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
23686 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
23687 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
23688 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
23689 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
23690 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
23691 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
23692 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
23693 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
23694 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
23695 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
23696 case X86ISD::XTEST: return "X86ISD::XTEST";
23697 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
23698 case X86ISD::EXPAND: return "X86ISD::EXPAND";
23699 case X86ISD::SELECT: return "X86ISD::SELECT";
23700 case X86ISD::SELECTS: return "X86ISD::SELECTS";
23701 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
23702 case X86ISD::RCP28: return "X86ISD::RCP28";
23703 case X86ISD::RCP28S: return "X86ISD::RCP28S";
23704 case X86ISD::EXP2: return "X86ISD::EXP2";
23705 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
23706 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
23707 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
23708 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
23709 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
23710 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
23711 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
23712 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
23713 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
23714 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
23715 case X86ISD::SCALEF: return "X86ISD::SCALEF";
23716 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
23717 case X86ISD::ADDS: return "X86ISD::ADDS";
23718 case X86ISD::SUBS: return "X86ISD::SUBS";
23719 case X86ISD::AVG: return "X86ISD::AVG";
23720 case X86ISD::MULHRS: return "X86ISD::MULHRS";
23721 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
23722 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
23723 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
23724 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
23725 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
23726 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
23727 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
23728 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
23729 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
23730 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
23731 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
23732 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
23733 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
23734 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
23735 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
23736 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
23737 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
23738 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
23739 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
23740 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
23741 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
23742 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
23743 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
23748 /// Return true if the addressing mode represented by AM is legal for this
23749 /// target, for a load/store of the specified type.
23750 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
23751 const AddrMode &AM, Type *Ty,
23752 unsigned AS) const {
23753 // X86 supports extremely general addressing modes.
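// Roughly, an addressing mode here has the form
//   BaseGV + BaseReg + Scale*IndexReg + BaseOffs
// and the checks below reject combinations that the encoding or the current
// code model cannot express.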
23754 CodeModel::Model M = getTargetMachine().getCodeModel();
23756 // X86 allows a sign-extended 32-bit immediate field as a displacement.
23757 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
23761 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
23763 // If a reference to this global requires an extra load, we can't fold it.
23764 if (isGlobalStubReference(GVFlags))
23767 // If BaseGV requires a register for the PIC base, we cannot also have a
23768 // BaseReg specified.
23769 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
23772 // If lower 4G is not available, then we must use rip-relative addressing.
23773 if ((M != CodeModel::Small || isPositionIndependent()) &&
23774 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
23778 switch (AM.Scale) {
23784 // These scales always work.
23789 // These scales are formed with basereg+scalereg. Only accept if there is
23790 // no base register yet.
23794 default: // Other stuff never works.
23801 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
23802 unsigned Bits = Ty->getScalarSizeInBits();
23804 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
23805 // particularly cheaper than those without.
23809 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
23810 // variable shifts just as cheap as scalar ones.
23811 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
23814 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
23815 // fully general vector.
23819 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
23820 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23822 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
23823 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
23824 return NumBits1 > NumBits2;
23827 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
23828 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23831 if (!isTypeLegal(EVT::getEVT(Ty1)))
23834 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
23836 // Assuming the caller doesn't have a zeroext or signext return parameter,
23837 // truncation all the way down to i1 is valid.
23841 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
23842 return isInt<32>(Imm);
23845 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
23846 // Can also use sub to handle negated immediates.
23847 return isInt<32>(Imm);
23850 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
23851 if (!VT1.isInteger() || !VT2.isInteger())
23853 unsigned NumBits1 = VT1.getSizeInBits();
23854 unsigned NumBits2 = VT2.getSizeInBits();
23855 return NumBits1 > NumBits2;
23858 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
23859 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23860 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
23863 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
23864 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23865 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
23868 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
23869 EVT VT1 = Val.getValueType();
23870 if (isZExtFree(VT1, VT2))
23873 if (Val.getOpcode() != ISD::LOAD)
23876 if (!VT1.isSimple() || !VT1.isInteger() ||
23877 !VT2.isSimple() || !VT2.isInteger())
23880 switch (VT1.getSimpleVT().SimpleTy) {
23885 // X86 has 8, 16, and 32-bit zero-extending loads.
23892 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
23895 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
23896 if (!Subtarget.hasAnyFMA())
23899 VT = VT.getScalarType();
23901 if (!VT.isSimple())
23904 switch (VT.getSimpleVT().SimpleTy) {
23915 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
23916 // i16 instructions are longer (0x66 prefix) and potentially slower.
23917 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
23920 /// Targets can use this to indicate that they only support *some*
23921 /// VECTOR_SHUFFLE operations, those with specific masks.
23922 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
23923 /// are assumed to be legal.
23925 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
23927 if (!VT.isSimple())
23930 // Not for i1 vectors
23931 if (VT.getSimpleVT().getScalarType() == MVT::i1)
23934 // Very little shuffling can be done for 64-bit vectors right now.
23935 if (VT.getSimpleVT().getSizeInBits() == 64)
23938 // We only care that the types being shuffled are legal. The lowering can
23939 // handle any possible shuffle mask that results.
23940 return isTypeLegal(VT.getSimpleVT());
23944 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
23946 // Just delegate to the generic legality, clear masks aren't special.
23947 return isShuffleMaskLegal(Mask, VT);
23950 //===----------------------------------------------------------------------===//
23951 // X86 Scheduler Hooks
23952 //===----------------------------------------------------------------------===//
23954 /// Utility function to emit xbegin specifying the start of an RTM region.
23955 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
23956 const TargetInstrInfo *TII) {
23957 DebugLoc DL = MI.getDebugLoc();
23959 const BasicBlock *BB = MBB->getBasicBlock();
23960 MachineFunction::iterator I = ++MBB->getIterator();
23962 // For the v = xbegin(), we generate:
//
//   thisMBB:
//     xbegin sinkMBB        # falls through to mainMBB, aborts to sinkMBB
//   mainMBB:
//     eax = -1
//   sinkMBB:
//     v = eax
23973 MachineBasicBlock *thisMBB = MBB;
23974 MachineFunction *MF = MBB->getParent();
23975 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23976 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
23977 MF->insert(I, mainMBB);
23978 MF->insert(I, sinkMBB);
23980 // Transfer the remainder of BB and its successor edges to sinkMBB.
23981 sinkMBB->splice(sinkMBB->begin(), MBB,
23982 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23983 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23987 // # fallthrough to mainMBB
23988 // # abort jumps to sinkMBB
23989 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
23990 thisMBB->addSuccessor(mainMBB);
23991 thisMBB->addSuccessor(sinkMBB);
23995 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
23996 mainMBB->addSuccessor(sinkMBB);
23999 // EAX is live into the sinkMBB
24000 sinkMBB->addLiveIn(X86::EAX);
24001 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
24002 MI.getOperand(0).getReg())
24005 MI.eraseFromParent();
24009 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24010 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24011 // in the .td file.
24012 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24013 const TargetInstrInfo *TII) {
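// These pseudos wrap the SSE4.2 string-compare instructions, which implicitly
// define XMM0: the explicit operands of the pseudo are copied onto the real
// instruction (implicit register operands are skipped), and XMM0 is then
// copied into the pseudo's result register.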
24015 switch (MI.getOpcode()) {
24016 default: llvm_unreachable("illegal opcode!");
24017 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24018 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24019 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24020 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24021 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24022 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24023 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24024 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24027 DebugLoc dl = MI.getDebugLoc();
24028 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24030 unsigned NumArgs = MI.getNumOperands();
24031 for (unsigned i = 1; i < NumArgs; ++i) {
24032 MachineOperand &Op = MI.getOperand(i);
24033 if (!(Op.isReg() && Op.isImplicit()))
24034 MIB.addOperand(Op);
24036 if (MI.hasOneMemOperand())
24037 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24039 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24040 .addReg(X86::XMM0);
24042 MI.eraseFromParent();
24046 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24047 // defs in an instruction pattern
24048 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24049 const TargetInstrInfo *TII) {
24051 switch (MI.getOpcode()) {
24052 default: llvm_unreachable("illegal opcode!");
24053 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24054 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24055 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24056 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24057 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24058 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24059 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24060 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24063 DebugLoc dl = MI.getDebugLoc();
24064 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24066 unsigned NumArgs = MI.getNumOperands(); // remove the results
24067 for (unsigned i = 1; i < NumArgs; ++i) {
24068 MachineOperand &Op = MI.getOperand(i);
24069 if (!(Op.isReg() && Op.isImplicit()))
24070 MIB.addOperand(Op);
24072 if (MI.hasOneMemOperand())
24073 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24075 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24078 MI.eraseFromParent();
24082 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24083 const X86Subtarget &Subtarget) {
24084 DebugLoc dl = MI.getDebugLoc();
24085 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
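// WRPKRU writes PKRU from EAX and requires ECX and EDX to be zero, so the
// value operand is moved into EAX and both ECX and EDX are cleared before
// the instruction itself is emitted.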
24087 // insert input VAL into EAX
24088 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24089 .addReg(MI.getOperand(0).getReg());
24090 // insert zero to ECX
24091 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24093 // insert zero to EDX
24094 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24096 // insert WRPKRU instruction
24097 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24099 MI.eraseFromParent(); // The pseudo is gone now.
24103 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24104 const X86Subtarget &Subtarget) {
24105 DebugLoc dl = MI.getDebugLoc();
24106 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24108 // insert zero to ECX
24109 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24111 // insert RDPKRU instruction
24112 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24113 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24116 MI.eraseFromParent(); // The pseudo is gone now.
24120 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24121 const X86Subtarget &Subtarget,
24123 DebugLoc dl = MI.getDebugLoc();
24124 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24125 // Address into RAX/EAX, other two args into ECX, EDX.
24126 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24127 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24128 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24129 for (int i = 0; i < X86::AddrNumOperands; ++i)
24130 MIB.addOperand(MI.getOperand(i));
24132 unsigned ValOps = X86::AddrNumOperands;
24133 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24134 .addReg(MI.getOperand(ValOps).getReg());
24135 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24136 .addReg(MI.getOperand(ValOps + 1).getReg());
24138 // The instruction doesn't actually take any operands though.
24139 BuildMI(*BB, MI, dl, TII->get(Opc));
24141 MI.eraseFromParent(); // The pseudo is gone now.
24145 MachineBasicBlock *
24146 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24147 MachineBasicBlock *MBB) const {
24148 // Emit va_arg instruction on X86-64.
24150 // Operands to this pseudo-instruction:
24151 // 0 ) Output : destination address (reg)
24152 // 1-5) Input : va_list address (addr, i64mem)
24153 // 6 ) ArgSize : Size (in bytes) of vararg type
24154 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24155 // 8 ) Align : Alignment of type
24156 // 9 ) EFLAGS (implicit-def)
24158 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24159 static_assert(X86::AddrNumOperands == 5,
24160 "VAARG_64 assumes 5 address operands");
24162 unsigned DestReg = MI.getOperand(0).getReg();
24163 MachineOperand &Base = MI.getOperand(1);
24164 MachineOperand &Scale = MI.getOperand(2);
24165 MachineOperand &Index = MI.getOperand(3);
24166 MachineOperand &Disp = MI.getOperand(4);
24167 MachineOperand &Segment = MI.getOperand(5);
24168 unsigned ArgSize = MI.getOperand(6).getImm();
24169 unsigned ArgMode = MI.getOperand(7).getImm();
24170 unsigned Align = MI.getOperand(8).getImm();
24172 // Memory Reference
24173 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24174 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24175 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24177 // Machine Information
24178 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24179 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24180 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24181 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24182 DebugLoc DL = MI.getDebugLoc();
24184 // struct va_list {
24185 //   i32   gp_offset
24186 //   i32   fp_offset
24187 //   i64   overflow_area (address)
24188 //   i64   reg_save_area (address)
24189 // }
24190 // sizeof(va_list) = 24
24191 // alignment(va_list) = 8
24193 unsigned TotalNumIntRegs = 6;
24194 unsigned TotalNumXMMRegs = 8;
24195 bool UseGPOffset = (ArgMode == 1);
24196 bool UseFPOffset = (ArgMode == 2);
24197 unsigned MaxOffset = TotalNumIntRegs * 8 +
24198 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
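// With the SysV x86-64 register save area, gp_offset ranges over [0, 48) in
// 8-byte steps (6 integer registers) and fp_offset over [48, 176) in 16-byte
// steps (8 XMM registers); MaxOffset above is the corresponding upper bound
// for the offset being checked.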
24200 // Align ArgSize to a multiple of 8.
24201 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24202 bool NeedsAlign = (Align > 8);
24204 MachineBasicBlock *thisMBB = MBB;
24205 MachineBasicBlock *overflowMBB;
24206 MachineBasicBlock *offsetMBB;
24207 MachineBasicBlock *endMBB;
24209 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24210 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24211 unsigned OffsetReg = 0;
24213 if (!UseGPOffset && !UseFPOffset) {
24214 // If we only pull from the overflow region, we don't create a branch.
24215 // We don't need to alter control flow.
24216 OffsetDestReg = 0; // unused
24217 OverflowDestReg = DestReg;
24219 offsetMBB = nullptr;
24220 overflowMBB = thisMBB;
24223 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24224 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24225 // If not, pull from overflow_area. (branch to overflowMBB)
24230 //        thisMBB
//       /        \
//  offsetMBB   overflowMBB
//       \        /
//        endMBB
24235 // Registers for the PHI in endMBB
24236 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24237 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24239 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24240 MachineFunction *MF = MBB->getParent();
24241 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24242 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24243 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24245 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24247 // Insert the new basic blocks
24248 MF->insert(MBBIter, offsetMBB);
24249 MF->insert(MBBIter, overflowMBB);
24250 MF->insert(MBBIter, endMBB);
24252 // Transfer the remainder of MBB and its successor edges to endMBB.
24253 endMBB->splice(endMBB->begin(), thisMBB,
24254 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24255 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24257 // Make offsetMBB and overflowMBB successors of thisMBB
24258 thisMBB->addSuccessor(offsetMBB);
24259 thisMBB->addSuccessor(overflowMBB);
24261 // endMBB is a successor of both offsetMBB and overflowMBB
24262 offsetMBB->addSuccessor(endMBB);
24263 overflowMBB->addSuccessor(endMBB);
24265 // Load the offset value into a register
24266 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24267 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
24271 .addDisp(Disp, UseFPOffset ? 4 : 0)
24272 .addOperand(Segment)
24273 .setMemRefs(MMOBegin, MMOEnd);
24275 // Check if there is enough room left to pull this argument.
24276 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
24278 .addImm(MaxOffset + 8 - ArgSizeA8);
24280 // Branch to "overflowMBB" if offset >= max
24281 // Fall through to "offsetMBB" otherwise
24282 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
24283 .addMBB(overflowMBB);
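  // For an 8-byte integer argument in gp mode, the check above emits roughly
  // (register names are illustrative, not fixed by this code):
  //   movl  gp_offset(%rdi), %ecx     ; OffsetReg
  //   cmpl  $48, %ecx                 ; MaxOffset + 8 - ArgSizeA8
  //   jae   overflowMBB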
24286 // In offsetMBB, emit code to use the reg_save_area.
24288 assert(OffsetReg != 0);
24290 // Read the reg_save_area address.
24291 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
24292 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
24297 .addOperand(Segment)
24298 .setMemRefs(MMOBegin, MMOEnd);
24300 // Zero-extend the offset
24301 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
24302 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
24305 .addImm(X86::sub_32bit);
24307 // Add the offset to the reg_save_area to get the final address.
24308 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
24309 .addReg(OffsetReg64)
24310 .addReg(RegSaveReg);
24312 // Compute the offset for the next argument
24313 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24314 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
24316 .addImm(UseFPOffset ? 16 : 8);
24318 // Store it back into the va_list.
24319 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
24323 .addDisp(Disp, UseFPOffset ? 4 : 0)
24324 .addOperand(Segment)
24325 .addReg(NextOffsetReg)
24326 .setMemRefs(MMOBegin, MMOEnd);
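  // The offsetMBB path built above thus computes, roughly (illustrative
  // register names):
  //   movq  16(%rdi), %rax            ; reg_save_area
  //   movl  %ecx, %ecx                ; zero-extend the 32-bit offset
  //   addq  %rcx, %rax                ; OffsetDestReg = argument address
  //   addl  $8, %ecx                  ; $16 for an XMM-class argument
  //   movl  %ecx, gp_offset(%rdi)     ; store the updated offset back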
24329 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
24334 // Emit code to use overflow area
24337 // Load the overflow_area address into a register.
24338 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
24339 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
24344 .addOperand(Segment)
24345 .setMemRefs(MMOBegin, MMOEnd);
  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align - 1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align - 1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }
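  // For example (hypothetical values): with OverflowAddrReg = 0x7fff1004 and
  // Align = 16, TmpReg becomes 0x7fff1013 and OverflowDestReg 0x7fff1010,
  // i.e. the overflow address rounded up to the next 16-byte boundary.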
24367 // Compute the next overflow address after this argument.
24368 // (the overflow address should be kept 8-byte aligned)
24369 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
24370 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
24371 .addReg(OverflowDestReg)
24372 .addImm(ArgSizeA8);
24374 // Store the new overflow address.
24375 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
24380 .addOperand(Segment)
24381 .addReg(NextAddrReg)
24382 .setMemRefs(MMOBegin, MMOEnd);
24384 // If we branched, emit the PHI to the front of endMBB.
24386 BuildMI(*endMBB, endMBB->begin(), DL,
24387 TII->get(X86::PHI), DestReg)
24388 .addReg(OffsetDestReg).addMBB(offsetMBB)
24389 .addReg(OverflowDestReg).addMBB(overflowMBB);
24392 // Erase the pseudo instruction
24393 MI.eraseFromParent();
24398 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
24399 MachineInstr &MI, MachineBasicBlock *MBB) const {
24400 // Emit code to save XMM registers to the stack. The ABI says that the
24401 // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them;
  // however, this code takes a simpler approach and just executes all
24404 // of the stores if %al is non-zero. It's less code, and it's probably
24405 // easier on the hardware branch predictor, and stores aren't all that
24406 // expensive anyway.
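  //
  // Conceptually (illustrative only), the emitted code behaves like:
  //   if (%al != 0)
  //     for (i = 0; i < NumXMMArgRegs; ++i)
  //       *(RegSaveFrame + VarArgsFPOffset + i * 16) = xmm[i];
  // with the loop fully unrolled into the MOVAPS stores below.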
24408 // Create the new basic blocks. One block contains all the XMM stores,
24409 // and one block is the final destination regardless of whether any
24410 // stores were performed.
24411 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24412 MachineFunction *F = MBB->getParent();
24413 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24414 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
24415 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
24416 F->insert(MBBIter, XMMSaveMBB);
24417 F->insert(MBBIter, EndMBB);
24419 // Transfer the remainder of MBB and its successor edges to EndMBB.
24420 EndMBB->splice(EndMBB->begin(), MBB,
24421 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24422 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
24424 // The original block will now fall through to the XMM save block.
24425 MBB->addSuccessor(XMMSaveMBB);
24426 // The XMMSaveMBB will fall through to the end block.
24427 XMMSaveMBB->addSuccessor(EndMBB);
24429 // Now add the instructions.
24430 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24431 DebugLoc DL = MI.getDebugLoc();
24433 unsigned CountReg = MI.getOperand(0).getReg();
24434 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
24435 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
24437 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
24438 // If %al is 0, branch around the XMM save block.
24439 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
24440 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
24441 MBB->addSuccessor(EndMBB);
24444 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
24445 // that was just emitted, but clearly shouldn't be "saved".
24446 assert((MI.getNumOperands() <= 3 ||
24447 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
24448 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
24449 "Expected last argument to be EFLAGS");
24450 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
24451 // In the XMM save block, save all the XMM argument registers.
24452 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
24453 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
24454 MachineMemOperand *MMO = F->getMachineMemOperand(
24455 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
24456 MachineMemOperand::MOStore,
24457 /*Size=*/16, /*Align=*/16);
24458 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
24459 .addFrameIndex(RegSaveFrameIndex)
24460 .addImm(/*Scale=*/1)
24461 .addReg(/*IndexReg=*/0)
24462 .addImm(/*Disp=*/Offset)
24463 .addReg(/*Segment=*/0)
24464 .addReg(MI.getOperand(i).getReg())
24465 .addMemOperand(MMO);
24468 MI.eraseFromParent(); // The pseudo instruction is gone now.
24473 // The EFLAGS operand of SelectItr might be missing a kill marker
24474 // because there were multiple uses of EFLAGS, and ISel didn't know
24475 // which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill marker value.
24478 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
24479 MachineBasicBlock* BB,
24480 const TargetRegisterInfo* TRI) {
24481 // Scan forward through BB for a use/def of EFLAGS.
24482 MachineBasicBlock::iterator miI(std::next(SelectItr));
24483 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
24484 const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }
24491 // If we hit the end of the block, check whether EFLAGS is live into a
24493 if (miI == BB->end()) {
24494 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
24495 sEnd = BB->succ_end();
24496 sItr != sEnd; ++sItr) {
24497 MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }
24503 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
24504 // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
24509 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
24510 // together with other CMOV pseudo-opcodes into a single basic-block with
// a conditional jump around it.
24512 static bool isCMOVPseudo(MachineInstr &MI) {
24513 switch (MI.getOpcode()) {
24514 case X86::CMOV_FR32:
24515 case X86::CMOV_FR64:
24516 case X86::CMOV_GR8:
24517 case X86::CMOV_GR16:
24518 case X86::CMOV_GR32:
24519 case X86::CMOV_RFP32:
24520 case X86::CMOV_RFP64:
24521 case X86::CMOV_RFP80:
24522 case X86::CMOV_V2F64:
24523 case X86::CMOV_V2I64:
24524 case X86::CMOV_V4F32:
24525 case X86::CMOV_V4F64:
24526 case X86::CMOV_V4I64:
24527 case X86::CMOV_V16F32:
24528 case X86::CMOV_V8F32:
24529 case X86::CMOV_V8F64:
24530 case X86::CMOV_V8I64:
24531 case X86::CMOV_V8I1:
24532 case X86::CMOV_V16I1:
24533 case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}
24542 MachineBasicBlock *
24543 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
24544 MachineBasicBlock *BB) const {
24545 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24546 DebugLoc DL = MI.getDebugLoc();
24548 // To "insert" a SELECT_CC instruction, we actually have to insert the
24549 // diamond control-flow pattern. The incoming instruction knows the
24550 // destination vreg to set, the condition code register to branch on, the
24551 // true/false values to select between, and a branch opcode to use.
24552 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24553 MachineFunction::iterator It = ++BB->getIterator();
  //   cmpTY ccX, r1, r2
  //   bCC sinkMBB
  //   fallthrough --> copy0MBB
24561 MachineBasicBlock *thisMBB = BB;
24562 MachineFunction *F = BB->getParent();
24564 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
24565 // as described above, by inserting a BB, and then making a PHI at the join
24566 // point to select the true and false operands of the CMOV in the PHI.
24568 // The code also handles two different cases of multiple CMOV opcodes
24572 // In this case, there are multiple CMOVs in a row, all which are based on
24573 // the same condition setting (or the exact opposite condition setting).
24574 // In this case we can lower all the CMOVs using a single inserted BB, and
24575 // then make a number of PHIs at the join point to model the CMOVs. The only
24576 // trickiness here, is that in a case like:
24578 // t2 = CMOV cond1 t1, f1
24579 // t3 = CMOV cond1 t2, f2
24581 // when rewriting this into PHIs, we have to perform some renaming on the
24582 // temps since you cannot have a PHI operand refer to a PHI result earlier
24583 // in the same block. The "simple" but wrong lowering would be:
24585 // t2 = PHI t1(BB1), f1(BB2)
24586 // t3 = PHI t2(BB1), f2(BB2)
24588 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
24589 // renaming is to note that on the path through BB1, t2 is really just a
24590 // copy of t1, and do that renaming, properly generating:
24592 // t2 = PHI t1(BB1), f1(BB2)
24593 // t3 = PHI t1(BB1), f2(BB2)
24595 // Case 2, we lower cascaded CMOVs such as
24597 // (CMOV (CMOV F, T, cc1), T, cc2)
// to two successive branches. For that, we look for another CMOV as the
24600 // following instruction.
24602 // Without this, we would add a PHI between the two jumps, which ends up
24603 // creating a few copies all around. For instance, for
24605 // (sitofp (zext (fcmp une)))
24607 // we would generate:
24609 // ucomiss %xmm1, %xmm0
24610 // movss <1.0f>, %xmm0
24611 // movaps %xmm0, %xmm1
24613 // xorps %xmm1, %xmm1
24616 // movaps %xmm1, %xmm0
24620 // because this custom-inserter would have generated:
24632 // A: X = ...; Y = ...
24634 // C: Z = PHI [X, A], [Y, B]
24636 // E: PHI [X, C], [Z, D]
24638 // If we lower both CMOVs in a single step, we can instead generate:
24650 // A: X = ...; Y = ...
24652 // E: PHI [X, A], [X, C], [Y, D]
24654 // Which, in our sitofp/fcmp example, gives us something like:
24656 // ucomiss %xmm1, %xmm0
24657 // movss <1.0f>, %xmm0
24660 // xorps %xmm0, %xmm0
24664 MachineInstr *CascadedCMOV = nullptr;
24665 MachineInstr *LastCMOV = &MI;
24666 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
24667 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
24668 MachineBasicBlock::iterator NextMIIt =
24669 std::next(MachineBasicBlock::iterator(MI));
24671 // Check for case 1, where there are multiple CMOVs with the same condition
24672 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
24673 // number of jumps the most.
24675 if (isCMOVPseudo(MI)) {
24676 // See if we have a string of CMOVS with the same condition.
24677 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
24678 (NextMIIt->getOperand(3).getImm() == CC ||
24679 NextMIIt->getOperand(3).getImm() == OppCC)) {
24680 LastCMOV = &*NextMIIt;
24685 // This checks for case 2, but only do this if we didn't already find
24686 // case 1, as indicated by LastCMOV == MI.
24687 if (LastCMOV == &MI && NextMIIt != BB->end() &&
24688 NextMIIt->getOpcode() == MI.getOpcode() &&
24689 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
24690 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
24691 NextMIIt->getOperand(1).isKill()) {
24692 CascadedCMOV = &*NextMIIt;
24695 MachineBasicBlock *jcc1MBB = nullptr;
24697 // If we have a cascaded CMOV, we lower it to two successive branches to
24698 // the same block. EFLAGS is used by both, so mark it as live in the second.
24699 if (CascadedCMOV) {
24700 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
24701 F->insert(It, jcc1MBB);
24702 jcc1MBB->addLiveIn(X86::EFLAGS);
24705 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
24706 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
24707 F->insert(It, copy0MBB);
24708 F->insert(It, sinkMBB);
24710 // If the EFLAGS register isn't dead in the terminator, then claim that it's
24711 // live into the sink and copy blocks.
24712 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
24714 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
24715 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
24716 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
24717 copy0MBB->addLiveIn(X86::EFLAGS);
24718 sinkMBB->addLiveIn(X86::EFLAGS);
24721 // Transfer the remainder of BB and its successor edges to sinkMBB.
24722 sinkMBB->splice(sinkMBB->begin(), BB,
24723 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
24724 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
24726 // Add the true and fallthrough blocks as its successors.
24727 if (CascadedCMOV) {
24728 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
24729 BB->addSuccessor(jcc1MBB);
24731 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
24732 // jump to the sinkMBB.
24733 jcc1MBB->addSuccessor(copy0MBB);
24734 jcc1MBB->addSuccessor(sinkMBB);
24736 BB->addSuccessor(copy0MBB);
24739 // The true block target of the first (or only) branch is always sinkMBB.
24740 BB->addSuccessor(sinkMBB);
24742 // Create the conditional branch instruction.
24743 unsigned Opc = X86::GetCondBranchFromCond(CC);
24744 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
24746 if (CascadedCMOV) {
24747 unsigned Opc2 = X86::GetCondBranchFromCond(
24748 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
24749 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
24753 // %FalseValue = ...
24754 // # fallthrough to sinkMBB
24755 copy0MBB->addSuccessor(sinkMBB);
24758 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
24760 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
24761 MachineBasicBlock::iterator MIItEnd =
24762 std::next(MachineBasicBlock::iterator(LastCMOV));
24763 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
24764 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
24765 MachineInstrBuilder MIB;
24767 // As we are creating the PHIs, we have to be careful if there is more than
24768 // one. Later CMOVs may reference the results of earlier CMOVs, but later
24769 // PHIs have to reference the individual true/false inputs from earlier PHIs.
  // That also means that PHI construction must work forward from earlier to
  // later, and that the code must maintain a mapping from each earlier PHI's
  // destination register to the pair of registers that went into that PHI.
24774 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
24775 unsigned DestReg = MIIt->getOperand(0).getReg();
24776 unsigned Op1Reg = MIIt->getOperand(1).getReg();
24777 unsigned Op2Reg = MIIt->getOperand(2).getReg();
24779 // If this CMOV we are generating is the opposite condition from
24780 // the jump we generated, then we have to swap the operands for the
24781 // PHI that is going to be generated.
24782 if (MIIt->getOperand(3).getImm() == OppCC)
24783 std::swap(Op1Reg, Op2Reg);
24785 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
24786 Op1Reg = RegRewriteTable[Op1Reg].first;
24788 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
24789 Op2Reg = RegRewriteTable[Op2Reg].second;
24791 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
24792 TII->get(X86::PHI), DestReg)
24793 .addReg(Op1Reg).addMBB(copy0MBB)
24794 .addReg(Op2Reg).addMBB(thisMBB);
24796 // Add this PHI to the rewrite table.
24797 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
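    // E.g. (illustrative): for "t2 = CMOV t1, f1, cc; t3 = CMOV t2, f2, cc",
    // the first iteration records RegRewriteTable[t2] = (t1, f1), so when the
    // second PHI is built its t2 operand is rewritten to t1 (or to f1 on the
    // other edge), which is exactly the renaming described in the comment
    // before this loop.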
24800 // If we have a cascaded CMOV, the second Jcc provides the same incoming
24801 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
24802 if (CascadedCMOV) {
24803 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
24804 // Copy the PHI result to the register defined by the second CMOV.
24805 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
24806 DL, TII->get(TargetOpcode::COPY),
24807 CascadedCMOV->getOperand(0).getReg())
24808 .addReg(MI.getOperand(0).getReg());
24809 CascadedCMOV->eraseFromParent();
24812 // Now remove the CMOV(s).
24813 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
24814 (MIIt++)->eraseFromParent();
24819 MachineBasicBlock *
24820 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
24821 MachineBasicBlock *BB) const {
24822 // Combine the following atomic floating-point modification pattern:
24823 // a.store(reg OP a.load(acquire), release)
24824 // Transform them into:
24825 // OPss (%gpr), %xmm
24826 // movss %xmm, (%gpr)
24827 // Or sd equivalent for 64-bit operations.
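  //
  // For example (illustrative only), for a 32-bit float this turns
  //   %old = load atomic float, float* %p acquire
  //   %new = fadd float %old, %v
  //   store atomic float %new, float* %p release
  // into
  //   addss (%rdi), %xmm0
  //   movss %xmm0, (%rdi)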
  unsigned FOp, MOp;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
  case X86::RELEASE_FADD32mr:
    FOp = X86::ADDSSrm;
    MOp = X86::MOVSSmr;
    break;
  case X86::RELEASE_FADD64mr:
    FOp = X86::ADDSDrm;
    MOp = X86::MOVSDmr;
    break;
  }
24840 const X86InstrInfo *TII = Subtarget.getInstrInfo();
24841 DebugLoc DL = MI.getDebugLoc();
24842 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
24843 unsigned ValOpIdx = X86::AddrNumOperands;
24844 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
24845 MachineInstrBuilder MIB =
24846 BuildMI(*BB, MI, DL, TII->get(FOp),
24847 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
24849 for (int i = 0; i < X86::AddrNumOperands; ++i) {
24850 MachineOperand &Operand = MI.getOperand(i);
24851 // Clear any kill flags on register operands as we'll create a second
24852 // instruction using the same address operands.
24853 if (Operand.isReg())
24854 Operand.setIsKill(false);
24855 MIB.addOperand(Operand);
24857 MachineInstr *FOpMI = MIB;
24858 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
24859 for (int i = 0; i < X86::AddrNumOperands; ++i)
24860 MIB.addOperand(MI.getOperand(i));
24861 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
24862 MI.eraseFromParent(); // The pseudo instruction is gone now.
24866 MachineBasicBlock *
24867 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
24868 MachineBasicBlock *BB) const {
24869 MachineFunction *MF = BB->getParent();
24870 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24871 DebugLoc DL = MI.getDebugLoc();
24872 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24874 assert(MF->shouldSplitStack());
24876 const bool Is64Bit = Subtarget.is64Bit();
24877 const bool IsLP64 = Subtarget.isTarget64BitLP64();
24879 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
24880 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
  // BB:
  //   ... [Till the alloca]
  //   If the stacklet is not large enough, jump to mallocMBB
  // bumpMBB:
  //   Allocate by subtracting from RSP; jump to continueMBB
  // mallocMBB:
  //   Allocate by call to runtime
  // continueMBB:
  //   [rest of original BB]
24898 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24899 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24900 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24902 MachineRegisterInfo &MRI = MF->getRegInfo();
24903 const TargetRegisterClass *AddrRegClass =
24904 getRegClassFor(getPointerTy(MF->getDataLayout()));
24906 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24907 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24908 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
24909 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
24910 sizeVReg = MI.getOperand(1).getReg(),
24912 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
24914 MachineFunction::iterator MBBIter = ++BB->getIterator();
24916 MF->insert(MBBIter, bumpMBB);
24917 MF->insert(MBBIter, mallocMBB);
24918 MF->insert(MBBIter, continueMBB);
24920 continueMBB->splice(continueMBB->begin(), BB,
24921 std::next(MachineBasicBlock::iterator(MI)), BB->end());
24922 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
24924 // Add code to the main basic block to check if the stack limit has been hit,
24925 // and if so, jump to mallocMBB otherwise to bumpMBB.
24926 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
24927 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
24928 .addReg(tmpSPVReg).addReg(sizeVReg);
24929 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
24930 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
24931 .addReg(SPLimitVReg);
24932 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
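  // The check above emits, roughly, for the LP64 case (illustrative register
  // names; the alloca size is shown in %rcx):
  //   movq  %rsp, %rax               ; tmpSPVReg = current SP
  //   subq  %rcx, %rax               ; SPLimitVReg = SP - alloca size
  //   cmpq  %rax, %fs:0x70           ; TLS stack limit vs. the would-be SP
  //   jg    mallocMBB                ; limit greater -> stacklet too small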
24934 // bumpMBB simply decreases the stack pointer, since we know the current
24935 // stacklet has enough space.
24936 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
24937 .addReg(SPLimitVReg);
24938 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
24939 .addReg(SPLimitVReg);
24940 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24942 // Calls into a routine in libgcc to allocate more space from the heap.
24943 const uint32_t *RegMask =
24944 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
24946 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
24948 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24949 .addExternalSymbol("__morestack_allocate_stack_space")
24950 .addRegMask(RegMask)
24951 .addReg(X86::RDI, RegState::Implicit)
24952 .addReg(X86::RAX, RegState::ImplicitDefine);
24953 } else if (Is64Bit) {
24954 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
24956 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24957 .addExternalSymbol("__morestack_allocate_stack_space")
24958 .addRegMask(RegMask)
24959 .addReg(X86::EDI, RegState::Implicit)
24960 .addReg(X86::EAX, RegState::ImplicitDefine);
24962 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
24964 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
24965 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
24966 .addExternalSymbol("__morestack_allocate_stack_space")
24967 .addRegMask(RegMask)
24968 .addReg(X86::EAX, RegState::ImplicitDefine);
24972 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
24975 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
24976 .addReg(IsLP64 ? X86::RAX : X86::EAX);
24977 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24979 // Set up the CFG correctly.
24980 BB->addSuccessor(bumpMBB);
24981 BB->addSuccessor(mallocMBB);
24982 mallocMBB->addSuccessor(continueMBB);
24983 bumpMBB->addSuccessor(continueMBB);
24985 // Take care of the PHI nodes.
24986 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
24987 MI.getOperand(0).getReg())
24988 .addReg(mallocPtrVReg)
24990 .addReg(bumpSPPtrVReg)
24993 // Delete the original pseudo instruction.
24994 MI.eraseFromParent();
24997 return continueMBB;
25000 MachineBasicBlock *
25001 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25002 MachineBasicBlock *BB) const {
25003 MachineFunction *MF = BB->getParent();
25004 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25005 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25006 DebugLoc DL = MI.getDebugLoc();
25008 assert(!isAsynchronousEHPersonality(
25009 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25010 "SEH does not use catchret!");
25012 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25013 if (!Subtarget.is32Bit())
25016 // C++ EH creates a new target block to hold the restore code, and wires up
25017 // the new block to the return destination with a normal JMP_4.
25018 MachineBasicBlock *RestoreMBB =
25019 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25020 assert(BB->succ_size() == 1);
25021 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25022 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25023 BB->addSuccessor(RestoreMBB);
25024 MI.getOperand(0).setMBB(RestoreMBB);
25026 auto RestoreMBBI = RestoreMBB->begin();
25027 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25028 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25032 MachineBasicBlock *
25033 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25034 MachineBasicBlock *BB) const {
25035 MachineFunction *MF = BB->getParent();
25036 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25037 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25038 // Only 32-bit SEH requires special handling for catchpad.
25039 if (IsSEH && Subtarget.is32Bit()) {
25040 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25041 DebugLoc DL = MI.getDebugLoc();
25042 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25044 MI.eraseFromParent();
25048 MachineBasicBlock *
25049 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25050 MachineBasicBlock *BB) const {
25051 // So, here we replace TLSADDR with the sequence:
25052 // adjust_stackdown -> TLSADDR -> adjust_stackup.
  // We need this because TLSADDR is lowered into calls
  // inside MC; therefore, without the two markers, shrink-wrapping
  // may push the prologue/epilogue past them.
25056 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25057 DebugLoc DL = MI.getDebugLoc();
25058 MachineFunction &MF = *BB->getParent();
25060 // Emit CALLSEQ_START right before the instruction.
25061 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25062 MachineInstrBuilder CallseqStart =
25063 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25064 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25066 // Emit CALLSEQ_END right after the instruction.
25067 // We don't call erase from parent because we want to keep the
25068 // original instruction around.
25069 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25070 MachineInstrBuilder CallseqEnd =
25071 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25072 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
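  // The block now contains, in order: the call-frame setup pseudo, the
  // original TLSADDR (which MC later expands into a call), and the call-frame
  // destroy pseudo, so passes such as shrink-wrapping see the implicit call.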
25077 MachineBasicBlock *
25078 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25079 MachineBasicBlock *BB) const {
25080 // This is pretty easy. We're taking the value that we received from
25081 // our load from the relocation, sticking it in either RDI (x86-64)
25082 // or EAX and doing an indirect call. The return value will then
25083 // be in the normal return register.
25084 MachineFunction *F = BB->getParent();
25085 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25086 DebugLoc DL = MI.getDebugLoc();
25088 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25089 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25091 // Get a register mask for the lowered call.
25092 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25093 // proper register mask.
25094 const uint32_t *RegMask =
25095 Subtarget.is64Bit() ?
25096 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25097 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25098 if (Subtarget.is64Bit()) {
25099 MachineInstrBuilder MIB =
25100 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25104 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25105 MI.getOperand(3).getTargetFlags())
25107 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25108 addDirectMem(MIB, X86::RDI);
25109 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25110 } else if (!isPositionIndependent()) {
25111 MachineInstrBuilder MIB =
25112 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25116 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25117 MI.getOperand(3).getTargetFlags())
25119 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25120 addDirectMem(MIB, X86::EAX);
25121 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25123 MachineInstrBuilder MIB =
25124 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25125 .addReg(TII->getGlobalBaseReg(F))
25128 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25129 MI.getOperand(3).getTargetFlags())
25131 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25132 addDirectMem(MIB, X86::EAX);
25133 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25136 MI.eraseFromParent(); // The pseudo instruction is gone now.
25140 MachineBasicBlock *
25141 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25142 MachineBasicBlock *MBB) const {
25143 DebugLoc DL = MI.getDebugLoc();
25144 MachineFunction *MF = MBB->getParent();
25145 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25146 MachineRegisterInfo &MRI = MF->getRegInfo();
25148 const BasicBlock *BB = MBB->getBasicBlock();
25149 MachineFunction::iterator I = ++MBB->getIterator();
25151 // Memory Reference
25152 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25153 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25156 unsigned MemOpndSlot = 0;
25158 unsigned CurOp = 0;
  unsigned DstReg = MI.getOperand(CurOp++).getReg();
25161 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25162 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25163 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25164 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25166 MemOpndSlot = CurOp;
25168 MVT PVT = getPointerTy(MF->getDataLayout());
25169 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25170 "Invalid Pointer Size!");
  // For v = setjmp(buf), we generate
  //
  // thisMBB:    buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
  //             SjLjSetup restoreMBB
  // mainMBB:    v_main = 0
  // sinkMBB:    v = phi(v_main, v_restore)
  // restoreMBB: if base pointer being used, load it from frame
  //             v_restore = 1
25188 MachineBasicBlock *thisMBB = MBB;
25189 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25190 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25191 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25192 MF->insert(I, mainMBB);
25193 MF->insert(I, sinkMBB);
25194 MF->push_back(restoreMBB);
25195 restoreMBB->setHasAddressTaken();
25197 MachineInstrBuilder MIB;
25199 // Transfer the remainder of BB and its successor edges to sinkMBB.
25200 sinkMBB->splice(sinkMBB->begin(), MBB,
25201 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25202 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25205 unsigned PtrStoreOpc = 0;
25206 unsigned LabelReg = 0;
25207 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25208 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25209 !isPositionIndependent();
25211 // Prepare IP either in reg or imm.
25212 if (!UseImmLabel) {
25213 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25214 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25215 LabelReg = MRI.createVirtualRegister(PtrRC);
25216 if (Subtarget.is64Bit()) {
25217 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
25221 .addMBB(restoreMBB)
25224 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
25225 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
25226 .addReg(XII->getGlobalBaseReg(MF))
25229 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
25233 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25235 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25236 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25237 if (i == X86::AddrDisp)
25238 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
25240 MIB.addOperand(MI.getOperand(MemOpndSlot + i));
25243 MIB.addReg(LabelReg);
25245 MIB.addMBB(restoreMBB);
25246 MIB.setMemRefs(MMOBegin, MMOEnd);
25248 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25249 .addMBB(restoreMBB);
25251 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25252 MIB.addRegMask(RegInfo->getNoPreservedMask());
25253 thisMBB->addSuccessor(mainMBB);
25254 thisMBB->addSuccessor(restoreMBB);
25258 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
25259 mainMBB->addSuccessor(sinkMBB);
25262 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25263 TII->get(X86::PHI), DstReg)
25264 .addReg(mainDstReg).addMBB(mainMBB)
25265 .addReg(restoreDstReg).addMBB(restoreMBB);
25268 if (RegInfo->hasBasePointer(*MF)) {
25269 const bool Uses64BitFramePtr =
25270 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25271 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25272 X86FI->setRestoreBasePointer(MF);
25273 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25274 unsigned BasePtr = RegInfo->getBaseRegister();
25275 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25276 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25277 FramePtr, true, X86FI->getRestoreBasePointerOffset())
25278 .setMIFlag(MachineInstr::FrameSetup);
25280 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
25281 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25282 restoreMBB->addSuccessor(sinkMBB);
25284 MI.eraseFromParent();
25288 MachineBasicBlock *
25289 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
25290 MachineBasicBlock *MBB) const {
25291 DebugLoc DL = MI.getDebugLoc();
25292 MachineFunction *MF = MBB->getParent();
25293 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25294 MachineRegisterInfo &MRI = MF->getRegInfo();
25296 // Memory Reference
25297 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25298 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25300 MVT PVT = getPointerTy(MF->getDataLayout());
25301 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25302 "Invalid Pointer Size!");
25304 const TargetRegisterClass *RC =
25305 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25306 unsigned Tmp = MRI.createVirtualRegister(RC);
25307 // Since FP is only updated here but NOT referenced, it's treated as GPR.
25308 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25309 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
25310 unsigned SP = RegInfo->getStackRegister();
25312 MachineInstrBuilder MIB;
25314 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25315 const int64_t SPOffset = 2 * PVT.getStoreSize();
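  // Buffer layout assumed here (each slot is PVT-sized, matching the setjmp
  // emitter above): slot 0 holds the frame pointer, slot 1 (LabelOffset) the
  // resume address, and slot 2 (SPOffset) the stack pointer.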
25317 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
25318 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
25321 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
25322 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
25323 MIB.addOperand(MI.getOperand(i));
25324 MIB.setMemRefs(MMOBegin, MMOEnd);
25326 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
25327 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25328 if (i == X86::AddrDisp)
25329 MIB.addDisp(MI.getOperand(i), LabelOffset);
25331 MIB.addOperand(MI.getOperand(i));
25333 MIB.setMemRefs(MMOBegin, MMOEnd);
25335 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
25336 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25337 if (i == X86::AddrDisp)
25338 MIB.addDisp(MI.getOperand(i), SPOffset);
25340 MIB.addOperand(MI.getOperand(i));
25342 MIB.setMemRefs(MMOBegin, MMOEnd);
25344 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
25346 MI.eraseFromParent();
25350 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
25351 MachineBasicBlock *MBB,
25352 MachineBasicBlock *DispatchBB,
25354 DebugLoc DL = MI.getDebugLoc();
25355 MachineFunction *MF = MBB->getParent();
25356 MachineRegisterInfo *MRI = &MF->getRegInfo();
25357 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25359 MVT PVT = getPointerTy(MF->getDataLayout());
25360 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
25365 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25366 !isPositionIndependent();
25369 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25371 const TargetRegisterClass *TRC =
25372 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25373 VR = MRI->createVirtualRegister(TRC);
25374 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25376 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
25378 if (Subtarget.is64Bit())
25379 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
25383 .addMBB(DispatchBB)
25386 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
25387 .addReg(0) /* XII->getGlobalBaseReg(MF) */
25390 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
25394 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
25395 addFrameReference(MIB, FI, 36);
25397 MIB.addMBB(DispatchBB);
25402 MachineBasicBlock *
25403 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
25404 MachineBasicBlock *BB) const {
25405 DebugLoc DL = MI.getDebugLoc();
25406 MachineFunction *MF = BB->getParent();
25407 MachineFrameInfo &MFI = MF->getFrameInfo();
25408 MachineRegisterInfo *MRI = &MF->getRegInfo();
25409 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25410 int FI = MFI.getFunctionContextIndex();
25412 // Get a mapping of the call site numbers to all of the landing pads they're
25413 // associated with.
25414 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
25415 unsigned MaxCSNum = 0;
25416 for (auto &MBB : *MF) {
25417 if (!MBB.isEHPad())
25420 MCSymbol *Sym = nullptr;
25421 for (const auto &MI : MBB) {
25422 if (MI.isDebugValue())
25425 assert(MI.isEHLabel() && "expected EH_LABEL");
25426 Sym = MI.getOperand(0).getMCSymbol();
25430 if (!MF->hasCallSiteLandingPad(Sym))
25433 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
25434 CallSiteNumToLPad[CSI].push_back(&MBB);
25435 MaxCSNum = std::max(MaxCSNum, CSI);
25439 // Get an ordered list of the machine basic blocks for the jump table.
25440 std::vector<MachineBasicBlock *> LPadList;
25441 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
25442 LPadList.reserve(CallSiteNumToLPad.size());
25444 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
25445 for (auto &LP : CallSiteNumToLPad[CSI]) {
25446 LPadList.push_back(LP);
25447 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
25451 assert(!LPadList.empty() &&
25452 "No landing pad destinations for the dispatch jump table!");
25454 // Create the MBBs for the dispatch code.
25456 // Shove the dispatch's address into the return slot in the function context.
25457 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
25458 DispatchBB->setIsEHPad(true);
25460 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
25461 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
25462 DispatchBB->addSuccessor(TrapBB);
25464 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
25465 DispatchBB->addSuccessor(DispContBB);
25468 MF->push_back(DispatchBB);
25469 MF->push_back(DispContBB);
25470 MF->push_back(TrapBB);
25472 // Insert code into the entry block that creates and registers the function
25474 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
25476 // Create the jump table and associated information
25477 MachineJumpTableInfo *JTI =
25478 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
25479 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
25481 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
25482 const X86RegisterInfo &RI = XII->getRegisterInfo();
25484 // Add a register mask with no preserved registers. This results in all
25485 // registers being marked as clobbered.
25486 if (RI.hasBasePointer(*MF)) {
25487 const bool FPIs64Bit =
25488 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25489 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
25490 MFI->setRestoreBasePointer(MF);
25492 unsigned FP = RI.getFrameRegister(*MF);
25493 unsigned BP = RI.getBaseRegister();
25494 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
25495 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
25496 MFI->getRestoreBasePointerOffset())
25497 .addRegMask(RI.getNoPreservedMask());
25499 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
25500 .addRegMask(RI.getNoPreservedMask());
25503 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25504 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
25506 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
25508 .addImm(LPadList.size());
25509 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
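  // The dispatch code built here and below behaves roughly like the following
  // sketch (illustrative):
  //   call_site = <i32 read from the function context at FI>;
  //   if (call_site > LPadList.size()) trap();
  //   goto *JumpTable[call_site - 1];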
25511 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25512 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
25515 BuildMI(DispContBB, DL,
25516 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
25518 .addImm(Subtarget.is64Bit() ? 8 : 4)
25520 .addJumpTableIndex(MJTI)
25523 // Add the jump table entries as successors to the MBB.
25524 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
25525 for (auto &LP : LPadList)
25526 if (SeenMBBs.insert(LP).second)
25527 DispContBB->addSuccessor(LP);
25529 // N.B. the order the invoke BBs are processed in doesn't matter here.
25530 SmallVector<MachineBasicBlock *, 64> MBBLPads;
25531 const MCPhysReg *SavedRegs =
25532 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
25533 for (MachineBasicBlock *MBB : InvokeBBs) {
25534 // Remove the landing pad successor from the invoke block and replace it
25535 // with the new dispatch block.
25536 // Keep a copy of Successors since it's modified inside the loop.
25537 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
25539 // FIXME: Avoid quadratic complexity.
25540 for (auto MBBS : Successors) {
25541 if (MBBS->isEHPad()) {
25542 MBB->removeSuccessor(MBBS);
25543 MBBLPads.push_back(MBBS);
25547 MBB->addSuccessor(DispatchBB);
25549 // Find the invoke call and mark all of the callee-saved registers as
25550 // 'implicit defined' so that they're spilled. This prevents code from
25551 // moving instructions to before the EH block, where they will never be
    for (auto &II : reverse(*MBB)) {
      if (!II.isCall())
        continue;
      DenseMap<unsigned, bool> DefRegs;
      for (auto &MOp : II.operands())
        if (MOp.isReg())
          DefRegs[MOp.getReg()] = true;
      MachineInstrBuilder MIB(*MF, &II);
      for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
        unsigned Reg = SavedRegs[RI];
        if (!DefRegs[Reg])
          MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
      }
      break;
    }
25573 // Mark all former landing pads as non-landing pads. The dispatch is the only
25574 // landing pad now.
25575 for (auto &LP : MBBLPads)
25576 LP->setIsEHPad(false);
25578 // The instruction is gone now.
25579 MI.eraseFromParent();
25583 MachineBasicBlock *
25584 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
25585 MachineBasicBlock *BB) const {
25586 MachineFunction *MF = BB->getParent();
25587 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25588 DebugLoc DL = MI.getDebugLoc();
25590 switch (MI.getOpcode()) {
25591 default: llvm_unreachable("Unexpected instr type to insert");
25592 case X86::TAILJMPd64:
25593 case X86::TAILJMPr64:
25594 case X86::TAILJMPm64:
25595 case X86::TAILJMPr64_REX:
25596 case X86::TAILJMPm64_REX:
25597 llvm_unreachable("TAILJMP64 would not be touched here.");
25598 case X86::TCRETURNdi64:
25599 case X86::TCRETURNri64:
25600 case X86::TCRETURNmi64:
25602 case X86::TLS_addr32:
25603 case X86::TLS_addr64:
25604 case X86::TLS_base_addr32:
25605 case X86::TLS_base_addr64:
25606 return EmitLoweredTLSAddr(MI, BB);
25607 case X86::CATCHRET:
25608 return EmitLoweredCatchRet(MI, BB);
25609 case X86::CATCHPAD:
25610 return EmitLoweredCatchPad(MI, BB);
25611 case X86::SEG_ALLOCA_32:
25612 case X86::SEG_ALLOCA_64:
25613 return EmitLoweredSegAlloca(MI, BB);
25614 case X86::TLSCall_32:
25615 case X86::TLSCall_64:
25616 return EmitLoweredTLSCall(MI, BB);
25617 case X86::CMOV_FR32:
25618 case X86::CMOV_FR64:
25619 case X86::CMOV_FR128:
25620 case X86::CMOV_GR8:
25621 case X86::CMOV_GR16:
25622 case X86::CMOV_GR32:
25623 case X86::CMOV_RFP32:
25624 case X86::CMOV_RFP64:
25625 case X86::CMOV_RFP80:
25626 case X86::CMOV_V2F64:
25627 case X86::CMOV_V2I64:
25628 case X86::CMOV_V4F32:
25629 case X86::CMOV_V4F64:
25630 case X86::CMOV_V4I64:
25631 case X86::CMOV_V16F32:
25632 case X86::CMOV_V8F32:
25633 case X86::CMOV_V8F64:
25634 case X86::CMOV_V8I64:
25635 case X86::CMOV_V8I1:
25636 case X86::CMOV_V16I1:
25637 case X86::CMOV_V32I1:
25638 case X86::CMOV_V64I1:
25639 return EmitLoweredSelect(MI, BB);
25641 case X86::RDFLAGS32:
25642 case X86::RDFLAGS64: {
    unsigned PushF =
        MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
25645 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
25646 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
25647 // Permit reads of the FLAGS register without it being defined.
25648 // This intrinsic exists to read external processor state in flags, such as
25649 // the trap flag, interrupt flag, and direction flag, none of which are
25650 // modeled by the backend.
25651 Push->getOperand(2).setIsUndef();
25652 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
25654 MI.eraseFromParent(); // The pseudo is gone now.
25658 case X86::WRFLAGS32:
25659 case X86::WRFLAGS64: {
    unsigned Push =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
    unsigned PopF =
        MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
25664 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
25665 BuildMI(*BB, MI, DL, TII->get(PopF));
25667 MI.eraseFromParent(); // The pseudo is gone now.
25671 case X86::RELEASE_FADD32mr:
25672 case X86::RELEASE_FADD64mr:
25673 return EmitLoweredAtomicFP(MI, BB);
25675 case X86::FP32_TO_INT16_IN_MEM:
25676 case X86::FP32_TO_INT32_IN_MEM:
25677 case X86::FP32_TO_INT64_IN_MEM:
25678 case X86::FP64_TO_INT16_IN_MEM:
25679 case X86::FP64_TO_INT32_IN_MEM:
25680 case X86::FP64_TO_INT64_IN_MEM:
25681 case X86::FP80_TO_INT16_IN_MEM:
25682 case X86::FP80_TO_INT32_IN_MEM:
25683 case X86::FP80_TO_INT64_IN_MEM: {
25684 // Change the floating point control register to use "round towards zero"
25685 // mode when truncating to an integer value.
25686 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
25687 addFrameReference(BuildMI(*BB, MI, DL,
25688 TII->get(X86::FNSTCW16m)), CWFrameIdx);
    // Load the old value of the high byte of the control word...
    unsigned OldCW =
        MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
                      CWFrameIdx);

    // Set the high part to be round to zero...
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
        .addImm(0xC7F);

    // Reload the modified control word now...
    addFrameReference(BuildMI(*BB, MI, DL,
                              TII->get(X86::FLDCW16m)), CWFrameIdx);

    // Restore the memory image of control word to original value
    addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
        .addReg(OldCW);

    // Get the X86 opcode to use.
    unsigned Opc;
    switch (MI.getOpcode()) {
25711 default: llvm_unreachable("illegal opcode!");
25712 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
25713 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
25714 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
25715 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
25716 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
25717 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
25718 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
25719 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
25720 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
25723 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25724 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
25725 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
25727 // Reload the original control word now.
25728 addFrameReference(BuildMI(*BB, MI, DL,
25729 TII->get(X86::FLDCW16m)), CWFrameIdx);
25731 MI.eraseFromParent(); // The pseudo instruction is gone now.
25734 // String/text processing lowering.
25735 case X86::PCMPISTRM128REG:
25736 case X86::VPCMPISTRM128REG:
25737 case X86::PCMPISTRM128MEM:
25738 case X86::VPCMPISTRM128MEM:
25739 case X86::PCMPESTRM128REG:
25740 case X86::VPCMPESTRM128REG:
25741 case X86::PCMPESTRM128MEM:
25742 case X86::VPCMPESTRM128MEM:
25743 assert(Subtarget.hasSSE42() &&
25744 "Target must have SSE4.2 or AVX features enabled");
25745 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
25747 // String/text processing lowering.
25748 case X86::PCMPISTRIREG:
25749 case X86::VPCMPISTRIREG:
25750 case X86::PCMPISTRIMEM:
25751 case X86::VPCMPISTRIMEM:
25752 case X86::PCMPESTRIREG:
25753 case X86::VPCMPESTRIREG:
25754 case X86::PCMPESTRIMEM:
25755 case X86::VPCMPESTRIMEM:
25756 assert(Subtarget.hasSSE42() &&
25757 "Target must have SSE4.2 or AVX features enabled");
25758 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
25760 // Thread synchronization.
  case X86::MONITOR:
    return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
25763 case X86::MONITORX:
25764 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
  case X86::WRPKRU:
    return emitWRPKRU(MI, BB, Subtarget);
  case X86::RDPKRU:
    return emitRDPKRU(MI, BB, Subtarget);
  case X86::XBEGIN:
    return emitXBegin(MI, BB, Subtarget.getInstrInfo());
25774 case X86::VASTART_SAVE_XMM_REGS:
25775 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
25777 case X86::VAARG_64:
25778 return EmitVAARG64WithCustomInserter(MI, BB);
25780 case X86::EH_SjLj_SetJmp32:
25781 case X86::EH_SjLj_SetJmp64:
25782 return emitEHSjLjSetJmp(MI, BB);
25784 case X86::EH_SjLj_LongJmp32:
25785 case X86::EH_SjLj_LongJmp64:
25786 return emitEHSjLjLongJmp(MI, BB);
25788 case X86::Int_eh_sjlj_setup_dispatch:
25789 return EmitSjLjDispatchBlock(MI, BB);
25791 case TargetOpcode::STATEPOINT:
25792 // As an implementation detail, STATEPOINT shares the STACKMAP format at
25793 // this point in the process. We diverge later.
25794 return emitPatchPoint(MI, BB);
25796 case TargetOpcode::STACKMAP:
25797 case TargetOpcode::PATCHPOINT:
25798 return emitPatchPoint(MI, BB);
25800 case X86::LCMPXCHG8B: {
25801 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25802 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
    // requires a memory operand. If it happens that the current architecture is
    // i686 and the current function needs a base pointer
    // - which is ESI for i686 - the register allocator would not be able to
    // allocate registers for an address of the form X(%reg, %reg, Y):
    // there would never be enough unreserved registers during regalloc
    // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
    // We give the register allocator a hand by precomputing the address in
    // a new vreg using LEA.
25812 // If it is not i686 or there is no base pointer - nothing to do here.
25813 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
    // Even though this code does not necessarily need the base pointer to
    // be ESI, we check for that. The reason: if this assert fails, some
    // changes have happened in the compiler's base pointer handling, which
    // most probably have to be addressed somehow here.
25820 assert(TRI->getBaseRegister() == X86::ESI &&
25821 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
25822 "base pointer in mind");
25824 MachineRegisterInfo &MRI = MF->getRegInfo();
25825 MVT SPTy = getPointerTy(MF->getDataLayout());
25826 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25827 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
25829 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25830 // Regalloc does not need any help when the memory operand of CMPXCHG8B
25831 // does not use index register.
25832 if (AM.IndexReg == X86::NoRegister)
25835 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
25836 // four operand definitions that are E[ABCD] registers. We skip them and
25837 // then insert the LEA.
25838 MachineBasicBlock::iterator MBBI(MI);
25839 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
25840 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
25843 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
25845 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
25849 case X86::LCMPXCHG16B:
25851 case X86::LCMPXCHG8B_SAVE_EBX:
25852 case X86::LCMPXCHG16B_SAVE_RBX: {
25854 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
25855 if (!BB->isLiveIn(BasePtr))
25856 BB->addLiveIn(BasePtr);
25862 //===----------------------------------------------------------------------===//
25863 // X86 Optimization Hooks
25864 //===----------------------------------------------------------------------===//
25866 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
25869 const SelectionDAG &DAG,
25870 unsigned Depth) const {
25871 unsigned BitWidth = KnownZero.getBitWidth();
25872 unsigned Opc = Op.getOpcode();
25873 assert((Opc >= ISD::BUILTIN_OP_END ||
25874 Opc == ISD::INTRINSIC_WO_CHAIN ||
25875 Opc == ISD::INTRINSIC_W_CHAIN ||
25876 Opc == ISD::INTRINSIC_VOID) &&
25877 "Should use MaskedValueIsZero if you don't know whether Op"
25878 " is a target node!");
25880 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
25894 // These nodes' second result is a boolean.
25895 if (Op.getResNo() == 0)
25898 case X86ISD::SETCC:
25899 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
25901 case X86ISD::MOVMSK: {
25902 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
25903 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
25906 case X86ISD::VZEXT: {
25907 SDValue N0 = Op.getOperand(0);
25908 unsigned NumElts = Op.getValueType().getVectorNumElements();
25909 unsigned InNumElts = N0.getValueType().getVectorNumElements();
25910 unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
25912 KnownZero = KnownOne = APInt(InBitWidth, 0);
25913 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25914 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
25915 KnownOne = KnownOne.zext(BitWidth);
25916 KnownZero = KnownZero.zext(BitWidth);
25917 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
25923 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
25924 SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
25925 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
25926 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
25927 return Op.getScalarValueSizeInBits();
25929 if (Op.getOpcode() == X86ISD::VSEXT) {
25930 EVT VT = Op.getValueType();
25931 EVT SrcVT = Op.getOperand(0).getValueType();
25932 unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
25933 Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
25941 /// Returns true (and the GlobalValue and the offset) if the node is a
25942 /// GlobalAddress + offset.
25943 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
25944 const GlobalValue* &GA,
25945 int64_t &Offset) const {
25946 if (N->getOpcode() == X86ISD::Wrapper) {
25947 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
25948 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
25949 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
25953 return TargetLowering::isGAPlusOffset(N, GA, Offset);
25956 // Attempt to match a combined shuffle mask against supported unary shuffle instructions.
25958 // TODO: Investigate sharing more of this with shuffle lowering.
25959 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
25961 const X86Subtarget &Subtarget,
25962 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
25963 unsigned NumMaskElts = Mask.size();
25964 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
25966 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
25967 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
25968 isUndefOrEqual(Mask[0], 0) &&
25969 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
25970 Shuffle = X86ISD::VZEXT_MOVL;
25971 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
25975 // Match against a VZEXT instruction.
25976 // TODO: Add 256/512-bit vector support.
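// For example (illustrative), an integer v8i16 mask <0,Z,1,Z,2,Z,3,Z>
// (Z = zero) matches a zero-extension of the low four i16 elements to i32
// (Scale == 2).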
25977 if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
25978 unsigned MaxScale = 64 / MaskEltSize;
25979 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
25981 unsigned NumDstElts = NumMaskElts / Scale;
25982 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
25983 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
25984 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
25988 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
25989 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
25990 Shuffle = X86ISD::VZEXT;
25996 // Check if we have SSE3 which will let us use MOVDDUP etc. The
25997 // instructions are no slower than UNPCKLPD but have the option to
25998 // fold the input operand into even an unaligned memory load.
25999 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
26000 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26001 Shuffle = X86ISD::MOVDDUP;
26002 SrcVT = DstVT = MVT::v2f64;
26005 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26006 Shuffle = X86ISD::MOVSLDUP;
26007 SrcVT = DstVT = MVT::v4f32;
26010 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26011 Shuffle = X86ISD::MOVSHDUP;
26012 SrcVT = DstVT = MVT::v4f32;
26017 if (MaskVT.is256BitVector() && FloatDomain) {
26018 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26019 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26020 Shuffle = X86ISD::MOVDDUP;
26021 SrcVT = DstVT = MVT::v4f64;
26024 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26025 Shuffle = X86ISD::MOVSLDUP;
26026 SrcVT = DstVT = MVT::v8f32;
26029 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26030 Shuffle = X86ISD::MOVSHDUP;
26031 SrcVT = DstVT = MVT::v8f32;
26036 if (MaskVT.is512BitVector() && FloatDomain) {
26037 assert(Subtarget.hasAVX512() &&
26038 "AVX512 required for 512-bit vector shuffles");
26039 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26040 Shuffle = X86ISD::MOVDDUP;
26041 SrcVT = DstVT = MVT::v8f64;
26044 if (isTargetShuffleEquivalent(
26045 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26046 Shuffle = X86ISD::MOVSLDUP;
26047 SrcVT = DstVT = MVT::v16f32;
26050 if (isTargetShuffleEquivalent(
26051 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26052 Shuffle = X86ISD::MOVSHDUP;
26053 SrcVT = DstVT = MVT::v16f32;
26058 // Attempt to match against broadcast-from-vector.
26059 if (Subtarget.hasAVX2()) {
26060 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26061 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26062 SrcVT = DstVT = MaskVT;
26063 Shuffle = X86ISD::VBROADCAST;
26071 // Attempt to match a combined shuffle mask against supported unary immediate
26072 // permute instructions.
26073 // TODO: Investigate sharing more of this with shuffle lowering.
26074 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26076 const X86Subtarget &Subtarget,
26077 unsigned &Shuffle, MVT &ShuffleVT,
26078 unsigned &PermuteImm) {
26079 unsigned NumMaskElts = Mask.size();
26081 bool ContainsZeros = false;
26082 SmallBitVector Zeroable(NumMaskElts, false);
26083 for (unsigned i = 0; i != NumMaskElts; ++i) {
26085 Zeroable[i] = isUndefOrZero(M);
26086 ContainsZeros |= (M == SM_SentinelZero);
26089 // Attempt to match against byte/bit shifts.
26090 // FIXME: Add 512-bit support.
26091 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26092 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26093 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26094 MaskVT.getScalarSizeInBits(), Mask,
26095 0, Zeroable, Subtarget);
26096 if (0 < ShiftAmt) {
26097 PermuteImm = (unsigned)ShiftAmt;
26102 // Ensure the mask doesn't contain any zero elements.
26106 assert(llvm::all_of(Mask, [&](int M) {
26107 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26108 }) && "Expected unary shuffle");
26110 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26111 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26112 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26114 // Handle PSHUFLW/PSHUFHW repeated patterns.
26115 if (MaskScalarSizeInBits == 16) {
26116 SmallVector<int, 4> RepeatedMask;
26117 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26118 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26119 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26121 // PSHUFLW: permute lower 4 elements only.
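// For example (illustrative), a low-half mask <2,1,0,3> encodes as imm8 0xC6
// (two bits per element, element 0 in bits [1:0]).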
26122 if (isUndefOrInRange(LoMask, 0, 4) &&
26123 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26124 Shuffle = X86ISD::PSHUFLW;
26125 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26126 PermuteImm = getV4X86ShuffleImm(LoMask);
26130 // PSHUFHW: permute upper 4 elements only.
26131 if (isUndefOrInRange(HiMask, 4, 8) &&
26132 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26133 // Offset the HiMask so that we can create the shuffle immediate.
26134 int OffsetHiMask[4];
26135 for (int i = 0; i != 4; ++i)
26136 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26138 Shuffle = X86ISD::PSHUFHW;
26139 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26140 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26149 // We only support permutation of 32/64 bit elements after this.
26150 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26153 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
26154 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
26155 if (FloatDomain && !Subtarget.hasAVX())
26158 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
26159 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
26160 FloatDomain = true;
26162 // Check for lane crossing permutes.
26163 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
26164 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
26165 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
26166 Shuffle = X86ISD::VPERMI;
26167 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26168 PermuteImm = getV4X86ShuffleImm(Mask);
26171 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
26172 SmallVector<int, 4> RepeatedMask;
26173 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
26174 Shuffle = X86ISD::VPERMI;
26175 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
26176 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
26183 // VPERMILPD can permute with a non-repeating shuffle.
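// The VPERMILPD immediate uses one bit per f64 element, selecting element 0
// or 1 within that element's own 128-bit lane.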
26184 if (FloatDomain && MaskScalarSizeInBits == 64) {
26185 Shuffle = X86ISD::VPERMILPI;
26186 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
26188 for (int i = 0, e = Mask.size(); i != e; ++i) {
26190 if (M == SM_SentinelUndef)
26192 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
26193 PermuteImm |= (M & 1) << i;
26198 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
26199 SmallVector<int, 4> RepeatedMask;
26200 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
26203 // Narrow the repeated mask for 32-bit element permutes.
26204 SmallVector<int, 4> WordMask = RepeatedMask;
26205 if (MaskScalarSizeInBits == 64)
26206 scaleShuffleMask(2, RepeatedMask, WordMask);
26208 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
26209 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
26210 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
26211 PermuteImm = getV4X86ShuffleImm(WordMask);
26215 // Attempt to match a combined unary shuffle mask against supported binary
26216 // shuffle instructions.
26217 // TODO: Investigate sharing more of this with shuffle lowering.
26218 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26219 bool FloatDomain, SDValue &V1, SDValue &V2,
26220 const X86Subtarget &Subtarget,
26221 unsigned &Shuffle, MVT &ShuffleVT,
26223 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
26225 if (MaskVT.is128BitVector()) {
26226 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
26228 Shuffle = X86ISD::MOVLHPS;
26229 ShuffleVT = MVT::v4f32;
26232 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
26234 Shuffle = X86ISD::MOVHLPS;
26235 ShuffleVT = MVT::v4f32;
26238 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
26239 (FloatDomain || !Subtarget.hasSSE41())) {
26241 Shuffle = X86ISD::MOVSD;
26242 ShuffleVT = MaskVT;
26245 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
26246 (FloatDomain || !Subtarget.hasSSE41())) {
26247 Shuffle = X86ISD::MOVSS;
26248 ShuffleVT = MaskVT;
26253 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
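// For v4i32 (illustrative), the binary UNPCKL mask is <0,4,1,5> and UNPCKH is
// <2,6,3,7>; the unary forms interleave a single input with itself (<0,0,1,1>).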
26254 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26255 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26256 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
26257 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
26258 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
26259 MVT LegalVT = MaskVT;
26260 if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
26261 LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
26263 SmallVector<int, 64> Unpckl, Unpckh;
26265 createUnpackShuffleMask(MaskVT, Unpckl, true, true);
26266 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26268 Shuffle = X86ISD::UNPCKL;
26269 ShuffleVT = LegalVT;
26273 createUnpackShuffleMask(MaskVT, Unpckh, false, true);
26274 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26276 Shuffle = X86ISD::UNPCKH;
26277 ShuffleVT = LegalVT;
26281 createUnpackShuffleMask(MaskVT, Unpckl, true, false);
26282 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26283 Shuffle = X86ISD::UNPCKL;
26284 ShuffleVT = LegalVT;
26288 createUnpackShuffleMask(MaskVT, Unpckh, false, false);
26289 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26290 Shuffle = X86ISD::UNPCKH;
26291 ShuffleVT = LegalVT;
26295 ShuffleVectorSDNode::commuteMask(Unpckl);
26296 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26298 Shuffle = X86ISD::UNPCKL;
26299 ShuffleVT = LegalVT;
26303 ShuffleVectorSDNode::commuteMask(Unpckh);
26304 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26306 Shuffle = X86ISD::UNPCKH;
26307 ShuffleVT = LegalVT;
26316 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26318 SDValue &V1, SDValue &V2,
26319 SDLoc &DL, SelectionDAG &DAG,
26320 const X86Subtarget &Subtarget,
26321 unsigned &Shuffle, MVT &ShuffleVT,
26322 unsigned &PermuteImm) {
26323 unsigned NumMaskElts = Mask.size();
26325 // Attempt to match against PALIGNR byte rotate.
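// PALIGNR concatenates the two sources and extracts a byte-aligned window
// within each 128-bit lane; the byte rotation amount becomes the immediate.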
26326 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26327 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26328 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
26329 if (0 < ByteRotation) {
26330 Shuffle = X86ISD::PALIGNR;
26331 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
26332 PermuteImm = ByteRotation;
26337 // Attempt to combine to X86ISD::BLENDI.
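// The BLENDI immediate holds one bit per result element; a set bit selects
// the element from the second source.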
26338 if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
26339 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
26340 // Determine a type compatible with X86ISD::BLENDI.
26341 // TODO - add 16i16 support (requires lane duplication).
26342 MVT BlendVT = MaskVT;
26343 if (Subtarget.hasAVX2()) {
26344 if (BlendVT == MVT::v4i64)
26345 BlendVT = MVT::v8i32;
26346 else if (BlendVT == MVT::v2i64)
26347 BlendVT = MVT::v4i32;
26349 if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
26350 BlendVT = MVT::v8i16;
26351 else if (BlendVT == MVT::v4i64)
26352 BlendVT = MVT::v4f64;
26353 else if (BlendVT == MVT::v8i32)
26354 BlendVT = MVT::v8f32;
26357 unsigned BlendSize = BlendVT.getVectorNumElements();
26358 unsigned MaskRatio = BlendSize / NumMaskElts;
26360 // Can we blend with zero?
26361 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
26363 NumMaskElts <= BlendVT.getVectorNumElements()) {
26365 for (unsigned i = 0; i != BlendSize; ++i)
26366 if (Mask[i / MaskRatio] < 0)
26367 PermuteImm |= 1u << i;
26369 V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
26370 Shuffle = X86ISD::BLENDI;
26371 ShuffleVT = BlendVT;
26375 // Attempt to match as a binary blend.
26376 if (NumMaskElts <= BlendVT.getVectorNumElements()) {
26377 bool MatchBlend = true;
26378 for (int i = 0; i != (int)NumMaskElts; ++i) {
26380 if (M == SM_SentinelUndef)
26382 else if (M == SM_SentinelZero)
26383 MatchBlend = false;
26384 else if ((M != i) && (M != (i + (int)NumMaskElts)))
26385 MatchBlend = false;
26390 for (unsigned i = 0; i != BlendSize; ++i)
26391 if ((int)NumMaskElts <= Mask[i / MaskRatio])
26392 PermuteImm |= 1u << i;
26394 Shuffle = X86ISD::BLENDI;
26395 ShuffleVT = BlendVT;
26401 // Attempt to combine to INSERTPS.
26402 if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
26403 SmallBitVector Zeroable(4, false);
26404 for (unsigned i = 0; i != NumMaskElts; ++i)
26406 Zeroable[i] = true;
26408 if (Zeroable.any() &&
26409 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
26410 Shuffle = X86ISD::INSERTPS;
26411 ShuffleVT = MVT::v4f32;
26416 // Attempt to combine to SHUFPD.
26417 if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
26418 (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
26419 (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
26420 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
26421 Shuffle = X86ISD::SHUFP;
26422 ShuffleVT = MaskVT;
26427 // Attempt to combine to SHUFPS.
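// SHUFPS builds the low two result elements from the first source and the
// high two from the second, each chosen by a 2-bit field of the immediate.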
26428 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26429 (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26430 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
26431 SmallVector<int, 4> RepeatedMask;
26432 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
26433 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
26434 int M0 = RepeatedMask[Offset];
26435 int M1 = RepeatedMask[Offset + 1];
26437 if (isUndefInRange(RepeatedMask, Offset, 2)) {
26438 return DAG.getUNDEF(MaskVT);
26439 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
26440 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
26441 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
26442 return getZeroVector(MaskVT, Subtarget, DAG, DL);
26443 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
26444 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26445 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26447 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
26448 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26449 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26456 int ShufMask[4] = {-1, -1, -1, -1};
26457 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
26458 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
26463 Shuffle = X86ISD::SHUFP;
26464 ShuffleVT = MaskVT;
26465 PermuteImm = getV4X86ShuffleImm(ShufMask);
26474 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
26477 /// This is the leaf of the recursive combine below. When we have found some
26478 /// chain of single-use x86 shuffle instructions and accumulated the combined
26479 /// shuffle mask represented by them, this will try to pattern match that mask
26480 /// into either a single instruction if there is a special purpose instruction
26481 /// for this operation, or into a PSHUFB instruction which is a fully general
26482 /// instruction but should only be used to replace chains over a certain depth.
26483 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
26484 ArrayRef<int> BaseMask, int Depth,
26485 bool HasVariableMask, SelectionDAG &DAG,
26486 TargetLowering::DAGCombinerInfo &DCI,
26487 const X86Subtarget &Subtarget) {
26488 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
26489 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
26490 "Unexpected number of shuffle inputs!");
26492 // Find the inputs that enter the chain. Note that multiple uses are OK
26493 // here, we're not going to remove the operands we find.
26494 bool UnaryShuffle = (Inputs.size() == 1);
26495 SDValue V1 = peekThroughBitcasts(Inputs[0]);
26496 SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
26498 MVT VT1 = V1.getSimpleValueType();
26499 MVT VT2 = V2.getSimpleValueType();
26500 MVT RootVT = Root.getSimpleValueType();
26501 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
26502 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
26503 "Vector size mismatch");
26508 unsigned NumBaseMaskElts = BaseMask.size();
26509 if (NumBaseMaskElts == 1) {
26510 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
26511 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26516 unsigned RootSizeInBits = RootVT.getSizeInBits();
26517 unsigned NumRootElts = RootVT.getVectorNumElements();
26518 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
26519 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
26520 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
26522 // Don't combine if we are an AVX512/EVEX target and the mask element size
26523 // is different from the root element size - this would prevent writemasks
26524 // from being reused.
26525 // TODO - this currently prevents all lane shuffles from occurring.
26526 // TODO - check for writemasks usage instead of always preventing combining.
26527 // TODO - attempt to narrow Mask back to writemask size.
26528 bool IsEVEXShuffle =
26529 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
26530 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
26533 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
26535 // Handle 128-bit lane shuffles of 256-bit vectors.
26536 // TODO - this should support binary shuffles.
26537 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
26538 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
26539 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
26540 return false; // Nothing to do!
26541 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26542 unsigned PermMask = 0;
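// VPERM2X128 immediate: bits [1:0] select the source 128-bit lane for the low
// half of the result and bits [5:4] for the high half; bit 3 / bit 7 (the 0x8
// below) zero the corresponding half instead.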
26543 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
26544 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
26546 Res = DAG.getBitcast(ShuffleVT, V1);
26547 DCI.AddToWorklist(Res.getNode());
26548 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
26549 DAG.getUNDEF(ShuffleVT),
26550 DAG.getConstant(PermMask, DL, MVT::i8));
26551 DCI.AddToWorklist(Res.getNode());
26552 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26557 // For masks that have been widened to 128-bit elements or more,
26558 // narrow back down to 64-bit elements.
26559 SmallVector<int, 64> Mask;
26560 if (BaseMaskEltSizeInBits > 64) {
26561 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
26562 int MaskScale = BaseMaskEltSizeInBits / 64;
26563 scaleShuffleMask(MaskScale, BaseMask, Mask);
26565 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
26568 unsigned NumMaskElts = Mask.size();
26569 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
26571 // Determine the effective mask value type.
26572 FloatDomain &= (32 <= MaskEltSizeInBits);
26573 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
26574 : MVT::getIntegerVT(MaskEltSizeInBits);
26575 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
26577 // Only allow legal mask types.
26578 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
26581 // Attempt to match the mask against known shuffle patterns.
26582 MVT ShuffleSrcVT, ShuffleVT;
26583 unsigned Shuffle, PermuteImm;
26585 if (UnaryShuffle) {
26586 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
26587 // directly if we don't shuffle the lower element and we shuffle the upper
26588 // (zero) elements within themselves.
26589 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
26590 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
26591 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
26592 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
26593 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
26594 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
26595 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26601 if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
26602 ShuffleSrcVT, ShuffleVT)) {
26603 if (Depth == 1 && Root.getOpcode() == Shuffle)
26604 return false; // Nothing to do!
26605 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26606 return false; // AVX512 Writemask clash.
26607 Res = DAG.getBitcast(ShuffleSrcVT, V1);
26608 DCI.AddToWorklist(Res.getNode());
26609 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
26610 DCI.AddToWorklist(Res.getNode());
26611 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26616 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
26617 Shuffle, ShuffleVT, PermuteImm)) {
26618 if (Depth == 1 && Root.getOpcode() == Shuffle)
26619 return false; // Nothing to do!
26620 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26621 return false; // AVX512 Writemask clash.
26622 Res = DAG.getBitcast(ShuffleVT, V1);
26623 DCI.AddToWorklist(Res.getNode());
26624 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
26625 DAG.getConstant(PermuteImm, DL, MVT::i8));
26626 DCI.AddToWorklist(Res.getNode());
26627 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26633 if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
26634 Shuffle, ShuffleVT, UnaryShuffle)) {
26635 if (Depth == 1 && Root.getOpcode() == Shuffle)
26636 return false; // Nothing to do!
26637 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26638 return false; // AVX512 Writemask clash.
26639 V1 = DAG.getBitcast(ShuffleVT, V1);
26640 DCI.AddToWorklist(V1.getNode());
26641 V2 = DAG.getBitcast(ShuffleVT, V2);
26642 DCI.AddToWorklist(V2.getNode());
26643 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
26644 DCI.AddToWorklist(Res.getNode());
26645 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26650 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
26651 DAG, Subtarget, Shuffle, ShuffleVT,
26653 if (Depth == 1 && Root.getOpcode() == Shuffle)
26654 return false; // Nothing to do!
26655 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26656 return false; // AVX512 Writemask clash.
26657 V1 = DAG.getBitcast(ShuffleVT, V1);
26658 DCI.AddToWorklist(V1.getNode());
26659 V2 = DAG.getBitcast(ShuffleVT, V2);
26660 DCI.AddToWorklist(V2.getNode());
26661 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
26662 DAG.getConstant(PermuteImm, DL, MVT::i8));
26663 DCI.AddToWorklist(Res.getNode());
26664 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26669 // Don't try to re-form single instruction chains under any circumstances now
26670 // that we've done encoding canonicalization for them.
26674 bool MaskContainsZeros =
26675 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
26677 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
26678 // If we have a single input lane-crossing shuffle then lower to VPERMV.
26679 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26680 ((Subtarget.hasAVX2() &&
26681 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26682 (Subtarget.hasAVX512() &&
26683 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26684 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26685 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26686 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26687 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26688 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26689 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26690 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26691 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26692 DCI.AddToWorklist(VPermMask.getNode());
26693 Res = DAG.getBitcast(MaskVT, V1);
26694 DCI.AddToWorklist(Res.getNode());
26695 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
26696 DCI.AddToWorklist(Res.getNode());
26697 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26702 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
26703 // vector as the second source.
26704 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26705 ((Subtarget.hasAVX512() &&
26706 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26707 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26708 (Subtarget.hasVLX() &&
26709 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26710 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26711 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26712 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26713 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26714 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26715 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
26716 for (unsigned i = 0; i != NumMaskElts; ++i)
26717 if (Mask[i] == SM_SentinelZero)
26718 Mask[i] = NumMaskElts + i;
26720 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26721 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26722 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26723 DCI.AddToWorklist(VPermMask.getNode());
26724 Res = DAG.getBitcast(MaskVT, V1);
26725 DCI.AddToWorklist(Res.getNode());
26726 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
26727 DCI.AddToWorklist(Zero.getNode());
26728 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
26729 DCI.AddToWorklist(Res.getNode());
26730 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26735 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
26736 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26737 ((Subtarget.hasAVX512() &&
26738 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26739 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26740 (Subtarget.hasVLX() &&
26741 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26742 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26743 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26744 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26745 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26746 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26747 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26748 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26749 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26750 DCI.AddToWorklist(VPermMask.getNode());
26751 V1 = DAG.getBitcast(MaskVT, V1);
26752 DCI.AddToWorklist(V1.getNode());
26753 V2 = DAG.getBitcast(MaskVT, V2);
26754 DCI.AddToWorklist(V2.getNode());
26755 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
26756 DCI.AddToWorklist(Res.getNode());
26757 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26764 // See if we can combine a single input shuffle with zeros to a bit-mask,
26765 // which is much simpler than any shuffle.
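// For example (illustrative), a v4i32 mask <0,Z,2,Z> becomes an AND with the
// constant <-1,0,-1,0>.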
26766 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
26767 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
26768 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
26769 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
26770 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
26771 SmallBitVector UndefElts(NumMaskElts, false);
26772 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
26773 for (unsigned i = 0; i != NumMaskElts; ++i) {
26775 if (M == SM_SentinelUndef) {
26776 UndefElts[i] = true;
26779 if (M == SM_SentinelZero)
26781 EltBits[i] = AllOnes;
26783 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
26784 DCI.AddToWorklist(BitMask.getNode());
26785 Res = DAG.getBitcast(MaskVT, V1);
26786 DCI.AddToWorklist(Res.getNode());
26787 unsigned AndOpcode =
26788 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
26789 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
26790 DCI.AddToWorklist(Res.getNode());
26791 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26796 // If we have a single input shuffle with different shuffle patterns in the
26797 // 128-bit lanes, use the variable-mask form of VPERMILPS.
26798 // TODO: Combine other mask types at higher depths.
26799 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
26800 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26801 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
26802 SmallVector<SDValue, 16> VPermIdx;
26803 for (int M : Mask) {
26805 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
26806 VPermIdx.push_back(Idx);
26808 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
26809 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
26810 DCI.AddToWorklist(VPermMask.getNode());
26811 Res = DAG.getBitcast(MaskVT, V1);
26812 DCI.AddToWorklist(Res.getNode());
26813 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
26814 DCI.AddToWorklist(Res.getNode());
26815 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26820 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
26821 // to VPERMIL2PD/VPERMIL2PS.
26822 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
26823 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
26824 MaskVT == MVT::v8f32)) {
26825 // VPERMIL2 Operation.
26826 // Bits[3] - Match Bit.
26827 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
26828 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
26829 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
26830 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
26831 SmallVector<int, 8> VPerm2Idx;
26832 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
26833 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
26834 unsigned M2ZImm = 0;
26835 for (int M : Mask) {
26836 if (M == SM_SentinelUndef) {
26837 VPerm2Idx.push_back(-1);
26840 if (M == SM_SentinelZero) {
26842 VPerm2Idx.push_back(8);
26845 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
26846 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
26847 VPerm2Idx.push_back(Index);
26849 V1 = DAG.getBitcast(MaskVT, V1);
26850 DCI.AddToWorklist(V1.getNode());
26851 V2 = DAG.getBitcast(MaskVT, V2);
26852 DCI.AddToWorklist(V2.getNode());
26853 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
26854 DCI.AddToWorklist(VPerm2MaskOp.getNode());
26855 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
26856 DAG.getConstant(M2ZImm, DL, MVT::i8));
26857 DCI.AddToWorklist(Res.getNode());
26858 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26863 // If we have 3 or more shuffle instructions or a chain involving a variable
26864 // mask, we can replace them with a single PSHUFB instruction profitably.
26865 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
26866 // instructions, but in practice PSHUFB tends to be *very* fast so we're
26867 // more aggressive.
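// Each PSHUFB mask byte selects a source byte within its own 128-bit lane; a
// mask byte with the top bit set (255 below) zeroes the destination byte.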
26868 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26869 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26870 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
26871 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
26872 SmallVector<SDValue, 16> PSHUFBMask;
26873 int NumBytes = RootVT.getSizeInBits() / 8;
26874 int Ratio = NumBytes / NumMaskElts;
26875 for (int i = 0; i < NumBytes; ++i) {
26876 int M = Mask[i / Ratio];
26877 if (M == SM_SentinelUndef) {
26878 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
26881 if (M == SM_SentinelZero) {
26882 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
26885 M = Ratio * M + i % Ratio;
26886 assert ((M / 16) == (i / 16) && "Lane crossing detected");
26887 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
26889 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
26890 Res = DAG.getBitcast(ByteVT, V1);
26891 DCI.AddToWorklist(Res.getNode());
26892 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
26893 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
26894 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
26895 DCI.AddToWorklist(Res.getNode());
26896 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26901 // With XOP, if we have a 128-bit binary input shuffle we can always combine
26902 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
26903 // slower than PSHUFB on targets that support both.
26904 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
26905 Subtarget.hasXOP()) {
26906 // VPPERM Mask Operation
26907 // Bits[4:0] - Byte Index (0 - 31)
26908 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
26909 SmallVector<SDValue, 16> VPPERMMask;
26911 int Ratio = NumBytes / NumMaskElts;
26912 for (int i = 0; i < NumBytes; ++i) {
26913 int M = Mask[i / Ratio];
26914 if (M == SM_SentinelUndef) {
26915 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
26918 if (M == SM_SentinelZero) {
26919 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
26922 M = Ratio * M + i % Ratio;
26923 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
26925 MVT ByteVT = MVT::v16i8;
26926 V1 = DAG.getBitcast(ByteVT, V1);
26927 DCI.AddToWorklist(V1.getNode());
26928 V2 = DAG.getBitcast(ByteVT, V2);
26929 DCI.AddToWorklist(V2.getNode());
26930 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
26931 DCI.AddToWorklist(VPPERMMaskOp.getNode());
26932 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
26933 DCI.AddToWorklist(Res.getNode());
26934 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26939 // Failed to find any combines.
26943 // Attempt to constant fold all of the constant source ops.
26944 // Returns true if the entire shuffle is folded to a constant.
26945 // TODO: Extend this to merge multiple constant Ops and update the mask.
26946 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
26947 ArrayRef<int> Mask, SDValue Root,
26948 bool HasVariableMask, SelectionDAG &DAG,
26949 TargetLowering::DAGCombinerInfo &DCI,
26950 const X86Subtarget &Subtarget) {
26951 MVT VT = Root.getSimpleValueType();
26953 unsigned SizeInBits = VT.getSizeInBits();
26954 unsigned NumMaskElts = Mask.size();
26955 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
26956 unsigned NumOps = Ops.size();
26958 // Extract constant bits from each source op.
26959 bool OneUseConstantOp = false;
26960 SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
26961 SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
26962 for (unsigned i = 0; i != NumOps; ++i) {
26963 SDValue SrcOp = Ops[i];
26964 OneUseConstantOp |= SrcOp.hasOneUse();
26965 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
26970 // Only fold if at least one of the constants is only used once or
26971 // the combined shuffle has included a variable mask shuffle; this
26972 // is to avoid constant pool bloat.
26973 if (!OneUseConstantOp && !HasVariableMask)
26976 // Shuffle the constant bits according to the mask.
26977 SmallBitVector UndefElts(NumMaskElts, false);
26978 SmallBitVector ZeroElts(NumMaskElts, false);
26979 SmallBitVector ConstantElts(NumMaskElts, false);
26980 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
26981 APInt::getNullValue(MaskSizeInBits));
26982 for (unsigned i = 0; i != NumMaskElts; ++i) {
26984 if (M == SM_SentinelUndef) {
26985 UndefElts[i] = true;
26987 } else if (M == SM_SentinelZero) {
26988 ZeroElts[i] = true;
26991 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
26993 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
26994 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
26996 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
26997 if (SrcUndefElts[SrcMaskIdx]) {
26998 UndefElts[i] = true;
27002 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27003 APInt &Bits = SrcEltBits[SrcMaskIdx];
27005 ZeroElts[i] = true;
27009 ConstantElts[i] = true;
27010 ConstantBitData[i] = Bits;
27012 assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
27014 // Create the constant data.
27016 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27017 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27019 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27021 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27024 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27025 DCI.AddToWorklist(CstOp.getNode());
27026 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27030 /// \brief Fully generic combining of x86 shuffle instructions.
27032 /// This should be the last combine run over the x86 shuffle instructions. Once
27033 /// they have been fully optimized, this will recursively consider all chains
27034 /// of single-use shuffle instructions, build a generic model of the cumulative
27035 /// shuffle operation, and check for simpler instructions which implement this
27036 /// operation. We use this primarily for two purposes:
27038 /// 1) Collapse generic shuffles to specialized single instructions when
27039 /// equivalent. In most cases, this is just an encoding size win, but
27040 /// sometimes we will collapse multiple generic shuffles into a single
27041 /// special-purpose shuffle.
27042 /// 2) Look for sequences of shuffle instructions with 3 or more total
27043 /// instructions, and replace them with the slightly more expensive SSSE3
27044 /// PSHUFB instruction if available. We do this as the last combining step
27045 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27046 /// a suitable short sequence of other instructions. The PSHUFB will either
27047 /// use a register or have to read from memory and so is slightly (but only
27048 /// slightly) more expensive than the other shuffle instructions.
27050 /// Because this is inherently a quadratic operation (for each shuffle in
27051 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27052 /// This should never be an issue in practice as the shuffle lowering doesn't
27053 /// produce sequences of more than 8 instructions.
27055 /// FIXME: We will currently miss some cases where the redundant shuffling
27056 /// would simplify under the threshold for PSHUFB formation because of
27057 /// combine-ordering. To fix this, we should do the redundant instruction
27058 /// combining in this recursive walk.
27059 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27060 int SrcOpIndex, SDValue Root,
27061 ArrayRef<int> RootMask,
27062 int Depth, bool HasVariableMask,
27064 TargetLowering::DAGCombinerInfo &DCI,
27065 const X86Subtarget &Subtarget) {
27066 // Bound the depth of our recursive combine because this is ultimately
27067 // quadratic in nature.
27071 // Directly rip through bitcasts to find the underlying operand.
27072 SDValue Op = SrcOps[SrcOpIndex];
27073 Op = peekThroughOneUseBitcasts(Op);
27075 MVT VT = Op.getSimpleValueType();
27076 if (!VT.isVector())
27077 return false; // Bail if we hit a non-vector.
27079 assert(Root.getSimpleValueType().isVector() &&
27080 "Shuffles operate on vector types!");
27081 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27082 "Can only combine shuffles of the same vector register size.");
27084 // Extract target shuffle mask and resolve sentinels and inputs.
27085 SDValue Input0, Input1;
27086 SmallVector<int, 16> OpMask;
27087 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
27090 // Add the inputs to the Ops list, avoiding duplicates.
27091 SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
27093 int InputIdx0 = -1, InputIdx1 = -1;
27094 for (int i = 0, e = Ops.size(); i < e; ++i) {
27095 SDValue BC = peekThroughBitcasts(Ops[i]);
27096 if (Input0 && BC == peekThroughBitcasts(Input0))
27098 if (Input1 && BC == peekThroughBitcasts(Input1))
27102 if (Input0 && InputIdx0 < 0) {
27103 InputIdx0 = SrcOpIndex;
27104 Ops[SrcOpIndex] = Input0;
27106 if (Input1 && InputIdx1 < 0) {
27107 InputIdx1 = Ops.size();
27108 Ops.push_back(Input1);
27111 assert(((RootMask.size() > OpMask.size() &&
27112 RootMask.size() % OpMask.size() == 0) ||
27113 (OpMask.size() > RootMask.size() &&
27114 OpMask.size() % RootMask.size() == 0) ||
27115 OpMask.size() == RootMask.size()) &&
27116 "The smaller number of elements must divide the larger.");
27117 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27118 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27119 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27120 assert(((RootRatio == 1 && OpRatio == 1) ||
27121 (RootRatio == 1) != (OpRatio == 1)) &&
27122 "Must not have a ratio for both incoming and op masks!");
27124 SmallVector<int, 16> Mask;
27125 Mask.reserve(MaskWidth);
27127 // Merge this shuffle operation's mask into our accumulated mask. Note that
27128 // this shuffle's mask will be the first applied to the input, followed by the
27129 // root mask to get us all the way to the root value arrangement. The reason
27130 // for this order is that we are recursing up the operation chain.
27131 for (int i = 0; i < MaskWidth; ++i) {
27132 int RootIdx = i / RootRatio;
27133 if (RootMask[RootIdx] < 0) {
27134 // This is a zero or undef lane, we're done.
27135 Mask.push_back(RootMask[RootIdx]);
27139 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27141 // Just insert the scaled root mask value if it references an input other
27142 // than the SrcOp we're currently inserting.
27143 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27144 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27145 Mask.push_back(RootMaskedIdx);
27149 RootMaskedIdx %= MaskWidth;
27151 int OpIdx = RootMaskedIdx / OpRatio;
27152 if (OpMask[OpIdx] < 0) {
27153 // The incoming lanes are zero or undef, it doesn't matter which ones we
27155 Mask.push_back(OpMask[OpIdx]);
27159 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27160 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27161 OpMaskedIdx %= MaskWidth;
27163 if (OpMask[OpIdx] < (int)OpMask.size()) {
27164 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27165 OpMaskedIdx += InputIdx0 * MaskWidth;
27167 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27168 OpMaskedIdx += InputIdx1 * MaskWidth;
27171 Mask.push_back(OpMaskedIdx);
27174 // Handle the all undef/zero cases early.
27175 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27176 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27179 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27180 // TODO - should we handle the mixed zero/undef case as well? Just returning
27181 // a zero mask will lose information on undef elements, possibly reducing
27182 // future combine possibilities.
27183 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27184 Subtarget, DAG, SDLoc(Root)));
27188 // Remove unused shuffle source ops.
27189 SmallVector<SDValue, 8> UsedOps;
27190 for (int i = 0, e = Ops.size(); i < e; ++i) {
27191 int lo = UsedOps.size() * MaskWidth;
27192 int hi = lo + MaskWidth;
27193 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
27194 UsedOps.push_back(Ops[i]);
27197 for (int &M : Mask)
27201 assert(!UsedOps.empty() && "Shuffle with no inputs detected");
27204 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27206 // See if we can recurse into each shuffle source op (if it's a target shuffle).
27207 for (int i = 0, e = Ops.size(); i < e; ++i)
27208 if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
27209 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
27210 HasVariableMask, DAG, DCI, Subtarget))
27213 // Attempt to constant fold all of the constant source ops.
27214 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
27218 // We can only combine unary and binary shuffle mask cases.
27219 if (Ops.size() > 2)
27222 // Minor canonicalization of the accumulated shuffle mask to make it easier
27223 // to match below. All this does is detect masks with sequential pairs of
27224 // elements, and shrink them to the half-width mask. It does this in a loop
27225 // so it will reduce the size of the mask to the minimal width mask which
27226 // performs an equivalent shuffle.
27227 SmallVector<int, 16> WidenedMask;
27228 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
27229 Mask = std::move(WidenedMask);
27232 // Canonicalization of binary shuffle masks to improve pattern matching by
27233 // commuting the inputs.
27234 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
27235 ShuffleVectorSDNode::commuteMask(Mask);
27236 std::swap(Ops[0], Ops[1]);
27239 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
27243 /// \brief Get the PSHUF-style mask from PSHUF node.
27245 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
27246 /// PSHUF-style masks that can be reused with such instructions.
27247 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
27248 MVT VT = N.getSimpleValueType();
27249 SmallVector<int, 4> Mask;
27250 SmallVector<SDValue, 2> Ops;
27253 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
27257 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
27258 // matter. Check that the upper masks are repeats and remove them.
27259 if (VT.getSizeInBits() > 128) {
27260 int LaneElts = 128 / VT.getScalarSizeInBits();
27262 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
27263 for (int j = 0; j < LaneElts; ++j)
27264 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
27265 "Mask doesn't repeat in high 128-bit lanes!");
27267 Mask.resize(LaneElts);
27270 switch (N.getOpcode()) {
27271 case X86ISD::PSHUFD:
27273 case X86ISD::PSHUFLW:
27276 case X86ISD::PSHUFHW:
27277 Mask.erase(Mask.begin(), Mask.begin() + 4);
27278 for (int &M : Mask)
27282 llvm_unreachable("No valid shuffle instruction found!");
27286 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
27288 /// We walk up the chain and look for a combinable shuffle, skipping over
27289 /// shuffles that we could hoist this shuffle's transformation past without
27290 /// altering anything.
27292 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
27294 TargetLowering::DAGCombinerInfo &DCI) {
27295 assert(N.getOpcode() == X86ISD::PSHUFD &&
27296 "Called with something other than an x86 128-bit half shuffle!");
27299 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
27300 // of the shuffles in the chain so that we can form a fresh chain to replace
27302 SmallVector<SDValue, 8> Chain;
27303 SDValue V = N.getOperand(0);
27304 for (; V.hasOneUse(); V = V.getOperand(0)) {
27305 switch (V.getOpcode()) {
27307 return SDValue(); // Nothing combined!
27310 // Skip bitcasts as we always know the type for the target specific
27314 case X86ISD::PSHUFD:
27315 // Found another dword shuffle.
27318 case X86ISD::PSHUFLW:
27319 // Check that the low words (being shuffled) are the identity in the
27320 // dword shuffle, and the high words are self-contained.
27321 if (Mask[0] != 0 || Mask[1] != 1 ||
27322 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
27325 Chain.push_back(V);
27328 case X86ISD::PSHUFHW:
27329 // Check that the high words (being shuffled) are the identity in the
27330 // dword shuffle, and the low words are self-contained.
27331 if (Mask[2] != 2 || Mask[3] != 3 ||
27332 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
27335 Chain.push_back(V);
27338 case X86ISD::UNPCKL:
27339 case X86ISD::UNPCKH:
27340 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
27341 // shuffle into a preceding word shuffle.
27342 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
27343 V.getSimpleValueType().getVectorElementType() != MVT::i16)
27346 // Search for a half-shuffle which we can combine with.
27347 unsigned CombineOp =
27348 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
27349 if (V.getOperand(0) != V.getOperand(1) ||
27350 !V->isOnlyUserOf(V.getOperand(0).getNode()))
27352 Chain.push_back(V);
27353 V = V.getOperand(0);
27355 switch (V.getOpcode()) {
27357 return SDValue(); // Nothing to combine.
27359 case X86ISD::PSHUFLW:
27360 case X86ISD::PSHUFHW:
27361 if (V.getOpcode() == CombineOp)
27364 Chain.push_back(V);
27368 V = V.getOperand(0);
27372 } while (V.hasOneUse());
27375 // Break out of the loop if we break out of the switch.
27379 if (!V.hasOneUse())
27380 // We fell out of the loop without finding a viable combining instruction.
27383 // Merge this node's mask and our incoming mask.
27384 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27385 for (int &M : Mask)
27387 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
27388 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27390 // Rebuild the chain around this new shuffle.
27391 while (!Chain.empty()) {
27392 SDValue W = Chain.pop_back_val();
27394 if (V.getValueType() != W.getOperand(0).getValueType())
27395 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
27397 switch (W.getOpcode()) {
27399 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
27401 case X86ISD::UNPCKL:
27402 case X86ISD::UNPCKH:
27403 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
27406 case X86ISD::PSHUFD:
27407 case X86ISD::PSHUFLW:
27408 case X86ISD::PSHUFHW:
27409 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
27413 if (V.getValueType() != N.getValueType())
27414 V = DAG.getBitcast(N.getValueType(), V);
27416 // Return the new chain to replace N.
27420 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
27423 /// We walk up the chain, skipping shuffles of the other half and looking
27424 /// through shuffles which switch halves trying to find a shuffle of the same
27425 /// pair of dwords.
27426 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
27428 TargetLowering::DAGCombinerInfo &DCI) {
27430 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
27431 "Called with something other than an x86 128-bit half shuffle!");
27433 unsigned CombineOpcode = N.getOpcode();
27435 // Walk up a single-use chain looking for a combinable shuffle.
27436 SDValue V = N.getOperand(0);
27437 for (; V.hasOneUse(); V = V.getOperand(0)) {
27438 switch (V.getOpcode()) {
27440 return false; // Nothing combined!
27443 // Skip bitcasts as we always know the type for the target specific
27447 case X86ISD::PSHUFLW:
27448 case X86ISD::PSHUFHW:
27449 if (V.getOpcode() == CombineOpcode)
27452 // Other-half shuffles are no-ops.
27455 // Break out of the loop if we break out of the switch.
27459 if (!V.hasOneUse())
27460 // We fell out of the loop without finding a viable combining instruction.
27463 // Combine away the bottom node as its shuffle will be accumulated into
27464 // a preceding shuffle.
27465 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27467 // Record the old value.
27470 // Merge this node's mask and our incoming mask (adjusted to account for all
27471 // the pshufd instructions encountered).
27472 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27473 for (int &M : Mask)
27475 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
27476 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27478 // Check that the shuffles didn't cancel each other out. If not, we need to
27479 // combine to the new one.
27481 // Replace the combinable shuffle with the combined one, updating all users
27482 // so that we re-evaluate the chain here.
27483 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
27488 /// \brief Try to combine x86 target specific shuffles.
27489 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
27490 TargetLowering::DAGCombinerInfo &DCI,
27491 const X86Subtarget &Subtarget) {
27493 MVT VT = N.getSimpleValueType();
27494 SmallVector<int, 4> Mask;
27496 unsigned Opcode = N.getOpcode();
27498 case X86ISD::PSHUFD:
27499 case X86ISD::PSHUFLW:
27500 case X86ISD::PSHUFHW:
27501 Mask = getPSHUFShuffleMask(N);
27502 assert(Mask.size() == 4);
27504 case X86ISD::UNPCKL: {
27505 auto Op0 = N.getOperand(0);
27506 auto Op1 = N.getOperand(1);
27507 unsigned Opcode0 = Op0.getOpcode();
27508 unsigned Opcode1 = Op1.getOpcode();
27510 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
27511 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
27512 // TODO: Add other horizontal operations as required.
27513 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
27514 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
27516 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
27517 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
27518 // moves the upper half elements into the lower half. For example:
27520 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
27522 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
27524 // will be combined to:
27526 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
27528 // This is done only for 128-bit vectors. From SSE4.1 onward this combine may
27529 // not happen because more advanced instructions are available.
27530 if (!VT.is128BitVector())
27533 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
27534 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
27536 unsigned NumElts = VT.getVectorNumElements();
27537 SmallVector<int, 8> ExpectedMask(NumElts, -1);
27538 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
27541 auto ShufOp = Op1.getOperand(0);
27542 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
27543 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
27547 case X86ISD::BLENDI: {
27548 SDValue V0 = N->getOperand(0);
27549 SDValue V1 = N->getOperand(1);
27550 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
27551 "Unexpected input vector types");
27553 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
27554 // operands and changing the mask to 1. This saves us a bunch of
27555 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
27556 // X86InstrInfo knows how to commute this back after instruction selection
27557 // if it would help register allocation.
27559 // TODO: If optimizing for size or a processor that doesn't suffer from
27560 // partial register update stalls, this should be transformed into a MOVSD
27561 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
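// For example:
//   (v2f64 blendi V0, V1, 2)   ; elt0 from V0, elt1 from V1
// becomes
//   (v2f64 blendi V1, V0, 1)
// which selects the same elements with the operands swapped.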
27563 if (VT == MVT::v2f64)
27564 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
27565 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
27566 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
27567 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
27572 case X86ISD::MOVSD:
27573 case X86ISD::MOVSS: {
27574 bool isFloat = VT.isFloatingPoint();
27575 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
27576 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
27577 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
27578 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
27579 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
27580 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
27581 assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
27583 // We often lower to MOVSD/MOVSS from integer as well as native float
27584 // types; remove unnecessary domain-crossing bitcasts if we can to make it
27585 // easier to combine shuffles later on. We've already accounted for the
27586 // domain switching cost when we decided to lower with it.
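// For example:
//   (v2i64 movsd (bitcast (v2f64 A)), (bitcast (v2f64 B)))
// becomes
//   (v2i64 bitcast (v2f64 movsd A, B))
// keeping the whole operation in the floating-point domain.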
27587 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
27588 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
27589 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
27590 V0 = DAG.getBitcast(NewVT, V0);
27591 V1 = DAG.getBitcast(NewVT, V1);
27592 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
27597 case X86ISD::INSERTPS: {
27598 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
27599 SDValue Op0 = N.getOperand(0);
27600 SDValue Op1 = N.getOperand(1);
27601 SDValue Op2 = N.getOperand(2);
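// The INSERTPS immediate encodes the source element index in bits [7:6], the
// destination element index in bits [5:4], and a mask of result elements to
// zero in bits [3:0].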
27602 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
27603 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
27604 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
27605 unsigned ZeroMask = InsertPSMask & 0xF;
27607 // If we zero out all elements from Op0 then we don't need to reference it.
27608 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
27609 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
27610 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27612 // If we zero out the element from Op1 then we don't need to reference it.
27613 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
27614 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27615 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27617 // Attempt to merge insertps Op1 with an inner target shuffle node.
27618 SmallVector<int, 8> TargetMask1;
27619 SmallVector<SDValue, 2> Ops1;
27620 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
27621 int M = TargetMask1[SrcIdx];
27622 if (isUndefOrZero(M)) {
27623 // Zero/UNDEF insertion - zero out element and remove dependency.
27624 InsertPSMask |= (1u << DstIdx);
27625 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27626 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27628 // Update insertps mask srcidx and reference the source input directly.
27629 assert(0 <= M && M < 8 && "Shuffle index out of range");
27630 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
27631 Op1 = Ops1[M < 4 ? 0 : 1];
27632 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27633 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27636 // Attempt to merge insertps Op0 with an inner target shuffle node.
27637 SmallVector<int, 8> TargetMask0;
27638 SmallVector<SDValue, 2> Ops0;
27639 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
27642 bool Updated = false;
27643 bool UseInput00 = false;
27644 bool UseInput01 = false;
27645 for (int i = 0; i != 4; ++i) {
27646 int M = TargetMask0[i];
27647 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
27648 // No change if element is already zero or the inserted element.
27650 } else if (isUndefOrZero(M)) {
27651 // If the target mask is undef/zero then we must zero the element.
27652 InsertPSMask |= (1u << i);
27657 // The input vector element must be inline.
27658 if (M != i && M != (i + 4))
27661 // Determine which inputs of the target shuffle we're using.
27662 UseInput00 |= (0 <= M && M < 4);
27663 UseInput01 |= (4 <= M);
27666 // If we're not using both inputs of the target shuffle then use the
27667 // referenced input directly.
27668 if (UseInput00 && !UseInput01) {
27671 } else if (!UseInput00 && UseInput01) {
27677 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27678 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27686 // Nuke no-op shuffles that show up after combining.
27687 if (isNoopShuffleMask(Mask))
27688 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27690 // Look for simplifications involving one or two shuffle instructions.
27691 SDValue V = N.getOperand(0);
27692 switch (N.getOpcode()) {
27695 case X86ISD::PSHUFLW:
27696 case X86ISD::PSHUFHW:
27697 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
27699 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
27700 return SDValue(); // We combined away this shuffle, so we're done.
27702 // See if this reduces to a PSHUFD which is no more expensive and can
27703 // combine with more operations. Note that it has to at least flip the
27704 // dwords as otherwise it would have been removed as a no-op.
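// For example, on v8i16:
//   (pshuflw X, <2,3,0,1>) --> (pshufd (bitcast X to v4i32), <1,0,2,3>)
// because swapping the two low word pairs is the same as swapping the two
// low dwords.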
27705 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
27706 int DMask[] = {0, 1, 2, 3};
27707 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
27708 DMask[DOffset + 0] = DOffset + 1;
27709 DMask[DOffset + 1] = DOffset + 0;
27710 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27711 V = DAG.getBitcast(DVT, V);
27712 DCI.AddToWorklist(V.getNode());
27713 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
27714 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
27715 DCI.AddToWorklist(V.getNode());
27716 return DAG.getBitcast(VT, V);
27719 // Look for shuffle patterns which can be implemented as a single unpack.
27720 // FIXME: This doesn't handle the location of the PSHUFD generically, and
27721 // only works when we have a PSHUFD followed by two half-shuffles.
27722 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
27723 (V.getOpcode() == X86ISD::PSHUFLW ||
27724 V.getOpcode() == X86ISD::PSHUFHW) &&
27725 V.getOpcode() != N.getOpcode() &&
27727 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
27728 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
27729 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27730 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
27731 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27732 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27734 for (int i = 0; i < 4; ++i) {
27735 WordMask[i + NOffset] = Mask[i] + NOffset;
27736 WordMask[i + VOffset] = VMask[i] + VOffset;
27738 // Map the word mask through the DWord mask.
27740 for (int i = 0; i < 8; ++i)
27741 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
27742 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
27743 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
27744 // We can replace all three shuffles with an unpack.
27745 V = DAG.getBitcast(VT, D.getOperand(0));
27746 DCI.AddToWorklist(V.getNode());
27747 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
27756 case X86ISD::PSHUFD:
27757 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
27766 /// \brief Try to combine a shuffle into a target-specific add-sub node.
27768 /// We combine this directly on the abstract vector shuffle nodes so it is
27769 /// easier to generically match. We also insert dummy vector shuffle nodes for
27770 /// the operands, explicitly discarding the lanes that are unused by this
27771 /// operation, so that the fact that they're unused can flow through the
27772 /// rest of the combiner.
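///
/// For example, for v4f32:
///   (shuffle (fsub A, B), (fadd A, B), <0, 5, 2, 7>) --> (addsub A, B)
/// taking the subtracted values in the even lanes and the added values in the
/// odd lanes.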
27773 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
27774 SelectionDAG &DAG) {
27776 EVT VT = N->getValueType(0);
27777 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
27778 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
27781 // We only handle target-independent shuffles.
27782 // FIXME: It would be easy and harmless to use the target shuffle mask
27783 // extraction tool to support more.
27784 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
27787 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
27788 SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
27790 SDValue V1 = N->getOperand(0);
27791 SDValue V2 = N->getOperand(1);
27793 // We require the first shuffle operand to be the FSUB node, and the second to
27794 // be the FADD node.
27795 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
27796 ShuffleVectorSDNode::commuteMask(Mask);
27798 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
27801 // If there are other uses of these operations we can't fold them.
27802 if (!V1->hasOneUse() || !V2->hasOneUse())
27805 // Ensure that both operations have the same operands. Note that we can
27806 // commute the FADD operands.
27807 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
27808 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
27809 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
27812 // We're looking for blends between FADD and FSUB nodes. We insist on these
27813 // nodes being lined up in a specific expected pattern.
27814 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
27815 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
27816 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
27819 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
27822 // We are looking for a shuffle where both sources are concatenated with undef
27823 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
27824 // if we can express this as a single-source shuffle, that's preferable.
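// For example, with 2-element sources t1 and t2 and a v4i64 result:
//   (shuffle (concat t1, undef), (concat t2, undef), <0,4,1,5>)
// becomes
//   (shuffle (concat t1, t2), undef, <0,2,1,3>)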
27825 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
27826 const X86Subtarget &Subtarget) {
27827 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
27830 EVT VT = N->getValueType(0);
27832 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
27833 if (!VT.is128BitVector() && !VT.is256BitVector())
27836 if (VT.getVectorElementType() != MVT::i32 &&
27837 VT.getVectorElementType() != MVT::i64 &&
27838 VT.getVectorElementType() != MVT::f32 &&
27839 VT.getVectorElementType() != MVT::f64)
27842 SDValue N0 = N->getOperand(0);
27843 SDValue N1 = N->getOperand(1);
27845 // Check that both sources are concats with undef.
27846 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
27847 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
27848 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
27849 !N1.getOperand(1).isUndef())
27852 // Construct the new shuffle mask. Elements from the first source retain their
27853 // index, but elements from the second source no longer need to skip an undef.
27854 SmallVector<int, 8> Mask;
27855 int NumElts = VT.getVectorNumElements();
27857 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27858 for (int Elt : SVOp->getMask())
27859 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
27862 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
27864 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
27867 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
27868 TargetLowering::DAGCombinerInfo &DCI,
27869 const X86Subtarget &Subtarget) {
27871 EVT VT = N->getValueType(0);
27873 // Don't create instructions with illegal types after legalize types has run.
27874 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27875 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
27878 // If we have legalized the vector types, look for blends of FADD and FSUB
27879 // nodes that we can fuse into an ADDSUB node.
27880 if (TLI.isTypeLegal(VT))
27881 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
27884 // During Type Legalization, when promoting illegal vector types,
27885 // the backend might introduce new shuffle dag nodes and bitcasts.
27887 // This code performs the following transformation:
27888 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
27889 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
27891 // We do this only if both the bitcast and the BINOP dag nodes have
27892 // one use. Also, perform this transformation only if the new binary
27893 // operation is legal. This is to avoid introducing dag nodes that
27894 // potentially need to be further expanded (or custom lowered) into a
27895 // less optimal sequence of dag nodes.
27896 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
27897 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
27898 N->getOperand(0).getOpcode() == ISD::BITCAST &&
27899 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
27900 SDValue N0 = N->getOperand(0);
27901 SDValue N1 = N->getOperand(1);
27903 SDValue BC0 = N0.getOperand(0);
27904 EVT SVT = BC0.getValueType();
27905 unsigned Opcode = BC0.getOpcode();
27906 unsigned NumElts = VT.getVectorNumElements();
27908 if (BC0.hasOneUse() && SVT.isVector() &&
27909 SVT.getVectorNumElements() * 2 == NumElts &&
27910 TLI.isOperationLegal(Opcode, VT)) {
27911 bool CanFold = false;
27917 // isOperationLegal lies for integer ops on floating point types.
27918 CanFold = VT.isInteger();
27923 // isOperationLegal lies for floating point ops on integer types.
27924 CanFold = VT.isFloatingPoint();
27928 unsigned SVTNumElts = SVT.getVectorNumElements();
27929 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27930 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
27931 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
27932 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
27933 CanFold = SVOp->getMaskElt(i) < 0;
27936 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
27937 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
27938 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
27939 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
27944 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
27945 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
27946 // consecutive, non-overlapping, and in the right order.
27947 SmallVector<SDValue, 16> Elts;
27948 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
27949 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
27951 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
27954 // For AVX2, we sometimes want to combine
27955 // (vector_shuffle <mask> (concat_vectors t1, undef)
27956 // (concat_vectors t2, undef))
27958 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
27959 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
27960 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
27963 if (isTargetShuffle(N->getOpcode())) {
27965 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
27968 // Try recursively combining arbitrary sequences of x86 shuffle
27969 // instructions into higher-order shuffles. We do this after combining
27970 // specific PSHUF instruction sequences into their minimal form so that we
27971 // can evaluate how many specialized shuffle instructions are involved in
27972 // a particular chain.
27973 SmallVector<int, 1> NonceMask; // Just a placeholder.
27974 NonceMask.push_back(0);
27975 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
27976 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
27978 return SDValue(); // This routine will use CombineTo to replace N.
27984 /// Check if a vector extract from a target-specific shuffle of a load can be
27985 /// folded into a single element load.
27986 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
27987 /// shuffles have been custom lowered so we need to handle those here.
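///
/// For example, element 0 of
///   (extract_vector_elt (v4i32 pshufd (load addr), <1,0,3,2>), 0)
/// comes from element 1 of the load, so the extract can be rewritten in terms
/// of a generic shuffle that the DAG combiner can then fold to a scalar load.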
27988 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
27989 TargetLowering::DAGCombinerInfo &DCI) {
27990 if (DCI.isBeforeLegalizeOps())
27993 SDValue InVec = N->getOperand(0);
27994 SDValue EltNo = N->getOperand(1);
27995 EVT EltVT = N->getValueType(0);
27997 if (!isa<ConstantSDNode>(EltNo))
28000 EVT OriginalVT = InVec.getValueType();
28002 if (InVec.getOpcode() == ISD::BITCAST) {
28003 // Don't duplicate a load with other uses.
28004 if (!InVec.hasOneUse())
28006 EVT BCVT = InVec.getOperand(0).getValueType();
28007 if (!BCVT.isVector() ||
28008 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28010 InVec = InVec.getOperand(0);
28013 EVT CurrentVT = InVec.getValueType();
28015 if (!isTargetShuffle(InVec.getOpcode()))
28018 // Don't duplicate a load with other uses.
28019 if (!InVec.hasOneUse())
28022 SmallVector<int, 16> ShuffleMask;
28023 SmallVector<SDValue, 2> ShuffleOps;
28025 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28026 ShuffleOps, ShuffleMask, UnaryShuffle))
28029 // Select the input vector, guarding against an out-of-range extract index.
28030 unsigned NumElems = CurrentVT.getVectorNumElements();
28031 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28032 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28034 if (Idx == SM_SentinelZero)
28035 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28036 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28037 if (Idx == SM_SentinelUndef)
28038 return DAG.getUNDEF(EltVT);
28040 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28041 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28044 // If inputs to shuffle are the same for both ops, then allow 2 uses
28045 unsigned AllowedUses =
28046 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28048 if (LdNode.getOpcode() == ISD::BITCAST) {
28049 // Don't duplicate a load with other uses.
28050 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28053 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28054 LdNode = LdNode.getOperand(0);
28057 if (!ISD::isNormalLoad(LdNode.getNode()))
28060 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28062 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28065 // If there's a bitcast before the shuffle, check if the load type and
28066 // alignment are valid.
28067 unsigned Align = LN0->getAlignment();
28068 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28069 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28070 EltVT.getTypeForEVT(*DAG.getContext()));
28072 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28075 // All checks match so transform back to vector_shuffle so that DAG combiner
28076 // can finish the job
28079 // Create a shuffle node, taking into account the case that it's a unary shuffle.
28080 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28081 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28083 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28084 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28088 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28089 const X86Subtarget &Subtarget) {
28090 SDValue N0 = N->getOperand(0);
28091 EVT VT = N->getValueType(0);
28093 // Detect bitcasts from i32 to the x86mmx low word. Since MMX types are
28094 // special and don't usually play with other vector types, it's better to
28095 // handle them early to be sure we emit efficient code by avoiding
28096 // store-load conversions.
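// For example:
//   (x86mmx (bitcast (v2i32 build_vector X, 0))) --> (MMX_MOVW2D X)
// when X is an i32 value.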
28097 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28098 N0.getValueType() == MVT::v2i32 &&
28099 isNullConstant(N0.getOperand(1))) {
28100 SDValue N00 = N0->getOperand(0);
28101 if (N00.getValueType() == MVT::i32)
28102 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28105 // Convert a bitcasted integer logic operation that has one bitcasted
28106 // floating-point operand into a floating-point logic operation. This may
28107 // create a load of a constant, but that is cheaper than materializing the
28108 // constant in an integer register and transferring it to an SSE register or
28109 // transferring the SSE operand to integer register and back.
28111 switch (N0.getOpcode()) {
28112 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28113 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28114 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28115 default: return SDValue();
28118 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28119 (Subtarget.hasSSE2() && VT == MVT::f64)))
28122 SDValue LogicOp0 = N0.getOperand(0);
28123 SDValue LogicOp1 = N0.getOperand(1);
28126 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28127 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28128 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28129 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28130 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28131 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28133 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28134 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28135 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28136 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28137 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28138 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28144 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28145 // the elements of a vector.
28146 // Returns the vector that is being reduced on, or SDValue() if a reduction
28147 // was not matched.
28148 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28149 // The pattern must end in an extract from index 0.
28150 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28151 !isNullConstant(Extract->getOperand(1)))
28155 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28157 SDValue Op = Extract->getOperand(0);
28158 // At each stage, we're looking for something that looks like:
28159 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28160 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28161 // i32 undef, i32 undef, i32 undef, i32 undef>
28162 // %a = binop <8 x i32> %op, %s
28163 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28164 // we expect something like:
28165 // <4,5,6,7,u,u,u,u>
28166 // <2,3,u,u,u,u,u,u>
28167 // <1,u,u,u,u,u,u,u>
28168 for (unsigned i = 0; i < Stages; ++i) {
28169 if (Op.getOpcode() != BinOp)
28172 ShuffleVectorSDNode *Shuffle =
28173 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28175 Op = Op.getOperand(1);
28177 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28178 Op = Op.getOperand(0);
28181 // The first operand of the shuffle should be the same as the other operand
28183 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28186 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28187 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28188 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
28195 // Given a select, detect the following pattern:
28196 // 1: %2 = zext <N x i8> %0 to <N x i32>
28197 // 2: %3 = zext <N x i8> %1 to <N x i32>
28198 // 3: %4 = sub nsw <N x i32> %2, %3
28199 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
28200 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
28201 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
28202 // This is useful as it is the input into a SAD pattern.
28203 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
28205 // Check that the condition of the select instruction is greater-than.
28206 SDValue SetCC = Select->getOperand(0);
28207 if (SetCC.getOpcode() != ISD::SETCC)
28209 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
28210 if (CC != ISD::SETGT)
28213 SDValue SelectOp1 = Select->getOperand(1);
28214 SDValue SelectOp2 = Select->getOperand(2);
28216 // The second operand of the select should be the negation of the first
28217 // operand, which is implemented as 0 - SelectOp1.
28218 if (!(SelectOp2.getOpcode() == ISD::SUB &&
28219 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
28220 SelectOp2.getOperand(1) == SelectOp1))
28223 // The first operand of SetCC is the first operand of the select, which is the
28224 // difference between the two input vectors.
28225 if (SetCC.getOperand(0) != SelectOp1)
28228 // The second operand of the comparison can be either -1 or 0.
28229 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
28230 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
28233 // The first operand of the select is the difference between the two input
28235 if (SelectOp1.getOpcode() != ISD::SUB)
28238 Op0 = SelectOp1.getOperand(0);
28239 Op1 = SelectOp1.getOperand(1);
28241 // Check if the operands of the sub are zero-extended from vectors of i8.
28242 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
28243 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
28244 Op1.getOpcode() != ISD::ZERO_EXTEND ||
28245 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
28251 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
28253 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
28254 const SDValue &Zext1, const SDLoc &DL) {
28256 // Find the appropriate width for the PSADBW.
28257 EVT InVT = Zext0.getOperand(0).getValueType();
28258 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
28260 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
28261 // fill in the missing vector elements with 0.
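// For example, a v8i8 input is concatenated with a single v8i8 zero vector to
// form the v16i8 operand that PSADBW expects; the zero lanes contribute
// nothing to the sums of absolute differences.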
28262 unsigned NumConcat = RegSize / InVT.getSizeInBits();
28263 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
28264 Ops[0] = Zext0.getOperand(0);
28265 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
28266 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28267 Ops[0] = Zext1.getOperand(0);
28268 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28270 // Actually build the SAD
28271 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
28272 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
28275 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
28276 const X86Subtarget &Subtarget) {
28277 // PSADBW is only supported on SSE2 and up.
28278 if (!Subtarget.hasSSE2())
28281 // Verify the type we're extracting from is appropriate
28282 // TODO: There's nothing special about i32, any integer type above i16 should
28283 // work just as well.
28284 EVT VT = Extract->getOperand(0).getValueType();
28285 if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
28288 unsigned RegSize = 128;
28289 if (Subtarget.hasBWI())
28291 else if (Subtarget.hasAVX2())
28294 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
28295 // TODO: We should be able to handle larger vectors by splitting them before
28296 // feeding them into several SADs, and then reducing over those.
28297 if (VT.getSizeInBits() / 4 > RegSize)
28300 // Match shuffle + add pyramid.
28301 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
28303 // If there was a match, we want Root to be a select that is the root of an
28304 // abs-diff pattern.
28305 if (!Root || (Root.getOpcode() != ISD::VSELECT))
28308 // Check whether we have an abs-diff pattern feeding into the select.
28309 SDValue Zext0, Zext1;
28310 if (!detectZextAbsDiff(Root, Zext0, Zext1))
28313 // Create the SAD instruction
28315 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
28317 // If the original vector was wider than 8 elements, sum over the results
28318 // in the SAD vector.
28319 unsigned Stages = Log2_32(VT.getVectorNumElements());
28320 MVT SadVT = SAD.getSimpleValueType();
28322 unsigned SadElems = SadVT.getVectorNumElements();
28324 for (unsigned i = Stages - 3; i > 0; --i) {
28325 SmallVector<int, 16> Mask(SadElems, -1);
28326 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
28327 Mask[j] = MaskEnd + j;
28330 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
28331 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
28335 // Return the lowest i32.
28336 MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
28337 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
28338 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
28339 Extract->getOperand(1));
28342 /// Detect vector gather/scatter index generation and convert it from being a
28343 /// bunch of shuffles and extracts into a somewhat faster sequence.
28344 /// For i686, the best sequence is apparently storing the value and loading
28345 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
28346 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
28347 TargetLowering::DAGCombinerInfo &DCI,
28348 const X86Subtarget &Subtarget) {
28349 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
28352 SDValue InputVector = N->getOperand(0);
28353 SDLoc dl(InputVector);
28354 // Detect mmx to i32 conversion through a v2i32 elt extract.
28355 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
28356 N->getValueType(0) == MVT::i32 &&
28357 InputVector.getValueType() == MVT::v2i32 &&
28358 isa<ConstantSDNode>(N->getOperand(1)) &&
28359 N->getConstantOperandVal(1) == 0) {
28360 SDValue MMXSrc = InputVector.getOperand(0);
28362 // The bitcast source is a direct mmx result.
28363 if (MMXSrc.getValueType() == MVT::x86mmx)
28364 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
28367 EVT VT = N->getValueType(0);
28369 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
28370 InputVector.getOpcode() == ISD::BITCAST &&
28371 isa<ConstantSDNode>(InputVector.getOperand(0))) {
28372 uint64_t ExtractedElt =
28373 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
28374 uint64_t InputValue =
28375 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
28376 uint64_t Res = (InputValue >> ExtractedElt) & 1;
28377 return DAG.getConstant(Res, dl, MVT::i1);
28380 // Check whether this extract is the root of a sum of absolute differences
28381 // pattern. This has to be done here because we really want it to happen
28382 // pre-legalization.
28383 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
28386 // Only operate on vectors of 4 elements, where the alternative shuffling
28387 // gets to be more expensive.
28388 if (InputVector.getValueType() != MVT::v4i32)
28391 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
28392 // single use which is a sign-extend or zero-extend, and all elements are
28394 SmallVector<SDNode *, 4> Uses;
28395 unsigned ExtractedElements = 0;
28396 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
28397 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
28398 if (UI.getUse().getResNo() != InputVector.getResNo())
28401 SDNode *Extract = *UI;
28402 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28405 if (Extract->getValueType(0) != MVT::i32)
28407 if (!Extract->hasOneUse())
28409 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
28410 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
28412 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
28415 // Record which element was extracted.
28416 ExtractedElements |=
28417 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
28419 Uses.push_back(Extract);
28422 // If not all the elements were used, this may not be worthwhile.
28423 if (ExtractedElements != 15)
28426 // Ok, we've now decided to do the transformation.
28427 // If 64-bit shifts are legal, use the extract-shift sequence,
28428 // otherwise bounce the vector off the cache.
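// For example, with the extract-shift sequence, element 1 of the v4i32 becomes
//   (trunc (sra (extract_vector_elt (bitcast v2i64), 0), 32))
// i.e. the high half of the low 64-bit element.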
28429 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28432 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
28433 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
28434 auto &DL = DAG.getDataLayout();
28435 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
28436 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28437 DAG.getConstant(0, dl, VecIdxTy));
28438 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28439 DAG.getConstant(1, dl, VecIdxTy));
28441 SDValue ShAmt = DAG.getConstant(
28442 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
28443 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
28444 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28445 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
28446 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
28447 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28448 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
28450 // Store the value to a temporary stack slot.
28451 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
28452 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
28453 MachinePointerInfo());
28455 EVT ElementType = InputVector.getValueType().getVectorElementType();
28456 unsigned EltSize = ElementType.getSizeInBits() / 8;
28458 // Replace each use (extract) with a load of the appropriate element.
28459 for (unsigned i = 0; i < 4; ++i) {
28460 uint64_t Offset = EltSize * i;
28461 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28462 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
28464 SDValue ScalarAddr =
28465 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
28467 // Load the scalar.
28469 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
28473 // Replace the extracts
28474 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
28475 UE = Uses.end(); UI != UE; ++UI) {
28476 SDNode *Extract = *UI;
28478 SDValue Idx = Extract->getOperand(1);
28479 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
28480 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
28483 // The replacement was made in place; don't return anything.
28487 /// If a vector select has an operand that is -1 or 0, simplify the select to a
28488 /// bitwise logic operation.
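///
/// For example, when the condition elements are the same width as the value
/// elements:
///   (vselect Cond, -1, X) --> (or Cond, X)
///   (vselect Cond, X, 0)  --> (and Cond, X)
/// modulo bitcasts between the condition and value types.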
28489 static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
28490 const X86Subtarget &Subtarget) {
28491 SDValue Cond = N->getOperand(0);
28492 SDValue LHS = N->getOperand(1);
28493 SDValue RHS = N->getOperand(2);
28494 EVT VT = LHS.getValueType();
28495 EVT CondVT = Cond.getValueType();
28497 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28499 if (N->getOpcode() != ISD::VSELECT)
28502 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28503 // Check if the first operand is all zeros. This situation only
28504 // applies to AVX512.
28505 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
28506 // Invert the cond to not(cond): xor(op, allones) = not(op)
28507 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28508 DAG.getConstant(1, DL, Cond.getValueType()));
28509 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
28510 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
28512 assert(CondVT.isVector() && "Vector select expects a vector selector!");
28514 // To use the condition operand as a bitwise mask, it must have elements that
28516 // are the same size as the select elements. I.e., the condition operand must
28516 // have already been promoted from the IR select condition type <N x i1>.
28517 // Don't check if the types themselves are equal because that excludes
28518 // vector floating-point selects.
28519 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
28522 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
28523 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
28525 // Try to invert the condition if true value is not all 1s and false value is
28527 if (!TValIsAllOnes && !FValIsAllZeros &&
28528 // Check if the selector will be produced by CMPP*/PCMP*.
28529 Cond.getOpcode() == ISD::SETCC &&
28530 // Check if SETCC has already been promoted.
28531 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
28533 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28534 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
28536 if (TValIsAllZeros || FValIsAllOnes) {
28537 SDValue CC = Cond.getOperand(2);
28538 ISD::CondCode NewCC =
28539 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
28540 Cond.getOperand(0).getValueType().isInteger());
28541 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
28543 std::swap(LHS, RHS);
28544 TValIsAllOnes = FValIsAllOnes;
28545 FValIsAllZeros = TValIsAllZeros;
28549 if (!TValIsAllOnes && !FValIsAllZeros)
28553 if (TValIsAllOnes && FValIsAllZeros)
28555 else if (TValIsAllOnes)
28556 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
28557 else if (FValIsAllZeros)
28558 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
28560 return DAG.getBitcast(VT, Ret);
28563 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
28564 SDValue Cond = N->getOperand(0);
28565 SDValue LHS = N->getOperand(1);
28566 SDValue RHS = N->getOperand(2);
28569 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
28570 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
28571 if (!TrueC || !FalseC)
28574 // Don't do this for crazy integer types.
28575 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
28578 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
28579 // so that TrueC (the true value) is larger than FalseC.
28580 bool NeedsCondInvert = false;
28581 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
28582 // Efficiently invertible.
28583 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
28584 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
28585 isa<ConstantSDNode>(Cond.getOperand(1))))) {
28586 NeedsCondInvert = true;
28587 std::swap(TrueC, FalseC);
28590 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
28591 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
28592 if (NeedsCondInvert) // Invert the condition if needed.
28593 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28594 DAG.getConstant(1, DL, Cond.getValueType()));
28596 // Zero extend the condition if needed.
28597 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
28599 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
28600 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
28601 DAG.getConstant(ShAmt, DL, MVT::i8));
28604 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
28605 if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
28606 if (NeedsCondInvert) // Invert the condition if needed.
28607 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28608 DAG.getConstant(1, DL, Cond.getValueType()));
28610 // Zero extend the condition if needed.
28611 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28612 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28613 SDValue(FalseC, 0));
28616 // Optimize cases that will turn into an LEA instruction. This requires
28617 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
28618 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
28619 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
28620 if (N->getValueType(0) == MVT::i32)
28621 Diff = (unsigned)Diff;
28623 bool isFastMultiplier = false;
28625 switch ((unsigned char)Diff) {
28628 case 1: // result = add base, cond
28629 case 2: // result = lea base( , cond*2)
28630 case 3: // result = lea base(cond, cond*2)
28631 case 4: // result = lea base( , cond*4)
28632 case 5: // result = lea base(cond, cond*4)
28633 case 8: // result = lea base( , cond*8)
28634 case 9: // result = lea base(cond, cond*8)
28635 isFastMultiplier = true;
28640 if (isFastMultiplier) {
28641 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
28642 if (NeedsCondInvert) // Invert the condition if needed.
28643 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28644 DAG.getConstant(1, DL, Cond.getValueType()));
28646 // Zero extend the condition if needed.
28647 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28648 // Scale the condition by the difference.
28650 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
28651 DAG.getConstant(Diff, DL, Cond.getValueType()));
28653 // Add the base if non-zero.
28654 if (FalseC->getAPIntValue() != 0)
28655 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28656 SDValue(FalseC, 0));
28664 // If this is a bitcasted op that can be represented as another type, push
28665 // the bitcast to the inputs. This allows more opportunities for pattern
28666 // matching masked instructions. This is called when we know that the operation
28667 // is used as one of the inputs of a vselect.
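// For example:
//   (v16i32 bitcast (v8i64 valign X, Y, 1))
// becomes
//   (v16i32 valign (bitcast X), (bitcast Y), 2)
// so the surrounding vselect can apply its v16i1 mask to the operation
// directly.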
28668 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
28669 TargetLowering::DAGCombinerInfo &DCI) {
28670 // Make sure we have a bitcast.
28671 if (OrigOp.getOpcode() != ISD::BITCAST)
28674 SDValue Op = OrigOp.getOperand(0);
28676 // If the operation is used by anything other than the bitcast, we shouldn't
28677 // do this combine as that would replicate the operation.
28678 if (!Op.hasOneUse())
28681 MVT VT = OrigOp.getSimpleValueType();
28682 MVT EltVT = VT.getVectorElementType();
28683 SDLoc DL(Op.getNode());
28685 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
28687 Op0 = DAG.getBitcast(VT, Op0);
28688 DCI.AddToWorklist(Op0.getNode());
28689 Op1 = DAG.getBitcast(VT, Op1);
28690 DCI.AddToWorklist(Op1.getNode());
28691 DCI.CombineTo(OrigOp.getNode(),
28692 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
28696 unsigned Opcode = Op.getOpcode();
28698 case X86ISD::PALIGNR:
28699 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
28700 if (!VT.is128BitVector())
28702 Opcode = X86ISD::VALIGN;
28704 case X86ISD::VALIGN: {
28705 if (EltVT != MVT::i32 && EltVT != MVT::i64)
28707 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28708 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28709 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
28710 unsigned EltSize = EltVT.getSizeInBits();
28711 // Make sure we can represent the same shift with the new VT.
28712 if ((ShiftAmt % EltSize) != 0)
28714 Imm = ShiftAmt / EltSize;
28715 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28716 DAG.getConstant(Imm, DL, MVT::i8));
28718 case X86ISD::SHUF128: {
28719 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
28721 // Only change element size, not type.
28722 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
28724 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28727 case ISD::INSERT_SUBVECTOR: {
28728 unsigned EltSize = EltVT.getSizeInBits();
28729 if (EltSize != 32 && EltSize != 64)
28731 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28732 // Only change element size, not type.
28733 if (VT.isInteger() != OpEltVT.isInteger())
28735 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28736 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
28737 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
28738 DCI.AddToWorklist(Op0.getNode());
28739 // Op1 needs to be bitcasted to a smaller vector with the same element type.
28740 SDValue Op1 = Op.getOperand(1);
28741 MVT Op1VT = MVT::getVectorVT(EltVT,
28742 Op1.getSimpleValueType().getSizeInBits() / EltSize);
28743 Op1 = DAG.getBitcast(Op1VT, Op1);
28744 DCI.AddToWorklist(Op1.getNode());
28745 DCI.CombineTo(OrigOp.getNode(),
28746 DAG.getNode(Opcode, DL, VT, Op0, Op1,
28747 DAG.getConstant(Imm, DL, MVT::i8)));
28750 case ISD::EXTRACT_SUBVECTOR: {
28751 unsigned EltSize = EltVT.getSizeInBits();
28752 if (EltSize != 32 && EltSize != 64)
28754 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28755 // Only change element size, not type.
28756 if (VT.isInteger() != OpEltVT.isInteger())
28758 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
28759 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
28760 // Op0 needs to be bitcasted to a larger vector with the same element type.
28761 SDValue Op0 = Op.getOperand(0);
28762 MVT Op0VT = MVT::getVectorVT(EltVT,
28763 Op0.getSimpleValueType().getSizeInBits() / EltSize);
28764 Op0 = DAG.getBitcast(Op0VT, Op0);
28765 DCI.AddToWorklist(Op0.getNode());
28766 DCI.CombineTo(OrigOp.getNode(),
28767 DAG.getNode(Opcode, DL, VT, Op0,
28768 DAG.getConstant(Imm, DL, MVT::i8)));
28776 /// Do target-specific dag combines on SELECT and VSELECT nodes.
28777 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
28778 TargetLowering::DAGCombinerInfo &DCI,
28779 const X86Subtarget &Subtarget) {
28781 SDValue Cond = N->getOperand(0);
28782 // Get the LHS/RHS of the select.
28783 SDValue LHS = N->getOperand(1);
28784 SDValue RHS = N->getOperand(2);
28785 EVT VT = LHS.getValueType();
28786 EVT CondVT = Cond.getValueType();
28787 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28789 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
28790 // instructions match the semantics of the common C idiom x<y?x:y but not
28791 // x<=y?x:y, because of how they handle negative zero (which can be
28792 // ignored in unsafe-math mode).
28793 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
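// For example, the common C idiom
//   x < y ? x : y
// can be turned into (fmin x, y), subject to the NaN and signed-zero checks
// below.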
28794 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
28795 VT != MVT::f80 && VT != MVT::f128 &&
28796 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
28797 (Subtarget.hasSSE2() ||
28798 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
28799 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28801 unsigned Opcode = 0;
28802 // Check for x CC y ? x : y.
28803 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28804 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28808 // Converting this to a min would handle NaNs incorrectly, and swapping
28809 // the operands would cause it to handle comparisons between positive
28810 // and negative zero incorrectly.
28811 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28812 if (!DAG.getTarget().Options.UnsafeFPMath &&
28813 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28815 std::swap(LHS, RHS);
28817 Opcode = X86ISD::FMIN;
28820 // Converting this to a min would handle comparisons between positive
28821 // and negative zero incorrectly.
28822 if (!DAG.getTarget().Options.UnsafeFPMath &&
28823 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28825 Opcode = X86ISD::FMIN;
28828 // Converting this to a min would handle both negative zeros and NaNs
28829 // incorrectly, but we can swap the operands to fix both.
28830 std::swap(LHS, RHS);
28834 Opcode = X86ISD::FMIN;
28838 // Converting this to a max would handle comparisons between positive
28839 // and negative zero incorrectly.
28840 if (!DAG.getTarget().Options.UnsafeFPMath &&
28841 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28843 Opcode = X86ISD::FMAX;
28846 // Converting this to a max would handle NaNs incorrectly, and swapping
28847 // the operands would cause it to handle comparisons between positive
28848 // and negative zero incorrectly.
28849 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28850 if (!DAG.getTarget().Options.UnsafeFPMath &&
28851 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28853 std::swap(LHS, RHS);
28855 Opcode = X86ISD::FMAX;
28858 // Converting this to a max would handle both negative zeros and NaNs
28859 // incorrectly, but we can swap the operands to fix both.
28860 std::swap(LHS, RHS);
28864 Opcode = X86ISD::FMAX;
28867 // Check for x CC y ? y : x -- a min/max with reversed arms.
28868 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
28869 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
28873 // Converting this to a min would handle comparisons between positive
28874 // and negative zero incorrectly, and swapping the operands would
28875 // cause it to handle NaNs incorrectly.
28876 if (!DAG.getTarget().Options.UnsafeFPMath &&
28877 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
28878 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28880 std::swap(LHS, RHS);
28882 Opcode = X86ISD::FMIN;
28885 // Converting this to a min would handle NaNs incorrectly.
28886 if (!DAG.getTarget().Options.UnsafeFPMath &&
28887 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
28889 Opcode = X86ISD::FMIN;
28892 // Converting this to a min would handle both negative zeros and NaNs
28893 // incorrectly, but we can swap the operands to fix both.
28894 std::swap(LHS, RHS);
28898 Opcode = X86ISD::FMIN;
28902 // Converting this to a max would handle NaNs incorrectly.
28903 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28905 Opcode = X86ISD::FMAX;
28908 // Converting this to a max would handle comparisons between positive
28909 // and negative zero incorrectly, and swapping the operands would
28910 // cause it to handle NaNs incorrectly.
28911 if (!DAG.getTarget().Options.UnsafeFPMath &&
28912 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
28913 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28915 std::swap(LHS, RHS);
28917 Opcode = X86ISD::FMAX;
28920 // Converting this to a max would handle both negative zeros and NaNs
28921 // incorrectly, but we can swap the operands to fix both.
28922 std::swap(LHS, RHS);
28926 Opcode = X86ISD::FMAX;
28932 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
28935 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
28936 // lowering on KNL. In this case we convert it to
28937 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
28938 // The same applies to all 128- and 256-bit vectors of i8 and i16.
28939 // From SKX onward these selects have a proper lowering.
28940 if (Subtarget.hasAVX512() && CondVT.isVector() &&
28941 CondVT.getVectorElementType() == MVT::i1 &&
28942 (VT.is128BitVector() || VT.is256BitVector()) &&
28943 (VT.getVectorElementType() == MVT::i8 ||
28944 VT.getVectorElementType() == MVT::i16) &&
28945 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
28946 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
28947 DCI.AddToWorklist(Cond.getNode());
28948 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
28951 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
28954 // Canonicalize max and min:
28955 // (x > y) ? x : y -> (x >= y) ? x : y
28956 // (x < y) ? x : y -> (x <= y) ? x : y
28957 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
28958 // the need for an extra compare
28959 // against zero. e.g.
28960 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
28962 // testl %edi, %edi
28964 // cmovgl %edi, %eax
28968 // cmovsl %eax, %edi
28969 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
28970 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28971 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28972 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28977 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
28978 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
28979 Cond.getOperand(0), Cond.getOperand(1), NewCC);
28980 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
28985 // Early exit check
28986 if (!TLI.isTypeLegal(VT))
28989 // Match VSELECTs into subs with unsigned saturation.
28990 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
28991 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
28992 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
28993 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
28994 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28996 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
28997 // left side invert the predicate to simplify logic below.
28999 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
29001 CC = ISD::getSetCCInverse(CC, true);
29002 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
29006 if (Other.getNode() && Other->getNumOperands() == 2 &&
29007 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
29008 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
29009 SDValue CondRHS = Cond->getOperand(1);
29011 // Look for a general sub with unsigned saturation first.
29012 // x >= y ? x-y : 0 --> subus x, y
29013 // x > y ? x-y : 0 --> subus x, y
29014 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
29015 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
29016 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
29018 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
29019 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
29020 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
29021 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
29022 // If the RHS is a constant we have to reverse the const
29023 // canonicalization.
29024 // x > C-1 ? x+-C : 0 --> subus x, C
29025 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
29026 CondRHSConst->getAPIntValue() ==
29027 (-OpRHSConst->getAPIntValue() - 1))
29028 return DAG.getNode(
29029 X86ISD::SUBUS, DL, VT, OpLHS,
29030 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
29032 // Another special case: If C was a sign bit, the sub has been
29033 // canonicalized into a xor.
29034 // FIXME: Would it be better to use computeKnownBits to determine
29035 // whether it's safe to decanonicalize the xor?
29036 // x s< 0 ? x^C : 0 --> subus x, C
29037 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
29038 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
29039 OpRHSConst->getAPIntValue().isSignBit())
29040 // Note that we have to rebuild the RHS constant here to ensure we
29041 // don't rely on particular values of undef lanes.
29042 return DAG.getNode(
29043 X86ISD::SUBUS, DL, VT, OpLHS,
29044 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
29049 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
29052 // If this is a *dynamic* select (non-constant condition) and we can match
29053 // this node with one of the variable blend instructions, restructure the
29054 // condition so that the blends can use the high bit of each element and use
29055 // SimplifyDemandedBits to simplify the condition operand.
29056 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
29057 !DCI.isBeforeLegalize() &&
29058 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
29059 unsigned BitWidth = Cond.getScalarValueSizeInBits();
29061 // Don't optimize vector selects that map to mask-registers.
29065 // We can only handle the cases where VSELECT is directly legal on the
29066 // subtarget. We custom lower VSELECT nodes with constant conditions and
29067 // this makes it hard to see whether a dynamic VSELECT will correctly
29068 // lower, so we both check the operation's status and explicitly handle the
29069 // cases where a *dynamic* blend will fail even though a constant-condition
29070 // blend could be custom lowered.
29071 // FIXME: We should find a better way to handle this class of problems.
29072 // Potentially, we should combine constant-condition vselect nodes
29073 // pre-legalization into shuffles and not mark as many types as custom
29075 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
29077 // FIXME: We don't support i16-element blends currently. We could and
29078 // should support them by making *all* the bits in the condition be set
29079 // rather than just the high bit and using an i8-element blend.
29080 if (VT.getVectorElementType() == MVT::i16)
29082 // Dynamic blending was only available from SSE4.1 onward.
29083 if (VT.is128BitVector() && !Subtarget.hasSSE41())
29085 // Byte blends are only available in AVX2
29086 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
29089 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
29090 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
29092 APInt KnownZero, KnownOne;
29093 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
29094 DCI.isBeforeLegalizeOps());
29095 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
29096 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
29098 // If we changed the computation somewhere in the DAG, this change
29099 // will affect all users of Cond.
29100 // Make sure it is fine and update all the nodes so that we do not
29101 // use the generic VSELECT anymore. Otherwise, we may perform
29102 // wrong optimizations as we messed up with the actual expectation
29103 // for the vector boolean values.
29104 if (Cond != TLO.Old) {
29105 // Check all uses of that condition operand to check whether it will be
29106 // consumed by non-BLEND instructions, which may depend on all bits being set or not.
29108 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29110 if (I->getOpcode() != ISD::VSELECT)
29111 // TODO: Add other opcodes eventually lowered into BLEND.
29114 // Update all the users of the condition, before committing the change,
29115 // so that the VSELECT optimizations that expect the correct vector
29116 // boolean value will not be triggered.
29117 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29119 DAG.ReplaceAllUsesOfValueWith(
29121 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
29122 Cond, I->getOperand(1), I->getOperand(2)));
29123 DCI.CommitTargetLoweringOpt(TLO);
29126 // At this point, only Cond is changed. Change the condition
29127 // just for N to keep the opportunity to optimize all other
29128 // users in their own way.
29129 DAG.ReplaceAllUsesOfValueWith(
29131 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
29132 TLO.New, N->getOperand(1), N->getOperand(2)));
29137 // Look for vselects with LHS/RHS being bitcasted from an operation that
29138 // can be executed on another type. Push the bitcast to the inputs of
29139 // the operation. This exposes opportunities for using masking instructions.
29140 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
29141 CondVT.getVectorElementType() == MVT::i1) {
29142 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
29143 return SDValue(N, 0);
29144 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
29145 return SDValue(N, 0);
29152 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
29154 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
29155 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
29156 /// Note that this is only legal for some op/cc combinations.
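/// Illustrative example (not from the original source): for code like
///   if ((int)x.fetch_add(1) < 0) ...
/// the compare of the returned (old) value against 0 under COND_S can reuse
/// the EFLAGS of a "lock add" tested with COND_LE, because old < 0 holds
/// exactly when old + 1 <= 0 under the chosen condition codes.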
29157 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
29158 SelectionDAG &DAG) {
29159 // This combine only operates on CMP-like nodes.
29160 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29161 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29164 // This only applies to variations of the common case:
29165 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
29166 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
29167 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
29168 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
29169 // Using the proper condcodes (see below), overflow is checked for.
29171 // FIXME: We can generalize both constraints:
29172 // - XOR/OR/AND (if they were made to survive AtomicExpand)
29174 // if the result is compared.
29176 SDValue CmpLHS = Cmp.getOperand(0);
29177 SDValue CmpRHS = Cmp.getOperand(1);
29179 if (!CmpLHS.hasOneUse())
29182 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
29183 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
29186 const unsigned Opc = CmpLHS.getOpcode();
29188 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
29191 SDValue OpRHS = CmpLHS.getOperand(2);
29192 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
29196 APInt Addend = OpRHSC->getAPIntValue();
29197 if (Opc == ISD::ATOMIC_LOAD_SUB)
29200 if (CC == X86::COND_S && Addend == 1)
29202 else if (CC == X86::COND_NS && Addend == 1)
29204 else if (CC == X86::COND_G && Addend == -1)
29206 else if (CC == X86::COND_LE && Addend == -1)
29211 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
29212 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
29213 DAG.getUNDEF(CmpLHS.getValueType()));
29214 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
29218 // Check whether a boolean test is testing a boolean value generated by
29219 // X86ISD::SETCC. If so, return the operand of that SETCC and the proper condition code.
29222 // Simplify the following patterns:
29223 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
29224 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
29225 // to (Op EFLAGS Cond)
29227 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
29228 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
29229 // to (Op EFLAGS !Cond)
29231 // where Op could be BRCOND or CMOV.
29233 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
29234 // This combine only operates on CMP-like nodes.
29235 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29236 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29239 // Quit if not used as a boolean value.
29240 if (CC != X86::COND_E && CC != X86::COND_NE)
29243 // Check CMP operands. One of them should be 0 or 1 and the other should be
29244 // an SetCC or extended from it.
29245 SDValue Op1 = Cmp.getOperand(0);
29246 SDValue Op2 = Cmp.getOperand(1);
29249 const ConstantSDNode* C = nullptr;
29250 bool needOppositeCond = (CC == X86::COND_E);
29251 bool checkAgainstTrue = false; // Is it a comparison against 1?
29253 if ((C = dyn_cast<ConstantSDNode>(Op1)))
29255 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
29257 else // Quit if all operands are not constants.
29260 if (C->getZExtValue() == 1) {
29261 needOppositeCond = !needOppositeCond;
29262 checkAgainstTrue = true;
29263 } else if (C->getZExtValue() != 0)
29264 // Quit if the constant is neither 0 nor 1.
29267 bool truncatedToBoolWithAnd = false;
29268 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
29269 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
29270 SetCC.getOpcode() == ISD::TRUNCATE ||
29271 SetCC.getOpcode() == ISD::AND) {
29272 if (SetCC.getOpcode() == ISD::AND) {
29274 if (isOneConstant(SetCC.getOperand(0)))
29276 if (isOneConstant(SetCC.getOperand(1)))
29280 SetCC = SetCC.getOperand(OpIdx);
29281 truncatedToBoolWithAnd = true;
29283 SetCC = SetCC.getOperand(0);
29286 switch (SetCC.getOpcode()) {
29287 case X86ISD::SETCC_CARRY:
29288 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
29289 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
29290 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
29291 // truncated to i1 using 'and'.
29292 if (checkAgainstTrue && !truncatedToBoolWithAnd)
29294 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
29295 "Invalid use of SETCC_CARRY!");
29297 case X86ISD::SETCC:
29298 // Set the condition code or opposite one if necessary.
29299 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
29300 if (needOppositeCond)
29301 CC = X86::GetOppositeBranchCondition(CC);
29302 return SetCC.getOperand(1);
29303 case X86ISD::CMOV: {
29304 // Check whether the false/true values are canonical, i.e. 0 or 1.
29305 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
29306 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
29307 // Quit if true value is not a constant.
29310 // Quit if false value is not a constant.
29312 SDValue Op = SetCC.getOperand(0);
29313 // Skip 'zext' or 'trunc' node.
29314 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
29315 Op.getOpcode() == ISD::TRUNCATE)
29316 Op = Op.getOperand(0);
29317 // A special case for rdrand/rdseed, where 0 is set if the false condition is found.
29319 if ((Op.getOpcode() != X86ISD::RDRAND &&
29320 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
29323 // Quit if false value is not the constant 0 or 1.
29324 bool FValIsFalse = true;
29325 if (FVal && FVal->getZExtValue() != 0) {
29326 if (FVal->getZExtValue() != 1)
29328 // If FVal is 1, opposite cond is needed.
29329 needOppositeCond = !needOppositeCond;
29330 FValIsFalse = false;
29332 // Quit if TVal is not the constant opposite of FVal.
29333 if (FValIsFalse && TVal->getZExtValue() != 1)
29335 if (!FValIsFalse && TVal->getZExtValue() != 0)
29337 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
29338 if (needOppositeCond)
29339 CC = X86::GetOppositeBranchCondition(CC);
29340 return SetCC.getOperand(3);
29347 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
29349 /// (X86or (X86setcc) (X86setcc))
29350 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
29351 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
29352 X86::CondCode &CC1, SDValue &Flags,
29354 if (Cond->getOpcode() == X86ISD::CMP) {
29355 if (!isNullConstant(Cond->getOperand(1)))
29358 Cond = Cond->getOperand(0);
29363 SDValue SetCC0, SetCC1;
29364 switch (Cond->getOpcode()) {
29365 default: return false;
29372 SetCC0 = Cond->getOperand(0);
29373 SetCC1 = Cond->getOperand(1);
29377 // Make sure we have SETCC nodes, using the same flags value.
29378 if (SetCC0.getOpcode() != X86ISD::SETCC ||
29379 SetCC1.getOpcode() != X86ISD::SETCC ||
29380 SetCC0->getOperand(1) != SetCC1->getOperand(1))
29383 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
29384 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
29385 Flags = SetCC0->getOperand(1);
29389 /// Optimize an EFLAGS definition used according to the condition code \p CC
29390 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
29391 /// uses of chain values.
29392 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
29393 SelectionDAG &DAG) {
29394 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
29396 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
29399 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
29400 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
29401 TargetLowering::DAGCombinerInfo &DCI,
29402 const X86Subtarget &Subtarget) {
29405 // If the flag operand isn't dead, don't touch this CMOV.
29406 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
29409 SDValue FalseOp = N->getOperand(0);
29410 SDValue TrueOp = N->getOperand(1);
29411 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
29412 SDValue Cond = N->getOperand(3);
29414 if (CC == X86::COND_E || CC == X86::COND_NE) {
29415 switch (Cond.getOpcode()) {
29419 // If the operand of BSR / BSF is proven to never be zero, then ZF cannot be set.
29420 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
29421 return (CC == X86::COND_E) ? FalseOp : TrueOp;
29425 // Try to simplify the EFLAGS and condition code operands.
29426 // We can't always do this as FCMOV only supports a subset of X86 cond.
29427 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
29428 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
29429 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
29431 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29435 // If this is a select between two integer constants, try to do some
29436 // optimizations. Note that the operands are ordered the opposite of SELECT operands.
29438 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
29439 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
29440 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
29441 // larger than FalseC (the false value).
29442 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
29443 CC = X86::GetOppositeBranchCondition(CC);
29444 std::swap(TrueC, FalseC);
29445 std::swap(TrueOp, FalseOp);
29448 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
29449 // This is efficient for any integer data type (including i8/i16) and shift amount.
29451 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29452 Cond = getSETCC(CC, Cond, DL, DAG);
29454 // Zero extend the condition if needed.
29455 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
29457 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29458 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
29459 DAG.getConstant(ShAmt, DL, MVT::i8));
29460 if (N->getNumValues() == 2) // Dead flag value?
29461 return DCI.CombineTo(N, Cond, SDValue());
29465 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
29466 // for any integer data type, including i8/i16.
29467 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
29468 Cond = getSETCC(CC, Cond, DL, DAG);
29470 // Zero extend the condition if needed.
29471 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
29472 FalseC->getValueType(0), Cond);
29473 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29474 SDValue(FalseC, 0));
29476 if (N->getNumValues() == 2) // Dead flag value?
29477 return DCI.CombineTo(N, Cond, SDValue());
29481 // Optimize cases that will turn into an LEA instruction. This requires
29482 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
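// Illustrative example (not from the original source): for "Cond ? 13 : 5"
// the difference is 8, an efficient multiplier, so this path emits
// setcc + zext, a multiply by 8 (typically folded into an LEA or shift),
// and an add of the base 5, instead of materializing both constants.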
29483 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29484 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
29485 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
29487 bool isFastMultiplier = false;
29489 switch ((unsigned char)Diff) {
29491 case 1: // result = add base, cond
29492 case 2: // result = lea base( , cond*2)
29493 case 3: // result = lea base(cond, cond*2)
29494 case 4: // result = lea base( , cond*4)
29495 case 5: // result = lea base(cond, cond*4)
29496 case 8: // result = lea base( , cond*8)
29497 case 9: // result = lea base(cond, cond*8)
29498 isFastMultiplier = true;
29503 if (isFastMultiplier) {
29504 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
29505 Cond = getSETCC(CC, Cond, DL, DAG);
29506 // Zero extend the condition if needed.
29507 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
29509 // Scale the condition by the difference.
29511 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29512 DAG.getConstant(Diff, DL, Cond.getValueType()));
29514 // Add the base if non-zero.
29515 if (FalseC->getAPIntValue() != 0)
29516 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29517 SDValue(FalseC, 0));
29518 if (N->getNumValues() == 2) // Dead flag value?
29519 return DCI.CombineTo(N, Cond, SDValue());
29526 // Handle these cases:
29527 // (select (x != c), e, c) -> select (x != c), e, x),
29528 // (select (x == c), c, e) -> select (x == c), x, e)
29529 // where the c is an integer constant, and the "select" is the combination
29530 // of CMOV and CMP.
29532 // The rationale for this change is that the conditional-move from a constant
29533 // needs two instructions, however, conditional-move from a register needs
29534 // only one instruction.
29536 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
29537 // some instruction-combining opportunities. This opt needs to be
29538 // postponed as late as possible.
29540 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
29541 // the DCI.xxxx conditions are provided to postpone the optimization as
29542 // late as possible.
29544 ConstantSDNode *CmpAgainst = nullptr;
29545 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
29546 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
29547 !isa<ConstantSDNode>(Cond.getOperand(0))) {
29549 if (CC == X86::COND_NE &&
29550 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
29551 CC = X86::GetOppositeBranchCondition(CC);
29552 std::swap(TrueOp, FalseOp);
29555 if (CC == X86::COND_E &&
29556 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
29557 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
29558 DAG.getConstant(CC, DL, MVT::i8), Cond };
29559 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29564 // Fold and/or of setcc's to double CMOV:
29565 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
29566 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
29568 // This combine lets us generate:
29569 // cmovcc1 (jcc1 if we don't have CMOV)
29575 // cmovne (jne if we don't have CMOV)
29576 // When we can't use the CMOV instruction, it might increase branch mispredicts.
29578 // When we can use CMOV, or when there is no mispredict, this improves
29579 // throughput and reduces register pressure.
29581 if (CC == X86::COND_NE) {
29583 X86::CondCode CC0, CC1;
29585 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
29587 std::swap(FalseOp, TrueOp);
29588 CC0 = X86::GetOppositeBranchCondition(CC0);
29589 CC1 = X86::GetOppositeBranchCondition(CC1);
29592 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
29594 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
29595 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
29596 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29597 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
29605 /// Different mul shrinking modes.
29606 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
29608 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
29609 EVT VT = N->getOperand(0).getValueType();
29610 if (VT.getScalarSizeInBits() != 32)
29613 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
29614 unsigned SignBits[2] = {1, 1};
29615 bool IsPositive[2] = {false, false};
29616 for (unsigned i = 0; i < 2; i++) {
29617 SDValue Opd = N->getOperand(i);
29619 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
29620 // compute signbits for it separately.
29621 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
29622 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
29624 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
29626 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
29631 IsPositive[i] = true;
29632 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
29633 // All the operands of BUILD_VECTOR need to be integer constants.
29634 // Find the smallest value range which all the operands belong to.
29636 IsPositive[i] = true;
29637 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
29638 if (SubOp.isUndef())
29640 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
29643 APInt IntVal = CN->getAPIntValue();
29644 if (IntVal.isNegative())
29645 IsPositive[i] = false;
29646 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
29649 SignBits[i] = DAG.ComputeNumSignBits(Opd);
29650 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
29651 IsPositive[i] = true;
29655 bool AllPositive = IsPositive[0] && IsPositive[1];
29656 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
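// Reasoning behind the thresholds below (illustrative, not from the original
// source): each lane is 32 bits wide, so >= 25 sign bits leaves at most 7
// magnitude bits plus a sign and the value survives an i8 sign-extension
// round trip; >= 24 sign bits on a known-nonnegative value fits an unsigned
// i8; the 17/16 thresholds are the analogous i16 bounds.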
29657 // When ranges are from -128 ~ 127, use MULS8 mode.
29658 if (MinSignBits >= 25)
29660 // When ranges are from 0 ~ 255, use MULU8 mode.
29661 else if (AllPositive && MinSignBits >= 24)
29663 // When ranges are from -32768 ~ 32767, use MULS16 mode.
29664 else if (MinSignBits >= 17)
29666 // When ranges are from 0 ~ 65535, use MULU16 mode.
29667 else if (AllPositive && MinSignBits >= 16)
29674 /// When the operands of a vector mul are extended from smaller-sized values,
29675 /// like i8 and i16, the type of the mul may be shrunk to generate more
29676 /// efficient code. Two typical patterns are handled:
29678 /// %2 = sext/zext <N x i8> %1 to <N x i32>
29679 /// %4 = sext/zext <N x i8> %3 to <N x i32>
29680 // or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29681 /// %5 = mul <N x i32> %2, %4
29684 /// %2 = zext/sext <N x i16> %1 to <N x i32>
29685 /// %4 = zext/sext <N x i16> %3 to <N x i32>
29686 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29687 /// %5 = mul <N x i32> %2, %4
29689 /// There are four mul shrinking modes:
29690 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
29691 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
29692 /// generate pmullw+sext32 for it (MULS8 mode).
29693 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
29694 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
29695 /// generate pmullw+zext32 for it (MULU8 mode).
29696 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
29697 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
29698 /// generate pmullw+pmulhw for it (MULS16 mode).
29699 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
29700 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
29701 /// generate pmullw+pmulhuw for it (MULU16 mode).
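/// A minimal sketch of the idea (illustrative, not from the original source):
/// for
///   %a = sext <8 x i8> %x to <8 x i32>
///   %b = sext <8 x i8> %y to <8 x i32>
///   %m = mul <8 x i32> %a, %b
/// every product fits in 16 signed bits, so %m can be computed as a single
/// pmullw on <8 x i16> operands followed by a sign-extension back to
/// <8 x i32> (MULS8 mode).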
29702 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
29703 const X86Subtarget &Subtarget) {
29704 // Check for legality
29705 // pmullw/pmulhw are not supported by SSE.
29706 if (!Subtarget.hasSSE2())
29709 // Check for profitability
29710 // pmulld is supported since SSE41. It is better to use pmulld
29711 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
29713 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
29714 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
29718 if (!canReduceVMulWidth(N, DAG, Mode))
29722 SDValue N0 = N->getOperand(0);
29723 SDValue N1 = N->getOperand(1);
29724 EVT VT = N->getOperand(0).getValueType();
29725 unsigned RegSize = 128;
29726 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
29728 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
29729 // Shrink the operands of mul.
29730 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
29731 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
29733 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
29734 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
29735 // lower part is needed.
29736 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
29737 if (Mode == MULU8 || Mode == MULS8) {
29738 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
29741 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29742 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
29743 // the higher part is also needed.
29744 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29745 ReducedVT, NewN0, NewN1);
29747 // Repack the lower part and higher part result of mul into a wider result.
29749 // Generate shuffle functioning as punpcklwd.
29750 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
29751 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29752 ShuffleMask[2 * i] = i;
29753 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
29756 SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29757 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
29758 // Generate shuffle functioning as punpckhwd.
29759 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29760 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
29761 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
29764 SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29765 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
29766 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
29769 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
29770 // to legalize the mul explicitly because implicit legalization for type
29771 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
29772 // instructions which will not exist when we explicitly legalize it by
29773 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
29774 // <4 x i16> undef).
29776 // Legalize the operands of mul.
29777 // FIXME: We may be able to handle non-concatenated vectors by insertion.
29778 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
29779 if ((RegSize % ReducedSizeInBits) != 0)
29782 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
29783 DAG.getUNDEF(ReducedVT));
29785 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29787 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29789 if (Mode == MULU8 || Mode == MULS8) {
29790 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
29792 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29794 // convert the type of mul result to VT.
29795 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29796 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
29797 : ISD::SIGN_EXTEND_VECTOR_INREG,
29799 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29800 DAG.getIntPtrConstant(0, DL));
29802 // Generate the lower (pmullw) and higher (pmulhw/pmulhuw) parts of the mul.
29803 // For MULU16/MULS16, both parts are needed.
29804 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29805 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29806 OpsVT, NewN0, NewN1);
29808 // Repack the lower part and higher part result of mul into a wider
29809 // result. Make sure the type of mul result is VT.
29810 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29811 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
29812 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
29813 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29814 DAG.getIntPtrConstant(0, DL));
29819 /// Optimize a single multiply with constant into two operations in order to
29820 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
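/// For example (illustrative, not from the original source), x*45 can be
/// decomposed as (x*9)*5, i.e. two LEAs, and x*40 as (x*5) << 3, i.e. an LEA
/// followed by a shift.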
29821 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
29822 TargetLowering::DAGCombinerInfo &DCI,
29823 const X86Subtarget &Subtarget) {
29824 EVT VT = N->getValueType(0);
29825 if (DCI.isBeforeLegalize() && VT.isVector())
29826 return reduceVMULWidth(N, DAG, Subtarget);
29828 // An imul is usually smaller than the alternative sequence.
29829 if (DAG.getMachineFunction().getFunction()->optForMinSize())
29832 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
29835 if (VT != MVT::i64 && VT != MVT::i32)
29838 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
29841 uint64_t MulAmt = C->getZExtValue();
29842 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
29845 uint64_t MulAmt1 = 0;
29846 uint64_t MulAmt2 = 0;
29847 if ((MulAmt % 9) == 0) {
29849 MulAmt2 = MulAmt / 9;
29850 } else if ((MulAmt % 5) == 0) {
29852 MulAmt2 = MulAmt / 5;
29853 } else if ((MulAmt % 3) == 0) {
29855 MulAmt2 = MulAmt / 3;
29861 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
29863 if (isPowerOf2_64(MulAmt2) &&
29864 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
29865 // If the second multiplier is a power of 2, issue it first. We want the multiply by
29866 // 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
29868 std::swap(MulAmt1, MulAmt2);
29870 if (isPowerOf2_64(MulAmt1))
29871 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
29872 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
29874 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
29875 DAG.getConstant(MulAmt1, DL, VT));
29877 if (isPowerOf2_64(MulAmt2))
29878 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
29879 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
29881 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
29882 DAG.getConstant(MulAmt2, DL, VT));
29886 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
29887 && "Both cases that could cause potential overflows should have "
29888 "already been handled.");
29889 if (isPowerOf2_64(MulAmt - 1))
29890 // (mul x, 2^N + 1) => (add (shl x, N), x)
29891 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
29892 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
29893 DAG.getConstant(Log2_64(MulAmt - 1), DL,
29896 else if (isPowerOf2_64(MulAmt + 1))
29897 // (mul x, 2^N - 1) => (sub (shl x, N), x)
29898 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
29900 DAG.getConstant(Log2_64(MulAmt + 1),
29901 DL, MVT::i8)), N->getOperand(0));
29905 // Do not add new nodes to DAG combiner worklist.
29906 DCI.CombineTo(N, NewMul, false);
29911 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
29912 SDValue N0 = N->getOperand(0);
29913 SDValue N1 = N->getOperand(1);
29914 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
29915 EVT VT = N0.getValueType();
29917 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
29918 // since the result of setcc_c is all zero's or all ones.
29919 if (VT.isInteger() && !VT.isVector() &&
29920 N1C && N0.getOpcode() == ISD::AND &&
29921 N0.getOperand(1).getOpcode() == ISD::Constant) {
29922 SDValue N00 = N0.getOperand(0);
29923 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
29924 const APInt &ShAmt = N1C->getAPIntValue();
29925 Mask = Mask.shl(ShAmt);
29926 bool MaskOK = false;
29927 // We can handle cases concerning bit-widening nodes containing setcc_c if
29928 // we carefully interrogate the mask to make sure we are semantics preserving.
29930 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
29931 // of the underlying setcc_c operation if the setcc_c was zero extended.
29932 // Consider the following example:
29933 // zext(setcc_c) -> i32 0x0000FFFF
29934 // c1 -> i32 0x0000FFFF
29935 // c2 -> i32 0x00000001
29936 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
29937 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
29938 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
29940 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
29941 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
29943 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
29944 N00.getOpcode() == ISD::ANY_EXTEND) &&
29945 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
29946 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
29948 if (MaskOK && Mask != 0) {
29950 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
29954 // Hardware support for vector shifts is sparse, which makes us scalarize the
29955 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL.
29957 // (shl V, 1) -> add V,V
29958 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
29959 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
29960 assert(N0.getValueType().isVector() && "Invalid vector shift type");
29961 // We shift all of the values by one. In many cases we do not have
29962 // hardware support for this operation. This is better expressed as an ADD of two values.
29964 if (N1SplatC->getAPIntValue() == 1)
29965 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
29971 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
29972 SDValue N0 = N->getOperand(0);
29973 SDValue N1 = N->getOperand(1);
29974 EVT VT = N0.getValueType();
29975 unsigned Size = VT.getSizeInBits();
29977 // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
29978 // into (shl (sext a), [56,48,32,24,16] - SarConst) or
29979 // into (lshr (sext a), SarConst - [56,48,32,24,16])
29980 // depending on the sign of (SarConst - [56,48,32,24,16])
29982 // sexts in X86 are MOVs. The MOVs have the same code size
29983 // as above SHIFTs (only SHIFT on 1 has lower code size).
29984 // However the MOVs have 2 advantages to a SHIFT:
29985 // 1. MOVs can write to a register that differs from source
29986 // 2. MOVs accept memory operands
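// Illustrative example (not from the original source), on i64:
//   (sra (shl x, 56), 60) -> (sra (sext_inreg x, i8), 4)
//   (sra (shl x, 56), 52) -> (shl (sext_inreg x, i8), 4)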
29988 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
29989 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
29990 N0.getOperand(1).getOpcode() != ISD::Constant)
29993 SDValue N00 = N0.getOperand(0);
29994 SDValue N01 = N0.getOperand(1);
29995 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
29996 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
29997 EVT CVT = N1.getValueType();
29999 if (SarConst.isNegative())
30002 for (MVT SVT : MVT::integer_valuetypes()) {
30003 unsigned ShiftSize = SVT.getSizeInBits();
30004 // skipping types without corresponding sext/zext and
30005 // ShlConst that is not one of [56,48,32,24,16]
30006 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
30010 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
30011 SarConst = SarConst - (Size - ShiftSize);
30014 else if (SarConst.isNegative())
30015 return DAG.getNode(ISD::SHL, DL, VT, NN,
30016 DAG.getConstant(-SarConst, DL, CVT));
30018 return DAG.getNode(ISD::SRA, DL, VT, NN,
30019 DAG.getConstant(SarConst, DL, CVT));
30024 /// \brief Returns a vector of 0s if the input node is a vector logical
30025 /// shift by a constant amount which is known to be bigger than or equal
30026 /// to the vector element size in bits.
30027 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
30028 const X86Subtarget &Subtarget) {
30029 EVT VT = N->getValueType(0);
30031 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
30032 (!Subtarget.hasInt256() ||
30033 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
30036 SDValue Amt = N->getOperand(1);
30038 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
30039 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
30040 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
30041 unsigned MaxAmount =
30042 VT.getSimpleVT().getScalarSizeInBits();
30044 // SSE2/AVX2 logical shifts always return a vector of 0s
30045 // if the shift amount is bigger than or equal to
30046 // the element size. The constant shift amount will be
30047 // encoded as an 8-bit immediate.
30048 if (ShiftAmt.trunc(8).uge(MaxAmount))
30049 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
30055 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
30056 TargetLowering::DAGCombinerInfo &DCI,
30057 const X86Subtarget &Subtarget) {
30058 if (N->getOpcode() == ISD::SHL)
30059 if (SDValue V = combineShiftLeft(N, DAG))
30062 if (N->getOpcode() == ISD::SRA)
30063 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
30066 // Try to fold this logical shift into a zero vector.
30067 if (N->getOpcode() != ISD::SRA)
30068 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
30074 static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
30075 TargetLowering::DAGCombinerInfo &DCI,
30076 const X86Subtarget &Subtarget) {
30077 assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
30078 "Unexpected opcode");
30079 EVT VT = N->getValueType(0);
30080 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
30082 // This fails for mask register (vXi1) shifts.
30083 if ((NumBitsPerElt % 8) != 0)
30086 // Out of range logical bit shifts are guaranteed to be zero.
30087 APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
30088 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
30089 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30091 // Shift N0 by zero -> N0.
30092 if (ShiftVal == 0)
30093 return N->getOperand(0);
30095 // Shift zero -> zero.
30096 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
30097 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30099 // We can decode 'whole byte' logical bit shifts as shuffles.
30100 if ((ShiftVal.getZExtValue() % 8) == 0) {
30102 SmallVector<int, 1> NonceMask; // Just a placeholder.
30103 NonceMask.push_back(0);
30104 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30105 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30107 return SDValue(); // This routine will use CombineTo to replace N.
30113 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
30114 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
30115 /// OR -> CMPNEQSS.
30116 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
30117 TargetLowering::DAGCombinerInfo &DCI,
30118 const X86Subtarget &Subtarget) {
30121 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
30122 // we're requiring SSE2 for both.
30123 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
30124 SDValue N0 = N->getOperand(0);
30125 SDValue N1 = N->getOperand(1);
30126 SDValue CMP0 = N0->getOperand(1);
30127 SDValue CMP1 = N1->getOperand(1);
30130 // The SETCCs should both refer to the same CMP.
30131 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
30134 SDValue CMP00 = CMP0->getOperand(0);
30135 SDValue CMP01 = CMP0->getOperand(1);
30136 EVT VT = CMP00.getValueType();
30138 if (VT == MVT::f32 || VT == MVT::f64) {
30139 bool ExpectingFlags = false;
30140 // Check for any users that want flags:
30141 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
30142 !ExpectingFlags && UI != UE; ++UI)
30143 switch (UI->getOpcode()) {
30148 ExpectingFlags = true;
30150 case ISD::CopyToReg:
30151 case ISD::SIGN_EXTEND:
30152 case ISD::ZERO_EXTEND:
30153 case ISD::ANY_EXTEND:
30157 if (!ExpectingFlags) {
30158 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
30159 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
30161 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
30162 X86::CondCode tmp = cc0;
30167 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
30168 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
30169 // FIXME: need symbolic constants for these magic numbers.
30170 // See X86ATTInstPrinter.cpp:printSSECC().
30171 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
30172 if (Subtarget.hasAVX512()) {
30173 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
30175 DAG.getConstant(x86cc, DL, MVT::i8));
30176 if (N->getValueType(0) != MVT::i1)
30177 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
30181 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
30182 CMP00.getValueType(), CMP00, CMP01,
30183 DAG.getConstant(x86cc, DL,
30186 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
30187 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
30189 if (is64BitFP && !Subtarget.is64Bit()) {
30190 // On a 32-bit target, we cannot bitcast the 64-bit float to a
30191 // 64-bit integer, since that's not a legal type. Since
30192 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
30193 // bits, but can do this little dance to extract the lowest 32 bits
30194 // and work with those going forward.
30195 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
30197 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
30198 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
30199 Vector32, DAG.getIntPtrConstant(0, DL));
30203 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
30204 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
30205 DAG.getConstant(1, DL, IntVT));
30206 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
30208 return OneBitOfTruth;
30216 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
30217 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
30218 assert(N->getOpcode() == ISD::AND);
30220 EVT VT = N->getValueType(0);
30221 SDValue N0 = N->getOperand(0);
30222 SDValue N1 = N->getOperand(1);
30225 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
30228 // Canonicalize XOR to the left.
30229 if (N1.getOpcode() == ISD::XOR)
30232 if (N0.getOpcode() != ISD::XOR)
30235 SDValue N00 = N0->getOperand(0);
30236 SDValue N01 = N0->getOperand(1);
30238 N01 = peekThroughBitcasts(N01);
30240 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
30241 // insert_subvector building a 256-bit AllOnes vector.
30242 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
30243 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
30246 SDValue V1 = N01->getOperand(0);
30247 SDValue V2 = N01->getOperand(1);
30248 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
30249 !V1.getOperand(0).isUndef() ||
30250 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
30251 !ISD::isBuildVectorAllOnes(V2.getNode()))
30254 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
30257 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
30258 // register. In most cases we actually compare or select YMM-sized registers
30259 // and mixing the two types creates horrible code. This method optimizes
30260 // some of the transition sequences.
30261 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30262 TargetLowering::DAGCombinerInfo &DCI,
30263 const X86Subtarget &Subtarget) {
30264 EVT VT = N->getValueType(0);
30265 if (!VT.is256BitVector())
30268 assert((N->getOpcode() == ISD::ANY_EXTEND ||
30269 N->getOpcode() == ISD::ZERO_EXTEND ||
30270 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
30272 SDValue Narrow = N->getOperand(0);
30273 EVT NarrowVT = Narrow->getValueType(0);
30274 if (!NarrowVT.is128BitVector())
30277 if (Narrow->getOpcode() != ISD::XOR &&
30278 Narrow->getOpcode() != ISD::AND &&
30279 Narrow->getOpcode() != ISD::OR)
30282 SDValue N0 = Narrow->getOperand(0);
30283 SDValue N1 = Narrow->getOperand(1);
30286 // The Left side has to be a trunc.
30287 if (N0.getOpcode() != ISD::TRUNCATE)
30290 // The type of the truncated inputs.
30291 EVT WideVT = N0->getOperand(0)->getValueType(0);
30295 // The right side has to be a 'trunc' or a constant vector.
30296 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
30297 ConstantSDNode *RHSConstSplat = nullptr;
30298 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
30299 RHSConstSplat = RHSBV->getConstantSplatNode();
30300 if (!RHSTrunc && !RHSConstSplat)
30303 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30305 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
30308 // Set N0 and N1 to hold the inputs to the new wide operation.
30309 N0 = N0->getOperand(0);
30310 if (RHSConstSplat) {
30311 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
30312 SDValue(RHSConstSplat, 0));
30313 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
30314 } else if (RHSTrunc) {
30315 N1 = N1->getOperand(0);
30318 // Generate the wide operation.
30319 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
30320 unsigned Opcode = N->getOpcode();
30322 case ISD::ANY_EXTEND:
30324 case ISD::ZERO_EXTEND: {
30325 unsigned InBits = NarrowVT.getScalarSizeInBits();
30326 APInt Mask = APInt::getAllOnesValue(InBits);
30327 Mask = Mask.zext(VT.getScalarSizeInBits());
30328 return DAG.getNode(ISD::AND, DL, VT,
30329 Op, DAG.getConstant(Mask, DL, VT));
30331 case ISD::SIGN_EXTEND:
30332 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
30333 Op, DAG.getValueType(NarrowVT));
30335 llvm_unreachable("Unexpected opcode");
30339 /// If both input operands of a logic op are being cast from floating point
30340 /// types, try to convert this into a floating point logic node to avoid
30341 /// unnecessary moves from SSE to integer registers.
30342 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
30343 const X86Subtarget &Subtarget) {
30344 unsigned FPOpcode = ISD::DELETED_NODE;
30345 if (N->getOpcode() == ISD::AND)
30346 FPOpcode = X86ISD::FAND;
30347 else if (N->getOpcode() == ISD::OR)
30348 FPOpcode = X86ISD::FOR;
30349 else if (N->getOpcode() == ISD::XOR)
30350 FPOpcode = X86ISD::FXOR;
30352 assert(FPOpcode != ISD::DELETED_NODE &&
30353 "Unexpected input node for FP logic conversion");
30355 EVT VT = N->getValueType(0);
30356 SDValue N0 = N->getOperand(0);
30357 SDValue N1 = N->getOperand(1);
30359 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
30360 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
30361 (Subtarget.hasSSE2() && VT == MVT::i64))) {
30362 SDValue N00 = N0.getOperand(0);
30363 SDValue N10 = N1.getOperand(0);
30364 EVT N00Type = N00.getValueType();
30365 EVT N10Type = N10.getValueType();
30366 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
30367 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
30368 return DAG.getBitcast(VT, FPLogic);
30374 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
30375 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
30376 /// eliminate loading the vector constant mask value. This relies on the fact
30377 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
30378 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
30379 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
30380 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
30382 // TODO: Use AssertSext to mark any nodes that have the property of producing
30383 // all-ones or all-zeros. Then check for that node rather than particular opcodes.
30385 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
30388 // The existence of the PCMP node guarantees that we have the required SSE2 or
30389 // AVX2 for a shift of this vector type, but there is no vector shift by
30390 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
30391 // masked compare nodes, so they should not make it here.
30392 EVT VT0 = Op0.getValueType();
30393 EVT VT1 = Op1.getValueType();
30394 unsigned EltBitWidth = VT0.getScalarSizeInBits();
30395 if (VT0 != VT1 || EltBitWidth == 8)
30398 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
30400 APInt SplatVal;
30401 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
30405 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
30406 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
30407 return DAG.getBitcast(N->getValueType(0), Shift);
30410 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
30411 TargetLowering::DAGCombinerInfo &DCI,
30412 const X86Subtarget &Subtarget) {
30413 if (DCI.isBeforeLegalizeOps())
30416 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30419 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30422 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
30425 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
30428 EVT VT = N->getValueType(0);
30429 SDValue N0 = N->getOperand(0);
30430 SDValue N1 = N->getOperand(1);
30433 // Attempt to recursively combine a bitmask AND with shuffles.
30434 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
30436 SmallVector<int, 1> NonceMask; // Just a placeholder.
30437 NonceMask.push_back(0);
30438 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30439 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30441 return SDValue(); // This routine will use CombineTo to replace N.
30444 // Create BEXTR instructions
30445 // BEXTR is ((X >> imm) & (2**size-1))
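// Illustrative example (not from the original source):
//   (and (srl x, 8), 0xFFF) -> BEXTR x, (8 | (12 << 8))
// i.e. extract 12 bits starting at bit 8; the control operand packs the
// start bit in the low byte and the bit length in the next byte.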
30446 if (VT != MVT::i32 && VT != MVT::i64)
30449 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
30451 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
30454 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
30455 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
30456 if (MaskNode && ShiftNode) {
30457 uint64_t Mask = MaskNode->getZExtValue();
30458 uint64_t Shift = ShiftNode->getZExtValue();
30459 if (isMask_64(Mask)) {
30460 uint64_t MaskSize = countPopulation(Mask);
30461 if (Shift + MaskSize <= VT.getSizeInBits())
30462 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
30463 DAG.getConstant(Shift | (MaskSize << 8), DL,
30470 // Try to fold:
30471 //   (or (and (m, y), (pandn m, x)))
30472 // into:
30473 //   (vselect m, x, y)
30474 // As a special case, try to fold:
30475 //   (or (and (m, (sub 0, x)), (pandn m, x)))
30476 // into:
30477 //   (sub (xor X, M), M)
30478 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
30479 const X86Subtarget &Subtarget) {
30480 assert(N->getOpcode() == ISD::OR);
30482 SDValue N0 = N->getOperand(0);
30483 SDValue N1 = N->getOperand(1);
30484 EVT VT = N->getValueType(0);
30486 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
30488 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
30490 // Canonicalize pandn to RHS
30491 if (N0.getOpcode() == X86ISD::ANDNP)
30494 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
30497 SDValue Mask = N1.getOperand(0);
30498 SDValue X = N1.getOperand(1);
30500 if (N0.getOperand(0) == Mask)
30501 Y = N0.getOperand(1);
30502 if (N0.getOperand(1) == Mask)
30503 Y = N0.getOperand(0);
30505 // Check to see if the mask appeared in both the AND and ANDNP.
30509 // Validate that X, Y, and Mask are bitcasts, and see through them.
30510 Mask = peekThroughBitcasts(Mask);
30511 X = peekThroughBitcasts(X);
30512 Y = peekThroughBitcasts(Y);
30514 EVT MaskVT = Mask.getValueType();
30516 // Validate that the Mask operand is a vector sra node.
30517 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
30518 // there is no psrai.b
30519 unsigned EltBits = MaskVT.getScalarSizeInBits();
30520 unsigned SraAmt = ~0;
30521 if (Mask.getOpcode() == ISD::SRA) {
30522 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
30523 if (auto *AmtConst = AmtBV->getConstantSplatNode())
30524 SraAmt = AmtConst->getZExtValue();
30525 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
30526 SDValue SraC = Mask.getOperand(1);
30527 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
30529 if ((SraAmt + 1) != EltBits)
30535 // (or (and (M, (sub 0, X)), (pandn M, X)))
30536 // which is a special case of vselect:
30537 // (vselect M, (sub 0, X), X)
30539 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
30540 // We know that, if fNegate is 0 or 1:
30541 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
30543 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
30544 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
30545 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
30546 // This lets us transform our vselect to:
30547 // (add (xor X, M), (and M, 1))
30549 // (sub (xor X, M), M)
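// Sanity check of the identity (illustrative, not from the original source):
// if M is all ones, (xor X, M) == ~X and subtracting M (== -1) yields
// ~X + 1 == -X; if M is all zeros, both the xor and the sub are no-ops and
// X is returned unchanged.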
30550 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
30551 auto IsNegV = [](SDNode *N, SDValue V) {
30552 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
30553 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
30556 if (IsNegV(Y.getNode(), X))
30558 else if (IsNegV(X.getNode(), Y))
30562 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
30563 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
30564 SDValue SubOp2 = Mask;
30566 // If the negate was on the false side of the select, then
30567 // the operands of the SUB need to be swapped. PR 27251.
30568 // This is because the pattern being matched above is
30569 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
30570 // but if the pattern matched was
30571 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
30572 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
30573 // pattern also needs to be a negation of the replacement pattern above.
30574 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
30575 // sub accomplishes the negation of the replacement pattern.
30577 std::swap(SubOp1, SubOp2);
30579 return DAG.getBitcast(VT,
30580 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
30584 // PBLENDVB is only available on SSE 4.1.
30585 if (!Subtarget.hasSSE41())
30588 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
30590 X = DAG.getBitcast(BlendVT, X);
30591 Y = DAG.getBitcast(BlendVT, Y);
30592 Mask = DAG.getBitcast(BlendVT, Mask);
30593 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
30594 return DAG.getBitcast(VT, Mask);
30597 // Helper function for combineOrCmpEqZeroToCtlzSrl
30601 // Transforms seteq(cmp x, 0) into srl(ctlz x), log2(bitsize(x)).
30602 // Input pattern is checked by caller.
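// Illustrative example (not from the original source): for a 32-bit x,
// ctlz(x) == 32 exactly when x == 0, so "x == 0" can be computed as
// (ctlz x) >> 5; the shift isolates the bit that is set only for the
// value 32.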
30603 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
30604 SelectionDAG &DAG) {
30605 SDValue Cmp = Op.getOperand(1);
30606 EVT VT = Cmp.getOperand(0).getValueType();
30607 unsigned Log2b = Log2_32(VT.getSizeInBits());
30609 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
30610 // The result of the shift is true or false, and on X86, the 32-bit
30611 // encoding of shr and lzcnt is more desirable.
30612 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
30613 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
30614 DAG.getConstant(Log2b, dl, VT));
30615 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
30618 // Try to transform:
30619 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
30621 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
30622 // Will also attempt to match more generic cases, eg:
30623 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
30624 // Only applies if the target supports the FastLZCNT feature.
30625 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
30626 TargetLowering::DAGCombinerInfo &DCI,
30627 const X86Subtarget &Subtarget) {
30628 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
30631 auto isORCandidate = [](SDValue N) {
30632 return (N->getOpcode() == ISD::OR && N->hasOneUse());
30635 // Check the zero extend is extending to 32-bit or more. The code generated by
30636 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
30637 // instructions to clear the upper bits.
30638 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
30639 !isORCandidate(N->getOperand(0)))
30642 // Check the node matches: setcc(eq, cmp 0)
30643 auto isSetCCCandidate = [](SDValue N) {
30644 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
30645 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
30646 N->getOperand(1).getOpcode() == X86ISD::CMP &&
30647 N->getOperand(1).getConstantOperandVal(1) == 0 &&
30648 N->getOperand(1).getValueType().bitsGE(MVT::i32);
30651 SDNode *OR = N->getOperand(0).getNode();
30652 SDValue LHS = OR->getOperand(0);
30653 SDValue RHS = OR->getOperand(1);
30655 // Save nodes matching or(or, setcc(eq, cmp 0)).
30656 SmallVector<SDNode *, 2> ORNodes;
30657 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
30658 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
30659 ORNodes.push_back(OR);
30660 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
30661 LHS = OR->getOperand(0);
30662 RHS = OR->getOperand(1);
30665 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
30666 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
30667 !isORCandidate(SDValue(OR, 0)))
30670 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern; try to lower it
30672 // into or(srl(ctlz), srl(ctlz)).
30673 // The dag combiner can then fold it into:
30674 // srl(or(ctlz, ctlz)).
30675 EVT VT = OR->getValueType(0);
30676 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
30677 SDValue Ret, NewRHS;
30678 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
30679 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
30684 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
30685 while (ORNodes.size() > 0) {
30686 OR = ORNodes.pop_back_val();
30687 LHS = OR->getOperand(0);
30688 RHS = OR->getOperand(1);
30689 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
30690 if (RHS->getOpcode() == ISD::OR)
30691 std::swap(LHS, RHS);
30692 EVT VT = OR->getValueType(0);
30693 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
30696 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
30700 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
30705 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
30706 TargetLowering::DAGCombinerInfo &DCI,
30707 const X86Subtarget &Subtarget) {
30708 if (DCI.isBeforeLegalizeOps())
30711 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30714 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30717 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
30720 SDValue N0 = N->getOperand(0);
30721 SDValue N1 = N->getOperand(1);
30722 EVT VT = N->getValueType(0);
30724 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
30727 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
30728 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
30730 // SHLD/SHRD instructions have lower register pressure, but on some
30731 // platforms they have higher latency than the equivalent
30732 // series of shifts/or that would otherwise be generated.
30733 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
30734 // have higher latencies and we are not optimizing for size.
30735 if (!OptForSize && Subtarget.isSHLDSlow())
30738 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
30740 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
30742 if (!N0.hasOneUse() || !N1.hasOneUse())
30745 SDValue ShAmt0 = N0.getOperand(1);
30746 if (ShAmt0.getValueType() != MVT::i8)
30748 SDValue ShAmt1 = N1.getOperand(1);
30749 if (ShAmt1.getValueType() != MVT::i8)
30751 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
30752 ShAmt0 = ShAmt0.getOperand(0);
30753 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
30754 ShAmt1 = ShAmt1.getOperand(0);
30757 unsigned Opc = X86ISD::SHLD;
30758 SDValue Op0 = N0.getOperand(0);
30759 SDValue Op1 = N1.getOperand(0);
30760 if (ShAmt0.getOpcode() == ISD::SUB ||
30761 ShAmt0.getOpcode() == ISD::XOR) {
30762 Opc = X86ISD::SHRD;
30763 std::swap(Op0, Op1);
30764 std::swap(ShAmt0, ShAmt1);
30767 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
30768 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
30769 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
30770 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
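// Note: the XOR forms arise because (Bits - C) == (C ^ (Bits - 1)) + 1 for
// 0 < C < Bits, so a shift by (Bits - C) can be canonicalized into a shift
// by 1 followed by a shift by XOR( C, Bits - 1 ).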
30771 unsigned Bits = VT.getSizeInBits();
30772 if (ShAmt1.getOpcode() == ISD::SUB) {
30773 SDValue Sum = ShAmt1.getOperand(0);
30774 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
30775 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
30776 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
30777 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
30778 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
30779 return DAG.getNode(Opc, DL, VT,
30781 DAG.getNode(ISD::TRUNCATE, DL,
30784 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
30785 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
30786 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
30787 return DAG.getNode(Opc, DL, VT,
30788 N0.getOperand(0), N1.getOperand(0),
30789 DAG.getNode(ISD::TRUNCATE, DL,
30791 } else if (ShAmt1.getOpcode() == ISD::XOR) {
30792 SDValue Mask = ShAmt1.getOperand(1);
30793 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
30794 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
30795 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
30796 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
30797 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
30798 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
30799 if (Op1.getOpcode() == InnerShift &&
30800 isa<ConstantSDNode>(Op1.getOperand(1)) &&
30801 Op1.getConstantOperandVal(1) == 1) {
30802 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30803 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30805 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
30806 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
30807 Op1.getOperand(0) == Op1.getOperand(1)) {
30808 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30809 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30818 /// Generate NEG and CMOV for integer abs.
30819 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
30820 EVT VT = N->getValueType(0);
30822 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30823 // 8-bit integer abs to NEG and CMOV.
30824 if (VT.isInteger() && VT.getSizeInBits() == 8)
30827 SDValue N0 = N->getOperand(0);
30828 SDValue N1 = N->getOperand(1);
30831 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
30832 // and change it to SUB and CMOV.
30833 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
30834 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
30835 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
30836 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
30837 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
30838 // Generate SUB & CMOV.
30839 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30840 DAG.getConstant(0, DL, VT), N0.getOperand(0));
30841 SDValue Ops[] = {N0.getOperand(0), Neg,
30842 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
30843 SDValue(Neg.getNode(), 1)};
30844 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
30850 /// Try to turn tests against the signbit in the form of:
30851 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
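/// into:
///   SETGT(X, -1)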
30854 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
30855 // This is only worth doing if the output type is i8 or i1.
30856 EVT ResultType = N->getValueType(0);
30857 if (ResultType != MVT::i8 && ResultType != MVT::i1)
30860 SDValue N0 = N->getOperand(0);
30861 SDValue N1 = N->getOperand(1);
30863 // We should be performing an xor against a truncated shift.
30864 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
30867 // Make sure we are performing an xor against one.
30868 if (!isOneConstant(N1))
30871 // SetCC on x86 zero extends so only act on this if it's a logical shift.
30872 SDValue Shift = N0.getOperand(0);
30873 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
30876 // Make sure we are truncating from one of i16, i32 or i64.
30877 EVT ShiftTy = Shift.getValueType();
30878 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
30881 // Make sure the shift amount extracts the sign bit.
30882 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
30883 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
30886 // Create a greater-than comparison against -1.
30887 // N.B. Using SETGE against 0 works but we want a canonical-looking
30888 // comparison; using SETGT matches up with what TranslateX86CC expects.
30890 SDValue ShiftOp = Shift.getOperand(0);
30891 EVT ShiftOpTy = ShiftOp.getValueType();
30892 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30893 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
30894 *DAG.getContext(), ResultType);
30895 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
30896 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
30897 if (SetCCResultType != ResultType)
30898 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
30902 /// Turn vector tests of the signbit in the form of:
30903 /// xor (sra X, elt_size(X)-1), -1
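/// into:
///   pcmpgt X, -1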
30907 /// This should be called before type legalization because the pattern may not
30908 /// persist after that.
30909 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
30910 const X86Subtarget &Subtarget) {
30911 EVT VT = N->getValueType(0);
30912 if (!VT.isSimple())
30915 switch (VT.getSimpleVT().SimpleTy) {
30916 default: return SDValue();
30919 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
30920 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
30924 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
30927 // There must be a shift right algebraic before the xor, and the xor must be a
30928 // 'not' operation.
30929 SDValue Shift = N->getOperand(0);
30930 SDValue Ones = N->getOperand(1);
30931 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
30932 !ISD::isBuildVectorAllOnes(Ones.getNode()))
30935 // The shift should be smearing the sign bit across each vector element.
30936 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
30940 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
30941 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
30942 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
30945 // Create a greater-than comparison against -1. We don't use the more obvious
30946 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
30947 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
30950 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
30951 /// is valid for the given \p Subtarget.
30953 isSATValidOnSubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
30954 if (!Subtarget.hasAVX512())
30956 EVT SrcElVT = SrcVT.getScalarType();
30957 EVT DstElVT = DstVT.getScalarType();
30958 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
30960 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
30962 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
30963 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
30967 /// Detect a pattern of truncation with saturation:
30968 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
30969 /// Return the source value to be truncated, or SDValue() if the pattern was not
30970 /// matched or is unsupported on the current target.
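/// For example, (truncate (umin x, 255) to vXi8) saturates x into the unsigned
/// i8 range, so x is returned as the source value to truncate.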
30972 detectUSatPattern(SDValue In, EVT VT, const X86Subtarget &Subtarget) {
30973 if (In.getOpcode() != ISD::UMIN)
30976 EVT InVT = In.getValueType();
30977 // FIXME: Scalar type may be supported if we move it to vector register.
30978 if (!InVT.isVector() || !InVT.isSimple())
30981 if (!isSATValidOnSubtarget(InVT, VT, Subtarget))
30984 // Saturation with truncation. We truncate from InVT to VT.
30985 assert(InVT.getScalarSizeInBits() > VT.getScalarSizeInBits() &&
30986 "Unexpected types for truncate operation");
30990 if (ISD::isConstantSplatVector(In.getOperand(0).getNode(), C))
30991 SrcVal = In.getOperand(1);
30992 else if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C))
30993 SrcVal = In.getOperand(0);
30997 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
30998 // the element size of the destination type.
30999 return (C == ((uint64_t)1 << VT.getScalarSizeInBits()) - 1) ?
31000 SrcVal : SDValue();
31003 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
31004 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
31005 /// X86ISD::AVG instruction.
31006 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
31007 const X86Subtarget &Subtarget,
31009 if (!VT.isVector() || !VT.isSimple())
31011 EVT InVT = In.getValueType();
31012 unsigned NumElems = VT.getVectorNumElements();
31014 EVT ScalarVT = VT.getVectorElementType();
31015 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
31016 isPowerOf2_32(NumElems)))
31019 // InScalarVT is the intermediate type in the AVG pattern and should be wider
31020 // than the original input type (i8/i16).
31021 EVT InScalarVT = InVT.getVectorElementType();
31022 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
31025 if (!Subtarget.hasSSE2())
31027 if (Subtarget.hasBWI()) {
31028 if (VT.getSizeInBits() > 512)
31030 } else if (Subtarget.hasAVX2()) {
31031 if (VT.getSizeInBits() > 256)
31034 if (VT.getSizeInBits() > 128)
31038 // Detect the following pattern:
31040 // %1 = zext <N x i8> %a to <N x i32>
31041 // %2 = zext <N x i8> %b to <N x i32>
31042 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
31043 // %4 = add nuw nsw <N x i32> %3, %2
31044 // %5 = lshr <N x i32> %4, <i32 1 x N>
31045 // %6 = trunc <N x i32> %5 to <N x i8>
31047 // In AVX512, the last instruction can also be a trunc store.
31049 if (In.getOpcode() != ISD::SRL)
31052 // A lambda checking the given SDValue is a constant vector and each element
31053 // is in the range [Min, Max].
31054 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
31055 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
31056 if (!BV || !BV->isConstant())
31058 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
31059 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
31062 uint64_t Val = C->getZExtValue();
31063 if (Val < Min || Val > Max)
31069 // Check if each element of the vector is right-shifted by one.
31070 auto LHS = In.getOperand(0);
31071 auto RHS = In.getOperand(1);
31072 if (!IsConstVectorInRange(RHS, 1, 1))
31074 if (LHS.getOpcode() != ISD::ADD)
31077 // Detect a pattern of a + b + 1 where the order doesn't matter.
31078 SDValue Operands[3];
31079 Operands[0] = LHS.getOperand(0);
31080 Operands[1] = LHS.getOperand(1);
31082 // Take care of the case when one of the operands is a constant vector whose
31083 // element is in the range [1, 256].
31084 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
31085 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
31086 Operands[0].getOperand(0).getValueType() == VT) {
31087 // The pattern is detected. Subtract one from the constant vector, then
31088 // demote it and emit X86ISD::AVG instruction.
31089 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
31090 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
31091 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
31092 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31096 if (Operands[0].getOpcode() == ISD::ADD)
31097 std::swap(Operands[0], Operands[1]);
31098 else if (Operands[1].getOpcode() != ISD::ADD)
31100 Operands[2] = Operands[1].getOperand(0);
31101 Operands[1] = Operands[1].getOperand(1);
31103 // Now we have three operands of two additions. Check that one of them is a
31104 // constant vector with ones, and the other two are promoted from i8/i16.
31105 for (int i = 0; i < 3; ++i) {
31106 if (!IsConstVectorInRange(Operands[i], 1, 1))
31108 std::swap(Operands[i], Operands[2]);
31110 // Check if Operands[0] and Operands[1] are results of type promotion.
31111 for (int j = 0; j < 2; ++j)
31112 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
31113 Operands[j].getOperand(0).getValueType() != VT)
31116 // The pattern is detected, emit X86ISD::AVG instruction.
31117 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31118 Operands[1].getOperand(0));
31124 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
31125 TargetLowering::DAGCombinerInfo &DCI,
31126 const X86Subtarget &Subtarget) {
31127 LoadSDNode *Ld = cast<LoadSDNode>(N);
31128 EVT RegVT = Ld->getValueType(0);
31129 EVT MemVT = Ld->getMemoryVT();
31131 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31133 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
31134 // into two 16-byte operations.
31135 ISD::LoadExtType Ext = Ld->getExtensionType();
31137 unsigned AddressSpace = Ld->getAddressSpace();
31138 unsigned Alignment = Ld->getAlignment();
31139 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
31140 Ext == ISD::NON_EXTLOAD &&
31141 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
31142 AddressSpace, Alignment, &Fast) && !Fast) {
31143 unsigned NumElems = RegVT.getVectorNumElements();
31147 SDValue Ptr = Ld->getBasePtr();
31149 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
31152 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31153 Alignment, Ld->getMemOperand()->getFlags());
31155 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
31157 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31158 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
31159 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31161 Load2.getValue(1));
31163 SDValue NewVec = DAG.getUNDEF(RegVT);
31164 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
31165 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
31166 return DCI.CombineTo(N, NewVec, TF, true);
31172 /// If V is a build vector of boolean constants and exactly one of those
31173 /// constants is true, return the operand index of that true element.
31174 /// Otherwise, return -1.
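/// For example, <i1 0, i1 0, i1 1, i1 0> returns 2.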
31175 static int getOneTrueElt(SDValue V) {
31176 // This needs to be a build vector of booleans.
31177 // TODO: Checking for the i1 type matches the IR definition for the mask,
31178 // but the mask check could be loosened to i8 or other types. That might
31179 // also require checking more than 'allOnesValue'; e.g., the x86 HW
31180 // instructions only require that the MSB is set for each mask element.
31181 // The ISD::MSTORE comments/definition do not specify how the mask operand
31183 auto *BV = dyn_cast<BuildVectorSDNode>(V);
31184 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
31187 int TrueIndex = -1;
31188 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
31189 for (unsigned i = 0; i < NumElts; ++i) {
31190 const SDValue &Op = BV->getOperand(i);
31193 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
31196 if (ConstNode->getAPIntValue().isAllOnesValue()) {
31197 // If we already found a one, this is too many.
31198 if (TrueIndex >= 0)
31206 /// Given a masked memory load/store operation, return true if it has one mask
31207 /// bit set. If it has one mask bit set, then also return the memory address of
31208 /// the scalar element to load/store, the vector index to insert/extract that
31209 /// scalar element, and the alignment for the scalar memory access.
31210 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
31211 SelectionDAG &DAG, SDValue &Addr,
31212 SDValue &Index, unsigned &Alignment) {
31213 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
31214 if (TrueMaskElt < 0)
31217 // Get the address of the one scalar element that is specified by the mask
31218 // using the appropriate offset from the base pointer.
31219 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
31220 Addr = MaskedOp->getBasePtr();
31221 if (TrueMaskElt != 0) {
31222 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
31223 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
31226 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
31227 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
31231 /// If exactly one element of the mask is set for a non-extending masked load,
31232 /// it is a scalar load and vector insert.
31233 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31234 /// mask have already been optimized in IR, so we don't bother with those here.
31236 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31237 TargetLowering::DAGCombinerInfo &DCI) {
31238 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31239 // However, some target hooks may need to be added to know when the transform
31240 // is profitable. Endianness would also have to be considered.
31242 SDValue Addr, VecIndex;
31243 unsigned Alignment;
31244 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
31247 // Load the one scalar element that is specified by the mask using the
31248 // appropriate offset from the base pointer.
31250 EVT VT = ML->getValueType(0);
31251 EVT EltVT = VT.getVectorElementType();
31253 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
31254 Alignment, ML->getMemOperand()->getFlags());
31256 // Insert the loaded element into the appropriate place in the vector.
31257 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
31259 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
31263 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31264 TargetLowering::DAGCombinerInfo &DCI) {
31265 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
31269 EVT VT = ML->getValueType(0);
31271 // If we are loading the first and last elements of a vector, it is safe and
31272 // always faster to load the whole vector. Replace the masked load with a
31273 // vector load and select.
31274 unsigned NumElts = VT.getVectorNumElements();
31275 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
31276 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
31277 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
31278 if (LoadFirstElt && LoadLastElt) {
31279 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31280 ML->getMemOperand());
31281 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
31282 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
31285 // Convert a masked load with a constant mask into a masked load and a select.
31286 // This allows the select operation to use a faster kind of select instruction
31287 // (for example, vblendvps -> vblendps).
31289 // Don't try this if the pass-through operand is already undefined. That would
31290 // cause an infinite loop because that's what we're about to create.
31291 if (ML->getSrc0().isUndef())
31294 // The new masked load has an undef pass-through operand. The select uses the
31295 // original pass-through operand.
31296 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31297 ML->getMask(), DAG.getUNDEF(VT),
31298 ML->getMemoryVT(), ML->getMemOperand(),
31299 ML->getExtensionType());
31300 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
31302 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
31305 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
31306 TargetLowering::DAGCombinerInfo &DCI,
31307 const X86Subtarget &Subtarget) {
31308 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
31310 // TODO: Expanding load with constant mask may be optimized as well.
31311 if (Mld->isExpandingLoad())
31314 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
31315 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
31317 // TODO: Do some AVX512 subsets benefit from this transform?
31318 if (!Subtarget.hasAVX512())
31319 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
31323 if (Mld->getExtensionType() != ISD::SEXTLOAD)
31326 // Resolve extending loads.
31327 EVT VT = Mld->getValueType(0);
31328 unsigned NumElems = VT.getVectorNumElements();
31329 EVT LdVT = Mld->getMemoryVT();
31332 assert(LdVT != VT && "Cannot extend to the same type");
31333 unsigned ToSz = VT.getScalarSizeInBits();
31334 unsigned FromSz = LdVT.getScalarSizeInBits();
31335 // From/To sizes and ElemCount must be pow of two.
31336 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31337 "Unexpected size for extending masked load");
31339 unsigned SizeRatio = ToSz / FromSz;
31340 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
31342 // Create a type on which we perform the shuffle.
31343 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31344 LdVT.getScalarType(), NumElems*SizeRatio);
31345 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31347 // Convert Src0 value.
31348 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
31349 if (!Mld->getSrc0().isUndef()) {
31350 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31351 for (unsigned i = 0; i != NumElems; ++i)
31352 ShuffleVec[i] = i * SizeRatio;
31354 // Can't shuffle using an illegal type.
31355 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31356 "WideVecVT should be legal");
31357 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
31358 DAG.getUNDEF(WideVecVT), ShuffleVec);
31360 // Prepare the new mask.
31362 SDValue Mask = Mld->getMask();
31363 if (Mask.getValueType() == VT) {
31364 // Mask and original value have the same type.
31365 NewMask = DAG.getBitcast(WideVecVT, Mask);
31366 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31367 for (unsigned i = 0; i != NumElems; ++i)
31368 ShuffleVec[i] = i * SizeRatio;
31369 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
31370 ShuffleVec[i] = NumElems * SizeRatio;
31371 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31372 DAG.getConstant(0, dl, WideVecVT),
31375 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31376 unsigned WidenNumElts = NumElems*SizeRatio;
31377 unsigned MaskNumElts = VT.getVectorNumElements();
31378 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31381 unsigned NumConcat = WidenNumElts / MaskNumElts;
31382 SmallVector<SDValue, 16> Ops(NumConcat);
31383 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31385 for (unsigned i = 1; i != NumConcat; ++i)
31388 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31391 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
31392 Mld->getBasePtr(), NewMask, WideSrc0,
31393 Mld->getMemoryVT(), Mld->getMemOperand(),
31395 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
31396 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
31399 /// If exactly one element of the mask is set for a non-truncating masked store,
31400 /// it is a vector extract and scalar store.
31401 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31402 /// mask have already been optimized in IR, so we don't bother with those here.
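/// For example, a v4i32 masked store with mask <i1 0, i1 1, i1 0, i1 0> becomes
/// an extract of element 1 followed by a scalar i32 store at offset 4.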
31403 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
31404 SelectionDAG &DAG) {
31405 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31406 // However, some target hooks may need to be added to know when the transform
31407 // is profitable. Endianness would also have to be considered.
31409 SDValue Addr, VecIndex;
31410 unsigned Alignment;
31411 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
31414 // Extract the one scalar element that is actually being stored.
31416 EVT VT = MS->getValue().getValueType();
31417 EVT EltVT = VT.getVectorElementType();
31418 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
31419 MS->getValue(), VecIndex);
31421 // Store that element at the appropriate offset from the base pointer.
31422 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
31423 Alignment, MS->getMemOperand()->getFlags());
31426 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
31427 const X86Subtarget &Subtarget) {
31428 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
31430 if (Mst->isCompressingStore())
31433 if (!Mst->isTruncatingStore())
31434 return reduceMaskedStoreToScalarStore(Mst, DAG);
31436 // Resolve truncating stores.
31437 EVT VT = Mst->getValue().getValueType();
31438 unsigned NumElems = VT.getVectorNumElements();
31439 EVT StVT = Mst->getMemoryVT();
31442 assert(StVT != VT && "Cannot truncate to the same type");
31443 unsigned FromSz = VT.getScalarSizeInBits();
31444 unsigned ToSz = StVT.getScalarSizeInBits();
31446 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31448 // The truncating store is legal in some cases. For example
31449 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31450 // are designed for truncating stores.
31451 // In this case we don't need any further transformations.
31452 if (TLI.isTruncStoreLegal(VT, StVT))
31455 // From/To sizes and ElemCount must be pow of two.
31456 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31457 "Unexpected size for truncating masked store");
31458 // We are going to use the original vector elt for storing.
31459 // Accumulated smaller vector elements must be a multiple of the store size.
31460 assert (((NumElems * FromSz) % ToSz) == 0 &&
31461 "Unexpected ratio for truncating masked store");
31463 unsigned SizeRatio = FromSz / ToSz;
31464 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31466 // Create a type on which we perform the shuffle.
31467 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31468 StVT.getScalarType(), NumElems*SizeRatio);
31470 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31472 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
31473 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31474 for (unsigned i = 0; i != NumElems; ++i)
31475 ShuffleVec[i] = i * SizeRatio;
31477 // Can't shuffle using an illegal type.
31478 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31479 "WideVecVT should be legal");
31481 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31482 DAG.getUNDEF(WideVecVT),
31486 SDValue Mask = Mst->getMask();
31487 if (Mask.getValueType() == VT) {
31488 // Mask and original value have the same type.
31489 NewMask = DAG.getBitcast(WideVecVT, Mask);
31490 for (unsigned i = 0; i != NumElems; ++i)
31491 ShuffleVec[i] = i * SizeRatio;
31492 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
31493 ShuffleVec[i] = NumElems*SizeRatio;
31494 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31495 DAG.getConstant(0, dl, WideVecVT),
31498 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31499 unsigned WidenNumElts = NumElems*SizeRatio;
31500 unsigned MaskNumElts = VT.getVectorNumElements();
31501 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31504 unsigned NumConcat = WidenNumElts / MaskNumElts;
31505 SmallVector<SDValue, 16> Ops(NumConcat);
31506 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31508 for (unsigned i = 1; i != NumConcat; ++i)
31511 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31514 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
31515 Mst->getBasePtr(), NewMask, StVT,
31516 Mst->getMemOperand(), false);
31519 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
31520 const X86Subtarget &Subtarget) {
31521 StoreSDNode *St = cast<StoreSDNode>(N);
31522 EVT VT = St->getValue().getValueType();
31523 EVT StVT = St->getMemoryVT();
31525 SDValue StoredVal = St->getOperand(1);
31526 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31528 // If we are saving a concatenation of two XMM registers and 32-byte stores
31529 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
31531 unsigned AddressSpace = St->getAddressSpace();
31532 unsigned Alignment = St->getAlignment();
31533 if (VT.is256BitVector() && StVT == VT &&
31534 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
31535 AddressSpace, Alignment, &Fast) &&
31537 unsigned NumElems = VT.getVectorNumElements();
31541 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
31542 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
31544 SDValue Ptr0 = St->getBasePtr();
31545 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
31548 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
31549 Alignment, St->getMemOperand()->getFlags());
31551 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
31552 std::min(16U, Alignment), St->getMemOperand()->getFlags());
31553 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
31556 // Optimize trunc store (of multiple scalars) to shuffle and store.
31557 // First, pack all of the elements in one place. Next, store to memory
31558 // in fewer chunks.
31559 if (St->isTruncatingStore() && VT.isVector()) {
31560 // Check if we can detect an AVG pattern from the truncation. If yes,
31561 // replace the trunc store by a normal store with the result of X86ISD::AVG
31563 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
31565 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
31566 St->getPointerInfo(), St->getAlignment(),
31567 St->getMemOperand()->getFlags());
31570 detectUSatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
31571 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
31572 dl, Val, St->getBasePtr(),
31573 St->getMemoryVT(), St->getMemOperand(), DAG);
31575 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31576 unsigned NumElems = VT.getVectorNumElements();
31577 assert(StVT != VT && "Cannot truncate to the same type");
31578 unsigned FromSz = VT.getScalarSizeInBits();
31579 unsigned ToSz = StVT.getScalarSizeInBits();
31581 // The truncating store is legal in some cases. For example
31582 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31583 // are designed for truncating stores.
31584 // In this case we don't need any further transformations.
31585 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
31588 // From/To sizes and ElemCount must be pow of two.
31589 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
31590 // We are going to use the original vector elt for storing.
31591 // Accumulated smaller vector elements must be a multiple of the store size.
31592 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
31594 unsigned SizeRatio = FromSz / ToSz;
31596 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31598 // Create a type on which we perform the shuffle
31599 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31600 StVT.getScalarType(), NumElems*SizeRatio);
31602 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31604 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
31605 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
31606 for (unsigned i = 0; i != NumElems; ++i)
31607 ShuffleVec[i] = i * SizeRatio;
31609 // Can't shuffle using an illegal type.
31610 if (!TLI.isTypeLegal(WideVecVT))
31613 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31614 DAG.getUNDEF(WideVecVT),
31616 // At this point all of the data is stored at the bottom of the
31617 // register. We now need to save it to mem.
31619 // Find the largest store unit
31620 MVT StoreType = MVT::i8;
31621 for (MVT Tp : MVT::integer_valuetypes()) {
31622 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
31626 // On 32-bit systems, we can't save 64-bit integers. Try bitcasting to f64.
31627 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
31628 (64 <= NumElems * ToSz))
31629 StoreType = MVT::f64;
31631 // Bitcast the original vector into a vector of store-size units
31632 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
31633 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
31634 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
31635 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
31636 SmallVector<SDValue, 8> Chains;
31637 SDValue Ptr = St->getBasePtr();
31639 // Perform one or more big stores into memory.
31640 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
31641 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
31642 StoreType, ShuffWide,
31643 DAG.getIntPtrConstant(i, dl));
31645 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
31646 St->getAlignment(), St->getMemOperand()->getFlags());
31647 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
31648 Chains.push_back(Ch);
31651 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
31654 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
31655 // the FP state in cases where an emms may be missing.
31656 // A preferable solution to the general problem is to figure out the right
31657 // places to insert EMMS. This qualifies as a quick hack.
31659 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
31660 if (VT.getSizeInBits() != 64)
31663 const Function *F = DAG.getMachineFunction().getFunction();
31664 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
31666 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
31667 if ((VT.isVector() ||
31668 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
31669 isa<LoadSDNode>(St->getValue()) &&
31670 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
31671 St->getChain().hasOneUse() && !St->isVolatile()) {
31672 SDNode* LdVal = St->getValue().getNode();
31673 LoadSDNode *Ld = nullptr;
31674 int TokenFactorIndex = -1;
31675 SmallVector<SDValue, 8> Ops;
31676 SDNode* ChainVal = St->getChain().getNode();
31677 // Must be a store of a load. We currently handle two cases: the load
31678 // is a direct child, and it's under an intervening TokenFactor. It is
31679 // possible to dig deeper under nested TokenFactors.
31680 if (ChainVal == LdVal)
31681 Ld = cast<LoadSDNode>(St->getChain());
31682 else if (St->getValue().hasOneUse() &&
31683 ChainVal->getOpcode() == ISD::TokenFactor) {
31684 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
31685 if (ChainVal->getOperand(i).getNode() == LdVal) {
31686 TokenFactorIndex = i;
31687 Ld = cast<LoadSDNode>(St->getValue());
31689 Ops.push_back(ChainVal->getOperand(i));
31693 if (!Ld || !ISD::isNormalLoad(Ld))
31696 // If this is not the MMX case, i.e. we are just turning i64 load/store
31697 // into f64 load/store, avoid the transformation if there are multiple
31698 // uses of the loaded value.
31699 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
31704 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
31705 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
31707 if (Subtarget.is64Bit() || F64IsLegal) {
31708 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
31709 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
31710 Ld->getPointerInfo(), Ld->getAlignment(),
31711 Ld->getMemOperand()->getFlags());
31712 SDValue NewChain = NewLd.getValue(1);
31713 if (TokenFactorIndex >= 0) {
31714 Ops.push_back(NewChain);
31715 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
31717 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
31718 St->getPointerInfo(), St->getAlignment(),
31719 St->getMemOperand()->getFlags());
31722 // Otherwise, lower to two pairs of 32-bit loads / stores.
31723 SDValue LoAddr = Ld->getBasePtr();
31724 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
31726 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
31727 Ld->getPointerInfo(), Ld->getAlignment(),
31728 Ld->getMemOperand()->getFlags());
31729 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
31730 Ld->getPointerInfo().getWithOffset(4),
31731 MinAlign(Ld->getAlignment(), 4),
31732 Ld->getMemOperand()->getFlags());
31734 SDValue NewChain = LoLd.getValue(1);
31735 if (TokenFactorIndex >= 0) {
31736 Ops.push_back(LoLd);
31737 Ops.push_back(HiLd);
31738 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
31741 LoAddr = St->getBasePtr();
31742 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
31745 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
31746 St->getAlignment(), St->getMemOperand()->getFlags());
31747 SDValue HiSt = DAG.getStore(
31748 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
31749 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
31750 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
31753 // This is similar to the above case, but here we handle a scalar 64-bit
31754 // integer store that is extracted from a vector on a 32-bit target.
31755 // If we have SSE2, then we can treat it like a floating-point double
31756 // to get past legalization. The execution dependencies fixup pass will
31757 // choose the optimal machine instruction for the store if this really is
31758 // an integer or v2f32 rather than an f64.
31759 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
31760 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
31761 SDValue OldExtract = St->getOperand(1);
31762 SDValue ExtOp0 = OldExtract.getOperand(0);
31763 unsigned VecSize = ExtOp0.getValueSizeInBits();
31764 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
31765 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
31766 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
31767 BitCast, OldExtract.getOperand(1));
31768 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
31769 St->getPointerInfo(), St->getAlignment(),
31770 St->getMemOperand()->getFlags());
31776 /// Return 'true' if this vector operation is "horizontal"
31777 /// and return the operands for the horizontal operation in LHS and RHS. A
31778 /// horizontal operation performs the binary operation on successive elements
31779 /// of its first operand, then on successive elements of its second operand,
31780 /// returning the resulting values in a vector. For example, if
31781 /// A = < float a0, float a1, float a2, float a3 >
31783 /// B = < float b0, float b1, float b2, float b3 >
31784 /// then the result of doing a horizontal operation on A and B is
31785 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
31786 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
31787 /// A horizontal-op B, for some already available A and B, and if so then LHS is
31788 /// set to A, RHS to B, and the routine returns 'true'.
31789 /// Note that the binary operation should have the property that if one of the
31790 /// operands is UNDEF then the result is UNDEF.
31791 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
31792 // Look for the following pattern: if
31793 // A = < float a0, float a1, float a2, float a3 >
31794 // B = < float b0, float b1, float b2, float b3 >
31796 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
31797 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
31798 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
31799 // which is A horizontal-op B.
31801 // At least one of the operands should be a vector shuffle.
31802 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
31803 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
31806 MVT VT = LHS.getSimpleValueType();
31808 assert((VT.is128BitVector() || VT.is256BitVector()) &&
31809 "Unsupported vector type for horizontal add/sub");
31811 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
31812 // operate independently on 128-bit lanes.
31813 unsigned NumElts = VT.getVectorNumElements();
31814 unsigned NumLanes = VT.getSizeInBits()/128;
31815 unsigned NumLaneElts = NumElts / NumLanes;
31816 assert((NumLaneElts % 2 == 0) &&
31817 "Vector type should have an even number of elements in each lane");
31818 unsigned HalfLaneElts = NumLaneElts/2;
31820 // View LHS in the form
31821 // LHS = VECTOR_SHUFFLE A, B, LMask
31822 // If LHS is not a shuffle then pretend it is the shuffle
31823 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
31824 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
31827 SmallVector<int, 16> LMask(NumElts);
31828 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
31829 if (!LHS.getOperand(0).isUndef())
31830 A = LHS.getOperand(0);
31831 if (!LHS.getOperand(1).isUndef())
31832 B = LHS.getOperand(1);
31833 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
31834 std::copy(Mask.begin(), Mask.end(), LMask.begin());
31836 if (!LHS.isUndef())
31838 for (unsigned i = 0; i != NumElts; ++i)
31842 // Likewise, view RHS in the form
31843 // RHS = VECTOR_SHUFFLE C, D, RMask
31845 SmallVector<int, 16> RMask(NumElts);
31846 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
31847 if (!RHS.getOperand(0).isUndef())
31848 C = RHS.getOperand(0);
31849 if (!RHS.getOperand(1).isUndef())
31850 D = RHS.getOperand(1);
31851 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
31852 std::copy(Mask.begin(), Mask.end(), RMask.begin());
31854 if (!RHS.isUndef())
31856 for (unsigned i = 0; i != NumElts; ++i)
31860 // Check that the shuffles are both shuffling the same vectors.
31861 if (!(A == C && B == D) && !(A == D && B == C))
31864 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
31865 if (!A.getNode() && !B.getNode())
31868 // If A and B occur in reverse order in RHS, then "swap" them (which means
31869 // rewriting the mask).
31871 ShuffleVectorSDNode::commuteMask(RMask);
31873 // At this point LHS and RHS are equivalent to
31874 // LHS = VECTOR_SHUFFLE A, B, LMask
31875 // RHS = VECTOR_SHUFFLE A, B, RMask
31876 // Check that the masks correspond to performing a horizontal operation.
31877 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
31878 for (unsigned i = 0; i != NumLaneElts; ++i) {
31879 int LIdx = LMask[i+l], RIdx = RMask[i+l];
31881 // Ignore any UNDEF components.
31882 if (LIdx < 0 || RIdx < 0 ||
31883 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
31884 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
31887 // Check that successive elements are being operated on. If not, this is
31888 // not a horizontal operation.
31889 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
31890 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
31891 if (!(LIdx == Index && RIdx == Index + 1) &&
31892 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
31897 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
31898 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
31902 /// Do target-specific dag combines on floating-point adds/subs.
31903 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
31904 const X86Subtarget &Subtarget) {
31905 EVT VT = N->getValueType(0);
31906 SDValue LHS = N->getOperand(0);
31907 SDValue RHS = N->getOperand(1);
31908 bool IsFadd = N->getOpcode() == ISD::FADD;
31909 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
31911 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
31912 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
31913 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
31914 isHorizontalBinOp(LHS, RHS, IsFadd)) {
31915 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
31916 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
31921 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify the codegen.
31923 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
31924 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
31925 const X86Subtarget &Subtarget,
31927 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
31928 SDValue Src = N->getOperand(0);
31929 unsigned Opcode = Src.getOpcode();
31930 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31932 EVT VT = N->getValueType(0);
31933 EVT SrcVT = Src.getValueType();
31935 auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
31936 // TODO: Add extra cases where we can truncate both inputs for the
31937 // cost of one (or none).
31938 // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
31942 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
31943 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
31944 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
31945 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
31948 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
31949 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
31950 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
31951 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
31954 // Don't combine if the operation has other uses.
31955 if (!N->isOnlyUserOf(Src.getNode()))
31958 // Only support vector truncation for now.
31959 // TODO: i64 scalar math would benefit as well.
31960 if (!VT.isVector())
31963 // In most cases it's only worth pre-truncating if we're only facing the cost
31964 // of one truncation.
31965 // i.e. if one of the inputs will constant fold or the input is repeated.
31970 SDValue Op0 = Src.getOperand(0);
31971 SDValue Op1 = Src.getOperand(1);
31972 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
31973 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31974 return TruncateArithmetic(Op0, Op1);
31979 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
31980 // better to truncate if we have the chance.
31981 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
31982 !TLI.isOperationLegal(Opcode, SrcVT))
31983 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
31986 SDValue Op0 = Src.getOperand(0);
31987 SDValue Op1 = Src.getOperand(1);
31988 if (TLI.isOperationLegal(Opcode, VT) &&
31989 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31990 return TruncateArithmetic(Op0, Op1);
31998 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
32000 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
32001 SmallVector<SDValue, 8> &Regs) {
32002 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
32003 Regs[0].getValueType() == MVT::v2i64));
32004 EVT OutVT = N->getValueType(0);
32005 EVT OutSVT = OutVT.getVectorElementType();
32006 EVT InVT = Regs[0].getValueType();
32007 EVT InSVT = InVT.getVectorElementType();
32010 // First, use a mask to unset all bits that won't appear in the result.
32011 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
32012 "OutSVT can only be either i8 or i16.");
32014 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
32015 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
32016 for (auto &Reg : Regs)
32017 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
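// With the upper bits cleared, the unsigned saturation in PACKUS can never
// trigger, so each packing step below is a plain truncation of the elements.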
32019 MVT UnpackedVT, PackedVT;
32020 if (OutSVT == MVT::i8) {
32021 UnpackedVT = MVT::v8i16;
32022 PackedVT = MVT::v16i8;
32024 UnpackedVT = MVT::v4i32;
32025 PackedVT = MVT::v8i16;
32028 // In each iteration, truncate the element type to half its size.
32029 auto RegNum = Regs.size();
32030 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
32031 j < e; j *= 2, RegNum /= 2) {
32032 for (unsigned i = 0; i < RegNum; i++)
32033 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
32034 for (unsigned i = 0; i < RegNum / 2; i++)
32035 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
32039 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS and
32040 // then extract a subvector as the result, since v8i8 is not a legal type.
32041 if (OutVT == MVT::v8i8) {
32042 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
32043 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
32044 DAG.getIntPtrConstant(0, DL));
32046 } else if (RegNum > 1) {
32047 Regs.resize(RegNum);
32048 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32053 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
32055 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
32057 SmallVector<SDValue, 8> &Regs) {
32058 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
32059 EVT OutVT = N->getValueType(0);
32062 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
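// This sign-extends the low 16 bits of each i32 element in place, so the
// signed saturation in PACKSS below reproduces those low 16 bits exactly.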
32063 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
32064 for (auto &Reg : Regs) {
32065 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
32067 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
32071 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
32072 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
32075 if (Regs.size() > 2) {
32076 Regs.resize(Regs.size() / 2);
32077 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32082 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
32083 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
32084 /// legalization the truncation will be translated into a BUILD_VECTOR with each
32085 /// element extracted from a vector and then truncated, and it is
32086 /// difficult to perform this optimization on that form.
32087 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
32088 const X86Subtarget &Subtarget) {
32089 EVT OutVT = N->getValueType(0);
32090 if (!OutVT.isVector())
32093 SDValue In = N->getOperand(0);
32094 if (!In.getValueType().isSimple())
32097 EVT InVT = In.getValueType();
32098 unsigned NumElems = OutVT.getVectorNumElements();
32100 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
32101 // SSE2, and we need to take care of it specially.
32102 // AVX512 provides vpmovdb.
32103 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
32106 EVT OutSVT = OutVT.getVectorElementType();
32107 EVT InSVT = InVT.getVectorElementType();
32108 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
32109 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
32113 // SSSE3's pshufb results in fewer instructions in the cases below.
32114 if (Subtarget.hasSSSE3() && NumElems == 8 &&
32115 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
32116 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
32121 // Split a long vector into vectors of legal type.
32122 unsigned RegNum = InVT.getSizeInBits() / 128;
32123 SmallVector<SDValue, 8> SubVec(RegNum);
32124 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
32125 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
32127 for (unsigned i = 0; i < RegNum; i++)
32128 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
32129 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
32131 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
32132 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
32133 // truncate 2 x v4i32 to v8i16.
32134 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
32135 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
32136 else if (InSVT == MVT::i32)
32137 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
32142 /// This function transforms vector truncation of 'all or none' bits values
32143 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
32144 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
32146 const X86Subtarget &Subtarget) {
32147 // Requires SSE2 but AVX512 has fast truncate.
32148 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
32151 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
32154 SDValue In = N->getOperand(0);
32155 if (!In.getValueType().isSimple())
32158 MVT VT = N->getValueType(0).getSimpleVT();
32159 MVT SVT = VT.getScalarType();
32161 MVT InVT = In.getValueType().getSimpleVT();
32162 MVT InSVT = InVT.getScalarType();
32164 // Use PACKSS if the input is a splatted sign bit.
32165 // e.g. Comparison result, sext_in_reg, etc.
32166 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
32167 if (NumSignBits != InSVT.getSizeInBits())
32170 // Check we have a truncation suited for PACKSS.
32171 if (!VT.is128BitVector() && !VT.is256BitVector())
32173 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
32175 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
32178 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
32181 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
32182 const X86Subtarget &Subtarget) {
32183 EVT VT = N->getValueType(0);
32184 SDValue Src = N->getOperand(0);
32187 // Attempt to pre-truncate inputs to arithmetic ops instead.
32188 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
32191 // Try to detect AVG pattern first.
32192 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
32195 // Try the truncation with unsigned saturation.
32196 if (SDValue Val = detectUSatPattern(Src, VT, Subtarget))
32197 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, Val);
32199 // The bitcast source is a direct mmx result.
32200 // Detect a truncate to i32 of a bitcast from x86mmx.
32201 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
32202 SDValue BCSrc = Src.getOperand(0);
32203 if (BCSrc.getValueType() == MVT::x86mmx)
32204 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
32207 // Try to truncate extended sign bits with PACKSS.
32208 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
32211 return combineVectorTruncation(N, DAG, Subtarget);
32214 /// Returns the negated value if the node \p N flips the sign of an FP value.
32216 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
32217 /// AVX512F does not have FXOR, so FNEG is lowered as
32218 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
32219 /// In this case we go through all bitcasts.
32220 static SDValue isFNEG(SDNode *N) {
32221 if (N->getOpcode() == ISD::FNEG)
32222 return N->getOperand(0);
32224 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
32225 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
32228 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
32229 if (!Op1.getValueType().isFloatingPoint())
32232 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
32234 unsigned EltBits = Op1.getScalarValueSizeInBits();
32235 auto isSignBitValue = [&](const ConstantFP *C) {
32236 return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
32239 // There is more than one way to represent the same constant on
32240 // the different X86 targets. The type of the node may also depend on size.
32241 // - load scalar value and broadcast
32242 // - BUILD_VECTOR node
32243 // - load from a constant pool.
32244 // We check all variants here.
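// e.g. for f32 the sign-bit constant is 0x80000000 and for f64 it is
// 0x8000000000000000.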
32245 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
32246 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
32247 if (isSignBitValue(cast<ConstantFP>(C)))
32250 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
32251 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
32252 if (isSignBitValue(CN->getConstantFPValue()))
32255 } else if (auto *C = getTargetConstantFromNode(Op1)) {
32256 if (C->getType()->isVectorTy()) {
32257 if (auto *SplatV = C->getSplatValue())
32258 if (isSignBitValue(cast<ConstantFP>(SplatV)))
32260 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
32261 if (isSignBitValue(FPConst))
32267 /// Do target-specific dag combines on floating point negations.
32268 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
32269 const X86Subtarget &Subtarget) {
32270 EVT OrigVT = N->getValueType(0);
32271 SDValue Arg = isFNEG(N);
32272 assert(Arg.getNode() && "N is expected to be an FNEG node");
32274 EVT VT = Arg.getValueType();
32275 EVT SVT = VT.getScalarType();
32278 // Let legalize expand this if it isn't a legal type yet.
32279 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32282 // If we're negating a FMUL node on a target with FMA, then we can avoid the
32283 // use of a constant by performing (-0 - A*B) instead.
32284 // FIXME: Check rounding control flags as well once it becomes available.
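// For example (sketch): with FMA available, (fneg (fmul a, b)) becomes
// (X86ISD::FNMSUB a, b, 0.0), which computes -(a*b) - 0.0, so no sign-mask
// constant load is needed.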
32285 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
32286 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
32287 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
32288 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
32289 Arg.getOperand(1), Zero);
32290 return DAG.getBitcast(OrigVT, NewNode);
32293 // If we're negating an FMA node, then we can adjust the
32294 // instruction to include the extra negation.
32295 unsigned NewOpcode = 0;
32296 if (Arg.hasOneUse()) {
32297 switch (Arg.getOpcode()) {
32298 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
32299 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
32300 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
32301 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
32302 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
32303 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
32304 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
32305 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
32306 // We can't handle a scalar intrinsic node here because it would only
32307 // invert one element and not the whole vector. But we could try to handle
32308 // a negation of the lower element only.
32312 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
32313 Arg.getNode()->ops()));
32318 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
32319 const X86Subtarget &Subtarget) {
32320 MVT VT = N->getSimpleValueType(0);
32321 // If we have integer vector types available, use the integer opcodes.
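// For example, (v4f32 FXOR a, b) is rewritten (roughly) as
// (v4f32 bitcast (xor (v2i64 bitcast a), (v2i64 bitcast b))).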
32322 if (VT.isVector() && Subtarget.hasSSE2()) {
32325 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
32327 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
32328 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
32329 unsigned IntOpcode;
32330 switch (N->getOpcode()) {
32331 default: llvm_unreachable("Unexpected FP logic op");
32332 case X86ISD::FOR: IntOpcode = ISD::OR; break;
32333 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
32334 case X86ISD::FAND: IntOpcode = ISD::AND; break;
32335 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
32337 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
32338 return DAG.getBitcast(VT, IntOp);
32343 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
32344 TargetLowering::DAGCombinerInfo &DCI,
32345 const X86Subtarget &Subtarget) {
32346 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
32349 if (DCI.isBeforeLegalizeOps())
32352 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
32355 if (Subtarget.hasCMov())
32356 if (SDValue RV = combineIntegerAbs(N, DAG))
32359 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32363 return combineFneg(N, DAG, Subtarget);
32368 static bool isNullFPScalarOrVectorConst(SDValue V) {
32369 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
32372 /// If a value is a scalar FP zero or a vector FP zero (potentially including
32373 /// undefined elements), return a zero constant that may be used to fold away
32374 /// that value. In the case of a vector, the returned constant will not contain
32375 /// undefined elements even if the input parameter does. This makes it suitable
32376 /// to be used as a replacement operand with operations (eg, bitwise-and) where
32377 /// an undef should not propagate.
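/// For example, a v4f32 (0.0, undef, 0.0, undef) operand is replaced by an
/// all-zero v4f32 constant, so an undef lane cannot leak through a following
/// bitwise op such as FAND.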
32378 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
32379 const X86Subtarget &Subtarget) {
32380 if (!isNullFPScalarOrVectorConst(V))
32383 if (V.getValueType().isVector())
32384 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
32389 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
32390 const X86Subtarget &Subtarget) {
32391 SDValue N0 = N->getOperand(0);
32392 SDValue N1 = N->getOperand(1);
32393 EVT VT = N->getValueType(0);
32396 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
32397 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
32398 (VT == MVT::f64 && Subtarget.hasSSE2())))
32401 auto isAllOnesConstantFP = [](SDValue V) {
32402 auto *C = dyn_cast<ConstantFPSDNode>(V);
32403 return C && C->getConstantFPValue()->isAllOnesValue();
32406 // fand (fxor X, -1), Y --> fandn X, Y
32407 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
32408 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
32410 // fand X, (fxor Y, -1) --> fandn Y, X
32411 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
32412 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
32417 /// Do target-specific dag combines on X86ISD::FAND nodes.
32418 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
32419 const X86Subtarget &Subtarget) {
32420 // FAND(0.0, x) -> 0.0
32421 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
32424 // FAND(x, 0.0) -> 0.0
32425 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32428 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
32431 return lowerX86FPLogicOp(N, DAG, Subtarget);
32434 /// Do target-specific dag combines on X86ISD::FANDN nodes.
32435 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
32436 const X86Subtarget &Subtarget) {
32437 // FANDN(0.0, x) -> x
32438 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32439 return N->getOperand(1);
32441 // FANDN(x, 0.0) -> 0.0
32442 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32445 return lowerX86FPLogicOp(N, DAG, Subtarget);
32448 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
32449 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
32450 const X86Subtarget &Subtarget) {
32451 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
32453 // F[X]OR(0.0, x) -> x
32454 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32455 return N->getOperand(1);
32457 // F[X]OR(x, 0.0) -> x
32458 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
32459 return N->getOperand(0);
32462 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
32465 return lowerX86FPLogicOp(N, DAG, Subtarget);
32468 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
32469 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
32470 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
32472 // Only perform optimizations if UnsafeMath is used.
32473 if (!DAG.getTarget().Options.UnsafeFPMath)
32476 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
32477 // into FMINC and FMAXC, which are commutative operations.
32478 unsigned NewOp = 0;
32479 switch (N->getOpcode()) {
32480 default: llvm_unreachable("unknown opcode");
32481 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
32482 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
32485 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
32486 N->getOperand(0), N->getOperand(1));
32489 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
32490 const X86Subtarget &Subtarget) {
32491 if (Subtarget.useSoftFloat())
32494 // TODO: Check for global or instruction-level "nnan". In that case, we
32495 // should be able to lower to FMAX/FMIN alone.
32496 // TODO: If an operand is already known to be a NaN or not a NaN, this
32497 // should be an optional swap and FMAX/FMIN.
32499 EVT VT = N->getValueType(0);
32500 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
32501 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
32502 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
32505 // This takes at least 3 instructions, so favor a library call when operating
32506 // on a scalar and minimizing code size.
32507 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
32510 SDValue Op0 = N->getOperand(0);
32511 SDValue Op1 = N->getOperand(1);
32513 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
32514 DAG.getDataLayout(), *DAG.getContext(), VT);
32516 // There are 4 possibilities involving NaN inputs, and these are the required
32517 // outputs:
32518 //                   Op1
32519 //               Num     NaN
32520 //            ----------------
32521 //     Num    |  Max  |  Op0 |
32522 // Op0        ----------------
32523 //     NaN    |  Op1  |  NaN |
32524 //            ----------------
32526 // The SSE FP max/min instructions were not designed for this case, but rather
32527 // to implement:
32528 // Min = Op1 < Op0 ? Op1 : Op0
32529 // Max = Op1 > Op0 ? Op1 : Op0
32531 // So they always return Op0 if either input is a NaN. However, we can still
32532 // use those instructions for fmaxnum by selecting away a NaN input.
32534 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
32535 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
32536 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
32537 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
32539 // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both operands
32540 // are NaN, the NaN value of Op1 is the result.
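// In other words, the emitted sequence is roughly:
//   MinOrMax = FMIN/FMAX(Op1, Op0)
//   IsOp0Nan = setcc(Op0, Op0, SETUO)
//   Result   = select(IsOp0Nan, Op1, MinOrMax)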
32541 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
32542 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
32545 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
32546 TargetLowering::DAGCombinerInfo &DCI) {
32547 // BT ignores high bits in the bit index operand.
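// For example, a 32-bit BT observes only the low 5 bits of the index, so a
// mask such as (and %idx, 31) can be simplified away via SimplifyDemandedBits.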
32548 SDValue Op1 = N->getOperand(1);
32549 if (Op1.hasOneUse()) {
32550 unsigned BitWidth = Op1.getValueSizeInBits();
32551 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
32552 APInt KnownZero, KnownOne;
32553 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32554 !DCI.isBeforeLegalizeOps());
32555 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32556 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
32557 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
32558 DCI.CommitTargetLoweringOpt(TLO);
32563 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
32564 const X86Subtarget &Subtarget) {
32565 EVT VT = N->getValueType(0);
32566 if (!VT.isVector())
32569 SDValue N0 = N->getOperand(0);
32570 SDValue N1 = N->getOperand(1);
32571 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
32574 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
32575 // SSE and AVX2 since there is no sign-extended shift-right
32576 // operation on a vector with 64-bit elements.
32577 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
32578 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
32579 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
32580 N0.getOpcode() == ISD::SIGN_EXTEND)) {
32581 SDValue N00 = N0.getOperand(0);
32583 // EXTLOAD has a better solution on AVX2,
32584 // it may be replaced with X86ISD::VSEXT node.
32585 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
32586 if (!ISD::isNormalLoad(N00.getNode()))
32589 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
32590 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
32592 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
32598 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
32599 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
32600 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
32601 /// opportunities to combine math ops, use an LEA, or use a complex addressing
32602 /// mode. This can eliminate extend, add, and shift instructions.
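///
/// A rough C-level illustration (hypothetical code, not from this file):
///   int64_t f(int32_t x) { return (int64_t)(x + 7) * 8; }
/// Here sext(add nsw x, 7) is rewritten as add(sext x, 7), and the shift-by-3
/// user then lets the add and shift fold into a single LEA.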
32603 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
32604 const X86Subtarget &Subtarget) {
32605 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
32606 Ext->getOpcode() != ISD::ZERO_EXTEND)
32609 // TODO: This should be valid for other integer types.
32610 EVT VT = Ext->getValueType(0);
32611 if (VT != MVT::i64)
32614 SDValue Add = Ext->getOperand(0);
32615 if (Add.getOpcode() != ISD::ADD)
32618 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
32619 bool NSW = Add->getFlags()->hasNoSignedWrap();
32620 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
32622 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
32623 // into the 'zext'.
32624 if ((Sext && !NSW) || (!Sext && !NUW))
32627 // Having a constant operand to the 'add' ensures that we are not increasing
32628 // the instruction count because the constant is extended for free below.
32629 // A constant operand can also become the displacement field of an LEA.
32630 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
32634 // Don't make the 'add' bigger if there's no hope of combining it with some
32635 // other 'add' or 'shl' instruction.
32636 // TODO: It may be profitable to generate simpler LEA instructions in place
32637 // of single 'add' instructions, but the cost model for selecting an LEA
32638 // currently has a high threshold.
32639 bool HasLEAPotential = false;
32640 for (auto *User : Ext->uses()) {
32641 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
32642 HasLEAPotential = true;
32646 if (!HasLEAPotential)
32649 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
32650 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
32651 SDValue AddOp0 = Add.getOperand(0);
32652 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
32653 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
32655 // The wider add is guaranteed to not wrap because both operands are
32656 // sign-extended or zero-extended.
32657 SDNodeFlags Flags;
32658 Flags.setNoSignedWrap(NSW);
32659 Flags.setNoUnsignedWrap(NUW);
32660 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
32663 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y))) ->
32664 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)))
32665 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
32666 /// extends from AH (which we otherwise need to do contortions to access).
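/// For example (sketch): (i32 sext of result 1 of (sdivrem i8 %x, i8 %y))
/// becomes result 1 of (X86ISD::SDIVREM8_SEXT_HREG %x, %y), whose remainder is
/// produced already sign-extended out of AH.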
32667 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
32668 SDValue N0 = N->getOperand(0);
32669 auto OpcodeN = N->getOpcode();
32670 auto OpcodeN0 = N0.getOpcode();
32671 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
32672 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
32675 EVT VT = N->getValueType(0);
32676 EVT InVT = N0.getValueType();
32677 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
32680 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
32681 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
32682 : X86ISD::UDIVREM8_ZEXT_HREG;
32683 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
32685 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
32686 return R.getValue(1);
32689 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
32690 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
32691 /// with UNDEFs) of the input to vectors of the same size as the target type
32692 /// which then extends the lowest elements.
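/// For example (sketch), on an SSE4.1 target a (v4i64 sext (v4i32 x)) is split
/// into two 128-bit SIGN_EXTEND_VECTOR_INREG nodes (lowered as PMOVSXDQ), each
/// fed by a two-element half of x widened with undefs, and the two results are
/// concatenated.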
32693 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
32694 TargetLowering::DAGCombinerInfo &DCI,
32695 const X86Subtarget &Subtarget) {
32696 unsigned Opcode = N->getOpcode();
32697 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
32699 if (!DCI.isBeforeLegalizeOps())
32701 if (!Subtarget.hasSSE2())
32704 SDValue N0 = N->getOperand(0);
32705 EVT VT = N->getValueType(0);
32706 EVT SVT = VT.getScalarType();
32707 EVT InVT = N0.getValueType();
32708 EVT InSVT = InVT.getScalarType();
32710 // Input type must be a vector and we must be extending legal integer types.
32711 if (!VT.isVector())
32713 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
32715 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
32718 // On AVX2+ targets, if the input/output types are both legal then we will be
32719 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
32720 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
32721 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
32726 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
32727 EVT InVT = N.getValueType();
32728 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
32729 Size / InVT.getScalarSizeInBits());
32730 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
32731 DAG.getUNDEF(InVT));
32733 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
32736 // If the target size is less than 128 bits, extend to a type that would extend
32737 // to 128 bits, extend that, and extract the original target vector.
32738 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
32739 unsigned Scale = 128 / VT.getSizeInBits();
32741 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
32742 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
32743 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
32744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
32745 DAG.getIntPtrConstant(0, DL));
32748 // If the target size is 128 bits (or 256/512 bits on AVX2/AVX512 targets), convert
32749 // to ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
32750 // Also use this if we don't have SSE41, to allow the legalizer to do its job.
32751 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
32752 (VT.is256BitVector() && Subtarget.hasInt256()) ||
32753 (VT.is512BitVector() && Subtarget.hasAVX512())) {
32754 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
32755 return Opcode == ISD::SIGN_EXTEND
32756 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
32757 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
32760 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
32761 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
32762 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
32763 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
32764 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
32766 SmallVector<SDValue, 8> Opnds;
32767 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
32768 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
32769 DAG.getIntPtrConstant(Offset, DL));
32770 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
32771 SrcVec = Opcode == ISD::SIGN_EXTEND
32772 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
32773 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
32774 Opnds.push_back(SrcVec);
32776 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
32779 // On pre-AVX2 targets, split into 128-bit nodes of
32780 // ISD::*_EXTEND_VECTOR_INREG.
32781 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
32782 return SplitAndExtendInReg(128);
32784 // On pre-AVX512 targets, split into 256-bit nodes of
32785 // ISD::*_EXTEND_VECTOR_INREG.
32786 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
32787 return SplitAndExtendInReg(256);
32792 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
32793 TargetLowering::DAGCombinerInfo &DCI,
32794 const X86Subtarget &Subtarget) {
32795 SDValue N0 = N->getOperand(0);
32796 EVT VT = N->getValueType(0);
32797 EVT InVT = N0.getValueType();
32800 if (SDValue DivRem8 = getDivRem8(N, DAG))
32803 if (!DCI.isBeforeLegalizeOps()) {
32804 if (InVT == MVT::i1) {
32805 SDValue Zero = DAG.getConstant(0, DL, VT);
32807 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
32808 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
32813 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
32816 if (Subtarget.hasAVX() && VT.is256BitVector())
32817 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
32820 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
32826 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
32827 const X86Subtarget &Subtarget) {
32829 EVT VT = N->getValueType(0);
32831 // Let legalize expand this if it isn't a legal type yet.
32832 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32835 EVT ScalarVT = VT.getScalarType();
32836 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
32839 SDValue A = N->getOperand(0);
32840 SDValue B = N->getOperand(1);
32841 SDValue C = N->getOperand(2);
32843 auto invertIfNegative = [](SDValue &V) {
32844 if (SDValue NegVal = isFNEG(V.getNode())) {
32851 // Do not convert the passthru input of scalar intrinsics.
32852 // FIXME: We could allow negations of the lower element only.
32853 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
32854 bool NegB = invertIfNegative(B);
32855 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
32857 // The multiplication is negated when exactly one of A or B is negated (NegA xor NegB).
32858 bool NegMul = (NegA != NegB);
32860 unsigned NewOpcode;
32862 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
32864 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
32867 if (N->getOpcode() == X86ISD::FMADD_RND) {
32868 switch (NewOpcode) {
32869 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
32870 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
32871 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
32872 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
32874 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
32875 switch (NewOpcode) {
32876 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
32877 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
32878 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
32879 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
32881 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
32882 switch (NewOpcode) {
32883 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
32884 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
32885 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
32886 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
32889 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
32890 "Unexpected opcode!");
32891 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
32894 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
32897 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
32898 TargetLowering::DAGCombinerInfo &DCI,
32899 const X86Subtarget &Subtarget) {
32900 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
32901 // (and (i32 x86isd::setcc_carry), 1)
32902 // This eliminates the zext. This transformation is necessary because
32903 // ISD::SETCC is always legalized to i8.
32905 SDValue N0 = N->getOperand(0);
32906 EVT VT = N->getValueType(0);
32908 if (N0.getOpcode() == ISD::AND &&
32910 N0.getOperand(0).hasOneUse()) {
32911 SDValue N00 = N0.getOperand(0);
32912 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32913 if (!isOneConstant(N0.getOperand(1)))
32915 return DAG.getNode(ISD::AND, dl, VT,
32916 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
32917 N00.getOperand(0), N00.getOperand(1)),
32918 DAG.getConstant(1, dl, VT));
32922 if (N0.getOpcode() == ISD::TRUNCATE &&
32924 N0.getOperand(0).hasOneUse()) {
32925 SDValue N00 = N0.getOperand(0);
32926 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32927 return DAG.getNode(ISD::AND, dl, VT,
32928 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
32929 N00.getOperand(0), N00.getOperand(1)),
32930 DAG.getConstant(1, dl, VT));
32934 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
32937 if (VT.is256BitVector())
32938 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
32941 if (SDValue DivRem8 = getDivRem8(N, DAG))
32944 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
32947 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
32953 /// Optimize x == -y --> x+y == 0
32954 /// x != -y --> x+y != 0
32955 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
32956 const X86Subtarget &Subtarget) {
32957 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
32958 SDValue LHS = N->getOperand(0);
32959 SDValue RHS = N->getOperand(1);
32960 EVT VT = N->getValueType(0);
32963 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
32964 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
32965 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
32966 LHS.getOperand(1));
32967 return DAG.getSetCC(DL, N->getValueType(0), addV,
32968 DAG.getConstant(0, DL, addV.getValueType()), CC);
32970 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
32971 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
32972 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
32973 RHS.getOperand(1));
32974 return DAG.getSetCC(DL, N->getValueType(0), addV,
32975 DAG.getConstant(0, DL, addV.getValueType()), CC);
32978 if (VT.getScalarType() == MVT::i1 &&
32979 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
32981 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
32982 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
32983 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
32985 if (!IsSEXT0 || !IsVZero1) {
32986 // Swap the operands and update the condition code.
32987 std::swap(LHS, RHS);
32988 CC = ISD::getSetCCSwappedOperands(CC);
32990 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
32991 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
32992 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
32995 if (IsSEXT0 && IsVZero1) {
32996 assert(VT == LHS.getOperand(0).getValueType() &&
32997 "Uexpected operand type");
32998 if (CC == ISD::SETGT)
32999 return DAG.getConstant(0, DL, VT);
33000 if (CC == ISD::SETLE)
33001 return DAG.getConstant(1, DL, VT);
33002 if (CC == ISD::SETEQ || CC == ISD::SETGE)
33003 return DAG.getNOT(DL, LHS.getOperand(0), VT);
33005 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
33006 "Unexpected condition code!");
33007 return LHS.getOperand(0);
33011 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
33012 // to avoid scalarization via legalization because v4i32 is not a legal type.
33013 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
33014 LHS.getValueType() == MVT::v4f32)
33015 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
33020 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
33022 // Gather and Scatter instructions use k-registers for masks. The type of
33023 // the masks is v*i1. So the mask will be truncated anyway.
33024 // The SIGN_EXTEND_INREG may be dropped.
33025 SDValue Mask = N->getOperand(2);
33026 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
33027 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
33028 NewOps[2] = Mask.getOperand(0);
33029 DAG.UpdateNodeOperands(N, NewOps);
33034 // Helper function of combineX86SetCC. It materializes "setb reg"
33035 // as "sbb reg,reg", since that can be extended without a zext and produces
33036 // an all-ones bit which is more useful than 0/1 in some cases.
33037 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
33038 SelectionDAG &DAG, MVT VT) {
33040 return DAG.getNode(ISD::AND, DL, VT,
33041 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33042 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33044 DAG.getConstant(1, DL, VT));
33045 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
33046 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
33047 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33048 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33052 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
33053 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
33054 TargetLowering::DAGCombinerInfo &DCI,
33055 const X86Subtarget &Subtarget) {
33057 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
33058 SDValue EFLAGS = N->getOperand(1);
33060 if (CC == X86::COND_A) {
33061 // Try to convert COND_A into COND_B in an attempt to facilitate
33062 // materializing "setb reg".
33064 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
33065 // cannot take an immediate as its first operand.
33067 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
33068 EFLAGS.getValueType().isInteger() &&
33069 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
33070 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
33071 EFLAGS.getNode()->getVTList(),
33072 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
33073 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
33074 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
33078 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
33079 // a zext and produces an all-ones bit which is more useful than 0/1 in some
33080 // cases.
33081 if (CC == X86::COND_B)
33082 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
33084 // Try to simplify the EFLAGS and condition code operands.
33085 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
33086 return getSETCC(CC, Flags, DL, DAG);
33091 /// Optimize branch condition evaluation.
33092 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
33093 TargetLowering::DAGCombinerInfo &DCI,
33094 const X86Subtarget &Subtarget) {
33096 SDValue EFLAGS = N->getOperand(3);
33097 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
33099 // Try to simplify the EFLAGS and condition code operands.
33100 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
33101 // RAUW them under us.
33102 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
33103 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
33104 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
33105 N->getOperand(1), Cond, Flags);
33111 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
33112 SelectionDAG &DAG) {
33113 // Take advantage of vector comparisons producing 0 or -1 in each lane to
33114 // optimize away the operation when it is applied to a constant.
33116 // The general transformation is:
33117 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
33118 // AND(VECTOR_CMP(x,y), constant2)
33119 // constant2 = UNARYOP(constant)
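//
// A concrete sketch:
//   (v4f32 sint_to_fp (and (setcc x, y, seteq), (build_vector 1,1,1,1)))
// becomes
//   (v4f32 bitcast (and (setcc x, y, seteq), (bitcast (v4f32 1.0,...))))
// because each cmp lane is all-ones or all-zeros, so masking the converted
// constant yields the same per-lane result (1.0 or 0.0).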
33121 // Early exit if this isn't a vector operation, the operand of the
33122 // unary operation isn't a bitwise AND, or if the sizes of the operations
33123 // aren't the same.
33124 EVT VT = N->getValueType(0);
33125 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
33126 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
33127 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
33130 // Now check that the other operand of the AND is a constant. We could
33131 // make the transformation for non-constant splats as well, but it's unclear
33132 // that would be a benefit as it would not eliminate any operations, just
33133 // perform one more step in scalar code before moving to the vector unit.
33134 if (BuildVectorSDNode *BV =
33135 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
33136 // Bail out if the vector isn't a constant.
33137 if (!BV->isConstant())
33140 // Everything checks out. Build up the new and improved node.
33142 EVT IntVT = BV->getValueType(0);
33143 // Create a new constant of the appropriate type for the transformed
33144 // DAG node.
33145 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
33146 // The AND node needs bitcasts to/from an integer vector type around it.
33147 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
33148 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
33149 N->getOperand(0)->getOperand(0), MaskConst);
33150 SDValue Res = DAG.getBitcast(VT, NewAnd);
33157 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
33158 const X86Subtarget &Subtarget) {
33159 SDValue Op0 = N->getOperand(0);
33160 EVT VT = N->getValueType(0);
33161 EVT InVT = Op0.getValueType();
33162 EVT InSVT = InVT.getScalarType();
33163 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33165 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
33166 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
33167 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
33169 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33170 InVT.getVectorNumElements());
33171 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
33173 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
33174 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
33176 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33179 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
33180 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
33181 // the optimization here.
33182 if (DAG.SignBitIsZero(Op0))
33183 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
33188 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
33189 const X86Subtarget &Subtarget) {
33190 // First try to optimize away the conversion entirely when it's
33191 // conditionally from a constant. Vectors only.
33192 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
33195 // Now move on to more general possibilities.
33196 SDValue Op0 = N->getOperand(0);
33197 EVT VT = N->getValueType(0);
33198 EVT InVT = Op0.getValueType();
33199 EVT InSVT = InVT.getScalarType();
33201 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
33202 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
33203 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
33204 if (InVT.isVector() &&
33205 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
33206 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
33208 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33209 InVT.getVectorNumElements());
33210 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
33211 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33214 // Without AVX512DQ we only support i64 to float scalar conversion. For both
33215 // vectors and scalars, see if we know that the upper bits are all the sign
33216 // bit, in which case we can truncate the input to i32 and convert from that.
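// For example, (f64 sint_to_fp (i64 sext (i32 x))) has at least 33 sign bits,
// so it can be narrowed to a 32-bit sint_to_fp of the truncated input and
// lowered with a plain 32-bit CVTSI2SD.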
33217 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
33218 unsigned BitWidth = InVT.getScalarSizeInBits();
33219 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
33220 if (NumSignBits >= (BitWidth - 31)) {
33221 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
33222 if (InVT.isVector())
33223 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
33224 InVT.getVectorNumElements());
33226 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
33227 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
33231 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
33232 // a 32-bit target where SSE doesn't support i64->FP operations.
33233 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
33234 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
33235 EVT LdVT = Ld->getValueType(0);
33237 // This transformation is not supported if the result type is f16 or f128.
33238 if (VT == MVT::f16 || VT == MVT::f128)
33241 if (!Ld->isVolatile() && !VT.isVector() &&
33242 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
33243 !Subtarget.is64Bit() && LdVT == MVT::i64) {
33244 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
33245 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
33246 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
33253 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
33254 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
33255 X86TargetLowering::DAGCombinerInfo &DCI) {
33256 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
33257 // the result is either zero or one (depending on the input carry bit).
33258 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
33259 if (X86::isZeroNode(N->getOperand(0)) &&
33260 X86::isZeroNode(N->getOperand(1)) &&
33261 // We don't have a good way to replace an EFLAGS use, so only do this when
33262 // the EFLAGS result is unused.
33263 SDValue(N, 1).use_empty()) {
33265 EVT VT = N->getValueType(0);
33266 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
33267 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
33268 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
33269 DAG.getConstant(X86::COND_B, DL,
33272 DAG.getConstant(1, DL, VT));
33273 return DCI.CombineTo(N, Res1, CarryOut);
33279 /// fold (add Y, (sete X, 0)) -> adc 0, Y
33280 /// (add Y, (setne X, 0)) -> sbb -1, Y
33281 /// (sub (sete X, 0), Y) -> sbb 0, Y
33282 /// (sub (setne X, 0), Y) -> adc -1, Y
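///
/// The trick (roughly): rewriting (sete X, 0) as (cmp X, 1) sets the carry flag
/// exactly when X == 0, so an ADC/SBB against Y can consume the flag directly
/// instead of first materializing the setcc result in a register.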
33283 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
33286 // Look through ZExts.
33287 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
33288 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
33291 SDValue SetCC = Ext.getOperand(0);
33292 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
33295 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
33296 if (CC != X86::COND_E && CC != X86::COND_NE)
33299 SDValue Cmp = SetCC.getOperand(1);
33300 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
33301 !X86::isZeroNode(Cmp.getOperand(1)) ||
33302 !Cmp.getOperand(0).getValueType().isInteger())
33305 SDValue CmpOp0 = Cmp.getOperand(0);
33306 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
33307 DAG.getConstant(1, DL, CmpOp0.getValueType()));
33309 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
33310 if (CC == X86::COND_NE)
33311 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
33312 DL, OtherVal.getValueType(), OtherVal,
33313 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
33315 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
33316 DL, OtherVal.getValueType(), OtherVal,
33317 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
33320 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
33321 const X86Subtarget &Subtarget) {
33323 EVT VT = N->getValueType(0);
33324 SDValue Op0 = N->getOperand(0);
33325 SDValue Op1 = N->getOperand(1);
33327 // TODO: There's nothing special about i32, any integer type above i16 should
33328 // work just as well.
33329 if (!VT.isVector() || !VT.isSimple() ||
33330 !(VT.getVectorElementType() == MVT::i32))
33333 unsigned RegSize = 128;
33334 if (Subtarget.hasBWI())
33336 else if (Subtarget.hasAVX2())
33339 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
33340 // TODO: We should be able to handle larger vectors by splitting them before
33341 // feeding them into several SADs, and then reducing over those.
33342 if (VT.getSizeInBits() / 4 > RegSize)
33345 // We know N is a reduction add, which means one of its operands is a phi.
33346 // To match SAD, we need the other operand to be a vector select.
33347 SDValue SelectOp, Phi;
33348 if (Op0.getOpcode() == ISD::VSELECT) {
33351 } else if (Op1.getOpcode() == ISD::VSELECT) {
33357 // Check whether we have an abs-diff pattern feeding into the select.
33358 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
33361 // SAD pattern detected. Now build a SAD instruction and an addition for
33362 // reduction. Note that the number of elements of the result of SAD is less
33363 // than the number of elements of its input. Therefore, we can only update
33364 // part of the elements in the reduction vector.
33365 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
33367 // The output of PSADBW is a vector of i64.
33368 // We need to turn the vector of i64 into a vector of i32.
33369 // If the reduction vector is at least as wide as the psadbw result, just
33370 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
33371 // anyway.
33372 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
33373 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
33374 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
33376 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
33378 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
33379 // Update part of the elements of the reduction vector. This is done by first
33380 // extracting a sub-vector from it, updating this sub-vector, and inserting
33381 // it back.
33382 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
33383 DAG.getIntPtrConstant(0, DL));
33384 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
33385 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
33386 DAG.getIntPtrConstant(0, DL));
33388 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
33391 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
33392 const X86Subtarget &Subtarget) {
33393 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
33394 if (Flags->hasVectorReduction()) {
33395 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
33398 EVT VT = N->getValueType(0);
33399 SDValue Op0 = N->getOperand(0);
33400 SDValue Op1 = N->getOperand(1);
33402 // Try to synthesize horizontal adds from adds of shuffles.
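// For example (sketch), (add (shuffle X, Y, <0,2,4,...>), (shuffle X, Y, <1,3,5,...>))
// over v8i16 adds adjacent element pairs and can be emitted as a single PHADDW.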
33403 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33404 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33405 isHorizontalBinOp(Op0, Op1, true))
33406 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
33408 return OptimizeConditionalInDecrement(N, DAG);
33411 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
33412 const X86Subtarget &Subtarget) {
33413 SDValue Op0 = N->getOperand(0);
33414 SDValue Op1 = N->getOperand(1);
33416 // X86 can't encode an immediate LHS of a sub. See if we can push the
33417 // negation into a preceding instruction.
33418 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
33419 // If the RHS of the sub is an XOR with one use and a constant, invert the
33420 // immediate. Then add one to the LHS of the sub so we can turn
33421 // X-Y -> X+~Y+1, saving one register.
33422 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
33423 isa<ConstantSDNode>(Op1.getOperand(1))) {
33424 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
33425 EVT VT = Op0.getValueType();
33426 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
33428 DAG.getConstant(~XorC, SDLoc(Op1), VT));
33429 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
33430 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
33434 // Try to synthesize horizontal adds from adds of shuffles.
33435 EVT VT = N->getValueType(0);
33436 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33437 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33438 isHorizontalBinOp(Op0, Op1, true))
33439 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
33441 return OptimizeConditionalInDecrement(N, DAG);
33444 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
33445 TargetLowering::DAGCombinerInfo &DCI,
33446 const X86Subtarget &Subtarget) {
33448 unsigned Opcode = N->getOpcode();
33449 MVT VT = N->getSimpleValueType(0);
33450 MVT SVT = VT.getVectorElementType();
33451 SDValue Op = N->getOperand(0);
33452 MVT OpVT = Op.getSimpleValueType();
33453 MVT OpEltVT = OpVT.getVectorElementType();
33454 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
33456 // Perform any constant folding.
33457 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
33458 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
33459 unsigned NumDstElts = VT.getVectorNumElements();
33460 SmallBitVector Undefs(NumDstElts, false);
33461 SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
33462 for (unsigned i = 0; i != NumDstElts; ++i) {
33463 SDValue OpElt = Op.getOperand(i);
33464 if (OpElt.getOpcode() == ISD::UNDEF) {
33468 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
33469 Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
33470 : Cst.sextOrTrunc(SVT.getSizeInBits());
33472 return getConstVector(Vals, Undefs, VT, DAG, DL);
33475 // (vzext (bitcast (vzext x))) -> (vzext x)
33476 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
33477 SDValue V = peekThroughBitcasts(Op);
33478 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
33479 MVT InnerVT = V.getSimpleValueType();
33480 MVT InnerEltVT = InnerVT.getVectorElementType();
33482 // If the element sizes match exactly, we can just do one larger vzext. This
33483 // is always an exact type match as vzext operates on integer types.
33484 if (OpEltVT == InnerEltVT) {
33485 assert(OpVT == InnerVT && "Types must match for vzext!");
33486 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
33489 // The only other way we can combine them is if only a single element of the
33490 // inner vzext is used in the input to the outer vzext.
33491 if (InnerEltVT.getSizeInBits() < InputBits)
33494 // In this case, the inner vzext is completely dead because we're going to
33495 // only look at bits inside of the low element. Just do the outer vzext on
33496 // a bitcast of the input to the inner.
33497 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
33500 // Check if we can bypass extracting and re-inserting an element of an input
33501 // vector. Essentially:
33502 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
33503 // TODO: Add X86ISD::VSEXT support
33504 if (Opcode == X86ISD::VZEXT &&
33505 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33506 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33507 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
33508 SDValue ExtractedV = V.getOperand(0);
33509 SDValue OrigV = ExtractedV.getOperand(0);
33510 if (isNullConstant(ExtractedV.getOperand(1))) {
33511 MVT OrigVT = OrigV.getSimpleValueType();
33512 // Extract a subvector if necessary...
33513 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
33514 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
33515 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
33516 OrigVT.getVectorNumElements() / Ratio);
33517 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
33518 DAG.getIntPtrConstant(0, DL));
33520 Op = DAG.getBitcast(OpVT, OrigV);
33521 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
33528 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
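/// (Note: "lock sub $1" and "lock add $-1" update the memory operand
/// identically, so, presumably, later combines only need to recognize the LADD
/// form.)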
33529 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
33530 const X86Subtarget &Subtarget) {
33531 SDValue Chain = N->getOperand(0);
33532 SDValue LHS = N->getOperand(1);
33533 SDValue RHS = N->getOperand(2);
33534 MVT VT = RHS.getSimpleValueType();
33537 auto *C = dyn_cast<ConstantSDNode>(RHS);
33538 if (!C || C->getZExtValue() != 1)
33541 RHS = DAG.getConstant(-1, DL, VT);
33542 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33543 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
33544 DAG.getVTList(MVT::i32, MVT::Other),
33545 {Chain, LHS, RHS}, VT, MMO);
33548 // TEST (AND a, b), (AND a, b) -> TEST a, b
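// (TESTM computes the mask of (a & b), so when both operands are the same AND
// node the inner AND's operands can feed TESTM directly.)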
33549 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
33550 SDValue Op0 = N->getOperand(0);
33551 SDValue Op1 = N->getOperand(1);
33553 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
33556 EVT VT = N->getValueType(0);
33559 return DAG.getNode(X86ISD::TESTM, DL, VT,
33560 Op0->getOperand(0), Op0->getOperand(1));
33563 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
33564 const X86Subtarget &Subtarget) {
33565 MVT VT = N->getSimpleValueType(0);
33568 if (N->getOperand(0) == N->getOperand(1)) {
33569 if (N->getOpcode() == X86ISD::PCMPEQ)
33570 return getOnesVector(VT, Subtarget, DAG, DL);
33571 if (N->getOpcode() == X86ISD::PCMPGT)
33572 return getZeroVector(VT, Subtarget, DAG, DL);
33579 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
33580 DAGCombinerInfo &DCI) const {
33581 SelectionDAG &DAG = DCI.DAG;
33582 switch (N->getOpcode()) {
33584 case ISD::EXTRACT_VECTOR_ELT:
33585 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
33588 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
33589 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
33590 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
33591 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
33592 case ISD::SUB: return combineSub(N, DAG, Subtarget);
33593 case X86ISD::ADC: return combineADC(N, DAG, DCI);
33594 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
33597 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
33598 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
33599 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
33600 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
33601 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
33602 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
33603 case ISD::STORE: return combineStore(N, DAG, Subtarget);
33604 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
33605 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
33606 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
33608 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
33609 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
33610 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
33611 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
33612 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
33614 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
33616 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
33618 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
33619 case X86ISD::BT: return combineBT(N, DAG, DCI);
33620 case ISD::ANY_EXTEND:
33621 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
33622 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
33623 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
33624 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
33625 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
33626 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
33627 case X86ISD::VSHLI:
33628 case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
33629 case X86ISD::VSEXT:
33630 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
33631 case X86ISD::SHUFP: // Handle all target specific shuffles
33632 case X86ISD::INSERTPS:
33633 case X86ISD::PALIGNR:
33634 case X86ISD::VSHLDQ:
33635 case X86ISD::VSRLDQ:
33636 case X86ISD::BLENDI:
33637 case X86ISD::UNPCKH:
33638 case X86ISD::UNPCKL:
33639 case X86ISD::MOVHLPS:
33640 case X86ISD::MOVLHPS:
33641 case X86ISD::PSHUFB:
33642 case X86ISD::PSHUFD:
33643 case X86ISD::PSHUFHW:
33644 case X86ISD::PSHUFLW:
33645 case X86ISD::MOVSHDUP:
33646 case X86ISD::MOVSLDUP:
33647 case X86ISD::MOVDDUP:
33648 case X86ISD::MOVSS:
33649 case X86ISD::MOVSD:
33650 case X86ISD::VPPERM:
33651 case X86ISD::VPERMI:
33652 case X86ISD::VPERMV:
33653 case X86ISD::VPERMV3:
33654 case X86ISD::VPERMIV3:
33655 case X86ISD::VPERMIL2:
33656 case X86ISD::VPERMILPI:
33657 case X86ISD::VPERMILPV:
33658 case X86ISD::VPERM2X128:
33659 case X86ISD::VZEXT_MOVL:
33660 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
33661 case X86ISD::FMADD:
33662 case X86ISD::FMADD_RND:
33663 case X86ISD::FMADDS1_RND:
33664 case X86ISD::FMADDS3_RND:
33665 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
33667 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
33668 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
33669 case X86ISD::TESTM: return combineTestM(N, DAG);
33670 case X86ISD::PCMPEQ:
33671 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
33677 /// Return true if the target has native support for the specified value type
33678 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
33679 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
33680 /// some i16 instructions are slow.
33681 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
33682 if (!isTypeLegal(VT))
33684 if (VT != MVT::i16)
33691 case ISD::SIGN_EXTEND:
33692 case ISD::ZERO_EXTEND:
33693 case ISD::ANY_EXTEND:
33706 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
33707 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
33708 /// we don't adjust the stack we clobber the first frame index.
33709 /// See X86InstrInfo::copyPhysReg.
33710 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
33711 MachineFunction *MF) const {
33712 const MachineRegisterInfo &MRI = MF->getRegInfo();
33714 return any_of(MRI.reg_instructions(X86::EFLAGS),
33715 [](const MachineInstr &RI) { return RI.isCopy(); });
33718 /// This method queries the target whether it is beneficial for the DAG combiner to
33719 /// promote the specified node. If true, it should return the desired promotion
33720 /// type by reference.
33721 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
33722 EVT VT = Op.getValueType();
33723 if (VT != MVT::i16)
33726 bool Promote = false;
33727 bool Commute = false;
33728 switch (Op.getOpcode()) {
33730 case ISD::SIGN_EXTEND:
33731 case ISD::ZERO_EXTEND:
33732 case ISD::ANY_EXTEND:
33737 SDValue N0 = Op.getOperand(0);
33738 // Look out for (store (shl (load), x)).
33739 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
33752 SDValue N0 = Op.getOperand(0);
33753 SDValue N1 = Op.getOperand(1);
33754 if (!Commute && MayFoldLoad(N1))
33756 // Avoid disabling potential load folding opportunities.
33757 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
33759 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
33769 //===----------------------------------------------------------------------===//
33770 // X86 Inline Assembly Support
33771 //===----------------------------------------------------------------------===//
33773 // Helper to match a string separated by whitespace.
33774 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
33775 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
33777 for (StringRef Piece : Pieces) {
33778 if (!S.startswith(Piece)) // Check if the piece matches.
33781 S = S.substr(Piece.size());
33782 StringRef::size_type Pos = S.find_first_not_of(" \t");
33783 if (Pos == 0) // We matched a prefix.
33792 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
33794 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
33795 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
33796 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
33797 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
33799 if (AsmPieces.size() == 3)
33801 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
33808 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
33809 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
33811 const std::string &AsmStr = IA->getAsmString();
33813 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
33814 if (!Ty || Ty->getBitWidth() % 16 != 0)
33817 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
33818 SmallVector<StringRef, 4> AsmPieces;
33819 SplitString(AsmStr, AsmPieces, ";\n");
33821 switch (AsmPieces.size()) {
33822 default: return false;
33824 // FIXME: this should verify that we are targeting a 486 or better. If not,
33825 // we will turn this bswap into something that will be lowered to logical
33826 // ops instead of emitting the bswap asm. For now, we don't support 486 or
33827 // lower so don't worry about this.
33829 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
33830 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
33831 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
33832 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
33833 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
33834 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
33835 // No need to check constraints, nothing other than the equivalent of
33836 // "=r,0" would be valid here.
33837 return IntrinsicLowering::LowerToByteSwap(CI);
33840 // rorw $$8, ${0:w} --> llvm.bswap.i16
33841 if (CI->getType()->isIntegerTy(16) &&
33842 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
33843 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
33844 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
33846 StringRef ConstraintsStr = IA->getConstraintString();
33847 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
33848 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
33849 if (clobbersFlagRegisters(AsmPieces))
33850 return IntrinsicLowering::LowerToByteSwap(CI);
33854 if (CI->getType()->isIntegerTy(32) &&
33855 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
33856 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
33857 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
33858 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
33860 StringRef ConstraintsStr = IA->getConstraintString();
33861 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
33862 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
33863 if (clobbersFlagRegisters(AsmPieces))
33864 return IntrinsicLowering::LowerToByteSwap(CI);
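// Illustrative examples (assumed user code, not from this file) of the
// rotate-based byte-swap idioms matched above:
//   asm("rorw $8, %w0" : "=r"(x16) : "0"(x16) : "cc");                // i16
//   asm("rorw $8, %w0; rorl $16, %0; rorw $8, %w0"
//       : "=r"(x32) : "0"(x32) : "cc");                               // i32
// Both become llvm.bswap.i16 / llvm.bswap.i32. The clobbersFlagRegisters()
// check ensures that, beyond "=r,0", the constraint string contains nothing
// but the usual flag-register clobbers, i.e. the asm declares no extra
// operands or clobbers that the transformation would silently drop.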
33867 if (CI->getType()->isIntegerTy(64)) {
33868 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
33869 if (Constraints.size() >= 2 &&
33870 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
33871 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
33872 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
33873 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
33874 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
33875 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
33876 return IntrinsicLowering::LowerToByteSwap(CI);
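// Illustrative example (assumed user code, not from this file): the classic
// i386 idiom for a 64-bit byte swap,
//   asm("bswap %%eax\n\tbswap %%edx\n\txchgl %%eax, %%edx"
//       : "=A"(v) : "0"(v));
// The "A" constraint pins the value to the EDX:EAX pair, which is why the
// hard-coded %eax/%edx can be matched literally; the whole statement is then
// replaced with llvm.bswap.i64.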
33884 /// Given a constraint letter, return the type of constraint for this target.
33885 X86TargetLowering::ConstraintType
33886 X86TargetLowering::getConstraintType(StringRef Constraint) const {
33887 if (Constraint.size() == 1) {
33888 switch (Constraint[0]) {
33900 return C_RegisterClass;
33901 case 'k': // AVX512 masking registers.
33925 else if (Constraint.size() == 2) {
33926 switch (Constraint[0]) {
33930 switch (Constraint[1]) {
33938 return TargetLowering::getConstraintType(Constraint);
33941 /// Examine constraint type and operand type and determine a weight value.
33942 /// This object must already have been set up with the operand type
33943 /// and the current alternative constraint selected.
33944 TargetLowering::ConstraintWeight
33945 X86TargetLowering::getSingleConstraintMatchWeight(
33946 AsmOperandInfo &info, const char *constraint) const {
33947 ConstraintWeight weight = CW_Invalid;
33948 Value *CallOperandVal = info.CallOperandVal;
33949 // If we don't have a value, we can't do a match,
33950 // but allow it at the lowest weight.
33951 if (!CallOperandVal)
33953 Type *type = CallOperandVal->getType();
33954 // Look at the constraint type.
33955 switch (*constraint) {
33957 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
33968 if (CallOperandVal->getType()->isIntegerTy())
33969 weight = CW_SpecificReg;
33974 if (type->isFloatingPointTy())
33975 weight = CW_SpecificReg;
33978 if (type->isX86_MMXTy() && Subtarget.hasMMX())
33979 weight = CW_SpecificReg;
33982 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
33983 if (constraint[1] == 'k') {
33984 // Support for 'Yk' (similar to the 'k' variant below).
33985 weight = CW_SpecificReg;
33988 // Else fall through (handle "Y" constraint).
33991 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
33992 weight = CW_Register;
33995 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
33996 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
33997 weight = CW_Register;
34000 // Enable conditional vector operations using %k<#> registers.
34001 weight = CW_SpecificReg;
34004 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
34005 if (C->getZExtValue() <= 31)
34006 weight = CW_Constant;
34010 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34011 if (C->getZExtValue() <= 63)
34012 weight = CW_Constant;
34016 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34017 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
34018 weight = CW_Constant;
34022 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34023 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
34024 weight = CW_Constant;
34028 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34029 if (C->getZExtValue() <= 3)
34030 weight = CW_Constant;
34034 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34035 if (C->getZExtValue() <= 0xff)
34036 weight = CW_Constant;
34041 if (isa<ConstantFP>(CallOperandVal)) {
34042 weight = CW_Constant;
34046 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34047 if ((C->getSExtValue() >= -0x80000000LL) &&
34048 (C->getSExtValue() <= 0x7fffffffLL))
34049 weight = CW_Constant;
34053 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34054 if (C->getZExtValue() <= 0xffffffff)
34055 weight = CW_Constant;
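// For orientation, a summary of the immediate checks above (the constraint
// letters given here are the standard GCC x86 ones and are stated as an
// assumption, since only the value ranges are visible in the checks): 'I'
// accepts 0..31 (shift counts), 'J' 0..63, 'K' a signed 8-bit value, 'L'
// 0xff or 0xffff, 'M' 0..3, 'N' 0..255 (I/O ports), 'G' a floating-point
// constant, 'e' a signed 32-bit value and 'Z' an unsigned 32-bit value; a
// constant in range is weighted CW_Constant.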
34062 /// Try to replace an X constraint, which matches anything, with another that
34063 /// has more specific requirements based on the type of the corresponding
34065 const char *X86TargetLowering::
34066 LowerXConstraint(EVT ConstraintVT) const {
34067 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
34068 // 'f' like normal targets.
34069 if (ConstraintVT.isFloatingPoint()) {
34070 if (Subtarget.hasSSE2())
34072 if (Subtarget.hasSSE1())
34076 return TargetLowering::LowerXConstraint(ConstraintVT);
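// Illustrative note (example assumed, not from this file): for an operand
// written as
//   asm("..." : "=X"(dbl));
// the wide-open "X" constraint on a floating-point value is narrowed to an
// SSE register constraint when SSE is available, and otherwise falls back to
// the x87 'f' constraint, per the comment above; non-FP types keep the
// generic TargetLowering handling.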
34079 /// Lower the specified operand into the Ops vector.
34080 /// If it is invalid, don't add anything to Ops.
34081 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
34082 std::string &Constraint,
34083 std::vector<SDValue>&Ops,
34084 SelectionDAG &DAG) const {
34087 // Only support length 1 constraints for now.
34088 if (Constraint.length() > 1) return;
34090 char ConstraintLetter = Constraint[0];
34091 switch (ConstraintLetter) {
34094 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34095 if (C->getZExtValue() <= 31) {
34096 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34097 Op.getValueType());
34103 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34104 if (C->getZExtValue() <= 63) {
34105 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34106 Op.getValueType());
34112 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34113 if (isInt<8>(C->getSExtValue())) {
34114 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34115 Op.getValueType());
34121 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34122 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
34123 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
34124 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
34125 Op.getValueType());
34131 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34132 if (C->getZExtValue() <= 3) {
34133 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34134 Op.getValueType());
34140 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34141 if (C->getZExtValue() <= 255) {
34142 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34143 Op.getValueType());
34149 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34150 if (C->getZExtValue() <= 127) {
34151 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34152 Op.getValueType());
34158 // 32-bit signed value
34159 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34160 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34161 C->getSExtValue())) {
34162 // Widen to 64 bits here to get it sign extended.
34163 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
34166 // FIXME gcc accepts some relocatable values here too, but only in certain
34167 // memory models; it's complicated.
34172 // 32-bit unsigned value
34173 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34174 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34175 C->getZExtValue())) {
34176 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34177 Op.getValueType());
34181 // FIXME gcc accepts some relocatable values here too, but only in certain
34182 // memory models; it's complicated.
34186 // Literal immediates are always ok.
34187 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
34188 // Widen to 64 bits here to get it sign extended.
34189 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
34193 // In any sort of PIC mode addresses need to be computed at runtime by
34194 // adding in a register or some sort of table lookup. These can't
34195 // be used as immediates.
34196 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
34199 // If we are in non-pic codegen mode, we allow the address of a global (with
34200 // an optional displacement) to be used with 'i'.
34201 GlobalAddressSDNode *GA = nullptr;
34202 int64_t Offset = 0;
34204 // Match either (GA), (GA+C), (GA+C1+C2), etc.
34206 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
34207 Offset += GA->getOffset();
34209 } else if (Op.getOpcode() == ISD::ADD) {
34210 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34211 Offset += C->getZExtValue();
34212 Op = Op.getOperand(0);
34215 } else if (Op.getOpcode() == ISD::SUB) {
34216 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34217 Offset += -C->getZExtValue();
34218 Op = Op.getOperand(0);
34223 // Otherwise, this isn't something we can handle, reject it.
34227 const GlobalValue *GV = GA->getGlobal();
34228 // If we require an extra load to get this address, as in PIC mode, we
34229 // can't accept it.
34230 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
34233 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
34234 GA->getValueType(0), Offset);
34239 if (Result.getNode()) {
34240 Ops.push_back(Result);
34243 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
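// Illustrative example (assumed user code, not from this file): an 'i'
// (immediate) operand carrying the address of a global,
//   asm("movl %1, %0" : "=r"(r) : "i"(&some_global));
// is accepted by the code above in non-PIC code: the GlobalAddress (plus any
// constant displacement folded in by the matching loop) becomes a
// TargetGlobalAddress immediate. Under PIC, where the symbol would need an
// extra load to reach, the operand is rejected and handling falls through to
// the generic TargetLowering path.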
34246 /// Check if \p RC is a general purpose register class.
34247 /// I.e., GR* or one of their variant.
34248 static bool isGRClass(const TargetRegisterClass &RC) {
34249 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
34250 RC.hasSuperClassEq(&X86::GR16RegClass) ||
34251 RC.hasSuperClassEq(&X86::GR32RegClass) ||
34252 RC.hasSuperClassEq(&X86::GR64RegClass) ||
34253 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
34256 /// Check if \p RC is a vector register class.
34257 /// I.e., FR* / VR* or one of their variant.
34258 static bool isFRClass(const TargetRegisterClass &RC) {
34259 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
34260 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
34261 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
34262 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
34263 RC.hasSuperClassEq(&X86::VR512RegClass);
34266 std::pair<unsigned, const TargetRegisterClass *>
34267 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
34268 StringRef Constraint,
34270 // First, see if this is a constraint that directly corresponds to an LLVM
34272 if (Constraint.size() == 1) {
34273 // GCC Constraint Letters
34274 switch (Constraint[0]) {
34276 // TODO: Slight differences here in allocation order and leaving
34277 // RIP in the class. Do they matter any more here than they do
34278 // in the normal allocation?
34280 if (Subtarget.hasAVX512()) {
34281 // Only supported in AVX512 or later.
34282 switch (VT.SimpleTy) {
34285 return std::make_pair(0U, &X86::VK32RegClass);
34287 return std::make_pair(0U, &X86::VK16RegClass);
34289 return std::make_pair(0U, &X86::VK8RegClass);
34291 return std::make_pair(0U, &X86::VK1RegClass);
34293 return std::make_pair(0U, &X86::VK64RegClass);
34297 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
34298 if (Subtarget.is64Bit()) {
34299 if (VT == MVT::i32 || VT == MVT::f32)
34300 return std::make_pair(0U, &X86::GR32RegClass);
34301 if (VT == MVT::i16)
34302 return std::make_pair(0U, &X86::GR16RegClass);
34303 if (VT == MVT::i8 || VT == MVT::i1)
34304 return std::make_pair(0U, &X86::GR8RegClass);
34305 if (VT == MVT::i64 || VT == MVT::f64)
34306 return std::make_pair(0U, &X86::GR64RegClass);
34309 // 32-bit fallthrough
34310 case 'Q': // Q_REGS
34311 if (VT == MVT::i32 || VT == MVT::f32)
34312 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
34313 if (VT == MVT::i16)
34314 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
34315 if (VT == MVT::i8 || VT == MVT::i1)
34316 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
34317 if (VT == MVT::i64)
34318 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
34320 case 'r': // GENERAL_REGS
34321 case 'l': // INDEX_REGS
34322 if (VT == MVT::i8 || VT == MVT::i1)
34323 return std::make_pair(0U, &X86::GR8RegClass);
34324 if (VT == MVT::i16)
34325 return std::make_pair(0U, &X86::GR16RegClass);
34326 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
34327 return std::make_pair(0U, &X86::GR32RegClass);
34328 return std::make_pair(0U, &X86::GR64RegClass);
34329 case 'R': // LEGACY_REGS
34330 if (VT == MVT::i8 || VT == MVT::i1)
34331 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
34332 if (VT == MVT::i16)
34333 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
34334 if (VT == MVT::i32 || !Subtarget.is64Bit())
34335 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
34336 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
34337 case 'f': // FP Stack registers.
34338 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
34339 // value to the correct fpstack register class.
34340 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
34341 return std::make_pair(0U, &X86::RFP32RegClass);
34342 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
34343 return std::make_pair(0U, &X86::RFP64RegClass);
34344 return std::make_pair(0U, &X86::RFP80RegClass);
34345 case 'y': // MMX_REGS if MMX allowed.
34346 if (!Subtarget.hasMMX()) break;
34347 return std::make_pair(0U, &X86::VR64RegClass);
34348 case 'Y': // SSE_REGS if SSE2 allowed
34349 if (!Subtarget.hasSSE2()) break;
34352 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
34353 if (!Subtarget.hasSSE1()) break;
34354 bool VConstraint = (Constraint[0] == 'v');
34356 switch (VT.SimpleTy) {
34358 // Scalar SSE types.
34361 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
34362 return std::make_pair(0U, &X86::FR32XRegClass);
34363 return std::make_pair(0U, &X86::FR32RegClass);
34366 if (VConstraint && Subtarget.hasVLX())
34367 return std::make_pair(0U, &X86::FR64XRegClass);
34368 return std::make_pair(0U, &X86::FR64RegClass);
34369 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34377 if (VConstraint && Subtarget.hasVLX())
34378 return std::make_pair(0U, &X86::VR128XRegClass);
34379 return std::make_pair(0U, &X86::VR128RegClass);
34387 if (VConstraint && Subtarget.hasVLX())
34388 return std::make_pair(0U, &X86::VR256XRegClass);
34389 return std::make_pair(0U, &X86::VR256RegClass);
34394 return std::make_pair(0U, &X86::VR512RegClass);
34398 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
34399 switch (Constraint[1]) {
34403 // This register class doesn't allocate k0 for masked vector operations.
34404 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
34405 switch (VT.SimpleTy) {
34408 return std::make_pair(0U, &X86::VK32WMRegClass);
34410 return std::make_pair(0U, &X86::VK16WMRegClass);
34412 return std::make_pair(0U, &X86::VK8WMRegClass);
34414 return std::make_pair(0U, &X86::VK1WMRegClass);
34416 return std::make_pair(0U, &X86::VK64WMRegClass);
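// Illustrative note (example assumed, not from this file): a "Yk" operand,
// e.g. a <16 x i1> mask handed to inline asm, is allocated from VK16WM
// rather than VK16, so the register allocator can never pick k0 for it;
// k0 in an EVEX write-mask position means "no masking", so it cannot carry a
// real mask value.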
34423 // Use the default implementation in TargetLowering to convert the register
34424 // constraint into a member of a register class.
34425 std::pair<unsigned, const TargetRegisterClass*> Res;
34426 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
34428 // Not found as a standard register?
34430 // Map "st(0)" through "st(7)" to the corresponding FP registers (FP0..FP7).
34431 if (Constraint.size() == 7 && Constraint[0] == '{' &&
34432 tolower(Constraint[1]) == 's' &&
34433 tolower(Constraint[2]) == 't' &&
34434 Constraint[3] == '(' &&
34435 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
34436 Constraint[5] == ')' &&
34437 Constraint[6] == '}') {
34439 Res.first = X86::FP0+Constraint[4]-'0';
34440 Res.second = &X86::RFP80RegClass;
34444 // GCC allows "st(0)" to be called just plain "st".
34445 if (StringRef("{st}").equals_lower(Constraint)) {
34446 Res.first = X86::FP0;
34447 Res.second = &X86::RFP80RegClass;
34452 if (StringRef("{flags}").equals_lower(Constraint)) {
34453 Res.first = X86::EFLAGS;
34454 Res.second = &X86::CCRRegClass;
34458 // 'A' means EAX + EDX.
34459 if (Constraint == "A") {
34460 Res.first = X86::EAX;
34461 Res.second = &X86::GR32_ADRegClass;
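// Illustrative example (assumed user code): the classic 32-bit TSC read
//   unsigned long long tsc;
//   asm volatile("rdtsc" : "=A"(tsc));
// relies on this mapping: "A" resolves to EAX in the GR32_AD class, and the
// i64 result is then modeled as the EDX:EAX pair.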
34467 // Otherwise, check to see if this is a register class of the wrong value
34468 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
34469 // turn into {ax},{dx}.
34470 // MVT::Other is used to specify clobber names.
34471 if (Res.second->hasType(VT) || VT == MVT::Other)
34472 return Res; // Correct type already, nothing to do.
34474 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
34475 // return "eax". This should even work for things like getting 64-bit integer
34476 // registers when given an f64 type.
34477 const TargetRegisterClass *Class = Res.second;
34478 // The generic code will match the first register class that contains the
34479 // given register. Thus, based on the ordering of the tablegened file,
34480 // the "plain" GR classes might not come first.
34481 // Therefore, use a helper method.
34482 if (isGRClass(*Class)) {
34483 unsigned Size = VT.getSizeInBits();
34484 if (Size == 1) Size = 8;
34485 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
34487 Res.first = DestReg;
34488 Res.second = Size == 8 ? &X86::GR8RegClass
34489 : Size == 16 ? &X86::GR16RegClass
34490 : Size == 32 ? &X86::GR32RegClass
34491 : &X86::GR64RegClass;
34492 assert(Res.second->contains(Res.first) && "Register in register class");
34494 // No register found/type mismatch.
34496 Res.second = nullptr;
34498 } else if (isFRClass(*Class)) {
34499 // Handle references to XMM physical registers that got mapped into the
34500 // wrong class. This can happen with constraints like {xmm0} where the
34501 // target independent register mapper will just pick the first match it can
34502 // find, ignoring the required type.
34504 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34505 if (VT == MVT::f32 || VT == MVT::i32)
34506 Res.second = &X86::FR32RegClass;
34507 else if (VT == MVT::f64 || VT == MVT::i64)
34508 Res.second = &X86::FR64RegClass;
34509 else if (X86::VR128RegClass.hasType(VT))
34510 Res.second = &X86::VR128RegClass;
34511 else if (X86::VR256RegClass.hasType(VT))
34512 Res.second = &X86::VR256RegClass;
34513 else if (X86::VR512RegClass.hasType(VT))
34514 Res.second = &X86::VR512RegClass;
34516 // Type mismatch and not a clobber: return an error.
34518 Res.second = nullptr;
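// Illustrative walk-through of the comment above: an operand with constraint
// "{ax}" and value type i32 is first matched to the 16-bit AX register; the
// GR path then re-maps it to EAX in GR32 so the operand's size and register
// agree. The FR/VR path does the analogous register-class fix-up for XMM
// references whose value type calls for FR32/FR64/VR128/VR256/VR512.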
34525 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
34526 const AddrMode &AM, Type *Ty,
34527 unsigned AS) const {
34528 // Scaling factors are not free at all.
34529 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
34530 // will take 2 allocations in the out of order engine instead of 1
34531 // for plain addressing mode, i.e. inst (reg1).
34533 // vaddps (%rsi,%rdx), %ymm0, %ymm1
34534 // Requires two allocations (one for the load, one for the computation)
34536 // vaddps (%rsi), %ymm0, %ymm1
34537 // Requires just 1 allocation, i.e., freeing allocations for other operations
34538 // and having less micro operations to execute.
34540 // For some X86 architectures, this is even worse because for instance for
34541 // stores, the complex addressing mode forces the instruction to use the
34542 // "load" ports instead of the dedicated "store" port.
34543 // E.g., on Haswell:
34544 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
34545 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
34546 if (isLegalAddressingMode(DL, AM, Ty, AS))
34547 // Scale represents reg2 * scale, thus account for 1
34548 // as soon as we use a second register.
34549 return AM.Scale != 0;
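// Summary of the cost computed above: a plain base address such as (%rsi)
// costs 0 extra, while any mode that also uses an index register, e.g.
// (%rsi,%rdx,4), has AM.Scale != 0 and costs 1, matching the extra
// allocation / port pressure described in the comment.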
34553 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
34554 // Integer division on x86 is expensive. However, when aggressively optimizing
34555 // for code size, we prefer to use a div instruction, as it is usually smaller
34556 // than the alternative sequence.
34557 // The exception to this is vector division. Since x86 doesn't have vector
34558 // integer division, leaving the division as-is is a loss even in terms of
34559 // size, because it will have to be scalarized, while the alternative code
34560 // sequence can be performed in vector form.
34561 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
34562 Attribute::MinSize);
34563 return OptSize && !VT.isVector();
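// Illustrative example (assumed source, not from this file): under minsize,
//   unsigned f(unsigned x) { return x / 10; }
// keeps the single DIV instruction because it is smaller than the
// multiply-by-magic-constant expansion; a vector divide by a constant is
// still expanded even at minsize, since x86 has no vector integer divide and
// scalarizing it would be larger.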
34566 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
34567 if (!Subtarget.is64Bit())
34570 // Update IsSplitCSR in X86MachineFunctionInfo.
34571 X86MachineFunctionInfo *AFI =
34572 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
34573 AFI->setIsSplitCSR(true);
34576 void X86TargetLowering::insertCopiesSplitCSR(
34577 MachineBasicBlock *Entry,
34578 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
34579 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34580 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
34584 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34585 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
34586 MachineBasicBlock::iterator MBBI = Entry->begin();
34587 for (const MCPhysReg *I = IStart; *I; ++I) {
34588 const TargetRegisterClass *RC = nullptr;
34589 if (X86::GR64RegClass.contains(*I))
34590 RC = &X86::GR64RegClass;
34592 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
34594 unsigned NewVR = MRI->createVirtualRegister(RC);
34595 // Create copy from CSR to a virtual register.
34596 // FIXME: this currently does not emit CFI pseudo-instructions; it works
34597 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
34598 // nounwind. If we want to generalize this later, we may need to emit
34599 // CFI pseudo-instructions.
34600 assert(Entry->getParent()->getFunction()->hasFnAttribute(
34601 Attribute::NoUnwind) &&
34602 "Function should be nounwind in insertCopiesSplitCSR!");
34603 Entry->addLiveIn(*I);
34604 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
34607 // Insert the copy-back instructions right before the terminator.
34608 for (auto *Exit : Exits)
34609 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
34610 TII->get(TargetOpcode::COPY), *I)
34615 bool X86TargetLowering::supportSwiftError() const {
34616 return Subtarget.is64Bit();