1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
13 //===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/WinEHFuncInfo.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/GlobalAlias.h"
45 #include "llvm/IR/GlobalVariable.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/Intrinsics.h"
48 #include "llvm/MC/MCAsmInfo.h"
49 #include "llvm/MC/MCContext.h"
50 #include "llvm/MC/MCExpr.h"
51 #include "llvm/MC/MCSymbol.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/MathExtras.h"
56 #include "llvm/Target/TargetOptions.h"
63 #define DEBUG_TYPE "x86-isel"
65 STATISTIC(NumTailCalls, "Number of tail calls");
67 static cl::opt<bool> ExperimentalVectorWideningLegalization(
68 "x86-experimental-vector-widening-legalization", cl::init(false),
69 cl::desc("Enable an experimental vector type legalization through widening "
70 "rather than promotion."),
73 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
74 const X86Subtarget &STI)
75 : TargetLowering(TM), Subtarget(STI) {
76 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
77 X86ScalarSSEf64 = Subtarget.hasSSE2();
78 X86ScalarSSEf32 = Subtarget.hasSSE1();
79 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
81 // Set up the TargetLowering object.
83 // X86 is weird. It always uses i8 for shift amounts and setcc results.
84 setBooleanContents(ZeroOrOneBooleanContent);
85 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
86 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
88 // For 64-bit, since we have so many registers, use the ILP scheduler.
89 // For 32-bit, use the register pressure specific scheduling.
90 // For Atom, always use ILP scheduling.
91 if (Subtarget.isAtom())
92 setSchedulingPreference(Sched::ILP);
93 else if (Subtarget.is64Bit())
94 setSchedulingPreference(Sched::ILP);
96 setSchedulingPreference(Sched::RegPressure);
97 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
98 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
100 // Bypass expensive divides on Atom when compiling with O2.
101 if (TM.getOptLevel() >= CodeGenOpt::Default) {
102 if (Subtarget.hasSlowDivide32())
103 addBypassSlowDiv(32, 8);
104 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
105 addBypassSlowDiv(64, 16);
108 if (Subtarget.isTargetKnownWindowsMSVC() ||
109 Subtarget.isTargetWindowsItanium()) {
110 // Setup Windows compiler runtime calls.
111 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
112 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
113 setLibcallName(RTLIB::SREM_I64, "_allrem");
114 setLibcallName(RTLIB::UREM_I64, "_aullrem");
115 setLibcallName(RTLIB::MUL_I64, "_allmul");
116 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
117 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
118 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
119 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
120 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
123 if (Subtarget.isTargetDarwin()) {
124 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
125 setUseUnderscoreSetJmp(false);
126 setUseUnderscoreLongJmp(false);
127 } else if (Subtarget.isTargetWindowsGNU()) {
128 // MS runtime is weird: it exports _setjmp, but longjmp!
129 setUseUnderscoreSetJmp(true);
130 setUseUnderscoreLongJmp(false);
132 setUseUnderscoreSetJmp(true);
133 setUseUnderscoreLongJmp(true);
136 // Set up the register classes.
137 addRegisterClass(MVT::i8, &X86::GR8RegClass);
138 addRegisterClass(MVT::i16, &X86::GR16RegClass);
139 addRegisterClass(MVT::i32, &X86::GR32RegClass);
140 if (Subtarget.is64Bit())
141 addRegisterClass(MVT::i64, &X86::GR64RegClass);
143 for (MVT VT : MVT::integer_valuetypes())
144 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
146 // We don't accept any truncstore of integer registers.
147 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
148 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
149 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
150 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
151 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
152 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
154 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
156 // SETOEQ and SETUNE require checking two conditions.
157 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
158 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
159 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
160 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
161 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
162 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
164 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
165 // operation.
166 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
167 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
168 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
170 if (Subtarget.is64Bit()) {
171 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
172 // f32/f64 are legal, f80 is custom.
173 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
175 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
176 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
177 } else if (!Subtarget.useSoftFloat()) {
178 // We have an algorithm for SSE2->double, and we turn this into a
179 // 64-bit FILD followed by conditional FADD for other targets.
180 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
181 // We have an algorithm for SSE2, and we turn this into a 64-bit
182 // FILD or VCVTUSI2SS/SD for other targets.
183 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
186 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
187 // this operation.
188 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
189 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
191 if (!Subtarget.useSoftFloat()) {
192 // SSE has no i16 to fp conversion, only i32.
193 if (X86ScalarSSEf32) {
194 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
195 // f32 and f64 cases are Legal, f80 case is not
196 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
198 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
199 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
202 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
203 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
206 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
207 // this operation.
208 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
209 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
211 if (!Subtarget.useSoftFloat()) {
212 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
213 // are Legal, f80 is custom lowered.
214 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
215 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
217 if (X86ScalarSSEf32) {
218 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
219 // f32 and f64 cases are Legal, f80 case is not
220 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
222 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
223 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
226 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
227 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
228 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
231 // Handle FP_TO_UINT by promoting the destination to a larger signed
232 // conversion.
233 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
234 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
235 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
237 if (Subtarget.is64Bit()) {
238 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
239 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
240 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
241 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
243 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
244 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
246 } else if (!Subtarget.useSoftFloat()) {
247 // Since AVX is a superset of SSE3, only check for SSE here.
248 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
249 // Expand FP_TO_UINT into a select.
250 // FIXME: We would like to use a Custom expander here eventually to do
251 // the optimal thing for SSE vs. the default expansion in the legalizer.
252 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
254 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
255 // With SSE3 we can use fisttpll to convert to a signed i64; without
256 // SSE, we're stuck with a fistpll.
257 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
259 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
262 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
263 if (!X86ScalarSSEf64) {
264 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
265 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
266 if (Subtarget.is64Bit()) {
267 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
268 // Without SSE, i64->f64 goes through memory.
269 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
271 } else if (!Subtarget.is64Bit())
272 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
274 // Scalar integer divide and remainder are lowered to use operations that
275 // produce two results, to match the available instructions. This exposes
276 // the two-result form to trivial CSE, which is able to combine x/y and x%y
277 // into a single instruction.
279 // Scalar integer multiply-high is also lowered to use two-result
280 // operations, to match the available instructions. However, plain multiply
281 // (low) operations are left as Legal, as there are single-result
282 // instructions for this in x86. Using the two-result multiply instructions
283 // when both high and low results are needed must be arranged by dagcombine.
284 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
285 setOperationAction(ISD::MULHS, VT, Expand);
286 setOperationAction(ISD::MULHU, VT, Expand);
287 setOperationAction(ISD::SDIV, VT, Expand);
288 setOperationAction(ISD::UDIV, VT, Expand);
289 setOperationAction(ISD::SREM, VT, Expand);
290 setOperationAction(ISD::UREM, VT, Expand);
293 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
294 if (VT == MVT::i64 && !Subtarget.is64Bit())
296 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
297 setOperationAction(ISD::ADDC, VT, Custom);
298 setOperationAction(ISD::ADDE, VT, Custom);
299 setOperationAction(ISD::SUBC, VT, Custom);
300 setOperationAction(ISD::SUBE, VT, Custom);
303 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
304 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
305 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
306 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
307 setOperationAction(ISD::BR_CC, VT, Expand);
308 setOperationAction(ISD::SELECT_CC, VT, Expand);
310 if (Subtarget.is64Bit())
311 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
312 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
313 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
314 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
315 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
317 setOperationAction(ISD::FREM , MVT::f32 , Expand);
318 setOperationAction(ISD::FREM , MVT::f64 , Expand);
319 setOperationAction(ISD::FREM , MVT::f80 , Expand);
320 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
322 // Promote the i8 variants and force them on up to i32 which has a shorter
323 // encoding.
324 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
325 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
326 if (!Subtarget.hasBMI()) {
327 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
328 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
329 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
330 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
331 if (Subtarget.is64Bit()) {
332 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
333 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
337 if (Subtarget.hasLZCNT()) {
338 // When promoting the i8 variants, force them to i32 for a shorter
339 // encoding.
340 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
341 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
343 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
344 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
345 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
346 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
347 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
348 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
349 if (Subtarget.is64Bit()) {
350 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
351 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
355 // Special handling for half-precision floating point conversions.
356 // If we don't have F16C support, then lower half float conversions
357 // into library calls.
358 if (Subtarget.useSoftFloat() ||
359 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
360 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
361 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
364 // There's never any support for operations beyond MVT::f32.
365 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
366 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
367 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
368 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
370 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
371 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
372 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
373 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
374 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
375 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
377 if (Subtarget.hasPOPCNT()) {
378 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
380 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
381 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
382 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
383 if (Subtarget.is64Bit())
384 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
387 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
389 if (!Subtarget.hasMOVBE())
390 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
392 // These should be promoted to a larger select which is supported.
393 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
394 // X86 wants to expand cmov itself.
395 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
396 setOperationAction(ISD::SELECT, VT, Custom);
397 setOperationAction(ISD::SETCC, VT, Custom);
399 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
400 if (VT == MVT::i64 && !Subtarget.is64Bit())
402 setOperationAction(ISD::SELECT, VT, Custom);
403 setOperationAction(ISD::SETCC, VT, Custom);
404 setOperationAction(ISD::SETCCE, VT, Custom);
406 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
407 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
408 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
409 // support continuation, user-level threading, etc. As a result, no
410 // other SjLj exception interfaces are implemented and please don't build
411 // your own exception handling based on them.
412 // LLVM/Clang supports zero-cost DWARF exception handling.
413 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
414 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
415 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
416 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
417 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
420 for (auto VT : { MVT::i32, MVT::i64 }) {
421 if (VT == MVT::i64 && !Subtarget.is64Bit())
423 setOperationAction(ISD::ConstantPool , VT, Custom);
424 setOperationAction(ISD::JumpTable , VT, Custom);
425 setOperationAction(ISD::GlobalAddress , VT, Custom);
426 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
427 setOperationAction(ISD::ExternalSymbol , VT, Custom);
428 setOperationAction(ISD::BlockAddress , VT, Custom);
430 // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
431 for (auto VT : { MVT::i32, MVT::i64 }) {
432 if (VT == MVT::i64 && !Subtarget.is64Bit())
434 setOperationAction(ISD::SHL_PARTS, VT, Custom);
435 setOperationAction(ISD::SRA_PARTS, VT, Custom);
436 setOperationAction(ISD::SRL_PARTS, VT, Custom);
439 if (Subtarget.hasSSE1())
440 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
442 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
444 // Expand certain atomics
445 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
446 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
447 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
448 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
449 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
450 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
451 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
452 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
455 if (Subtarget.hasCmpxchg16b()) {
456 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
459 // FIXME - use subtarget debug flags
460 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
461 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
462 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
463 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
466 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
467 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
469 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
470 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
472 setOperationAction(ISD::TRAP, MVT::Other, Legal);
473 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
475 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
476 setOperationAction(ISD::VASTART , MVT::Other, Custom);
477 setOperationAction(ISD::VAEND , MVT::Other, Expand);
478 bool Is64Bit = Subtarget.is64Bit();
479 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
480 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
482 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
483 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
485 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
487 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
488 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
489 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
491 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
492 // f32 and f64 use SSE.
493 // Set up the FP register classes.
494 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
495 : &X86::FR32RegClass);
496 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
497 : &X86::FR64RegClass);
499 for (auto VT : { MVT::f32, MVT::f64 }) {
500 // Use ANDPD to simulate FABS.
501 setOperationAction(ISD::FABS, VT, Custom);
503 // Use XORP to simulate FNEG.
504 setOperationAction(ISD::FNEG, VT, Custom);
506 // Use ANDPD and ORPD to simulate FCOPYSIGN.
507 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
509 // We don't support sin/cos/fmod
510 setOperationAction(ISD::FSIN , VT, Expand);
511 setOperationAction(ISD::FCOS , VT, Expand);
512 setOperationAction(ISD::FSINCOS, VT, Expand);
515 // Lower this to MOVMSK plus an AND.
516 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
517 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
519 // Expand FP immediates into loads from the stack, except for the special
520 // cases we handle.
521 addLegalFPImmediate(APFloat(+0.0)); // xorpd
522 addLegalFPImmediate(APFloat(+0.0f)); // xorps
523 } else if (UseX87 && X86ScalarSSEf32) {
524 // Use SSE for f32, x87 for f64.
525 // Set up the FP register classes.
526 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
527 : &X86::FR32RegClass);
528 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
530 // Use ANDPS to simulate FABS.
531 setOperationAction(ISD::FABS , MVT::f32, Custom);
533 // Use XORP to simulate FNEG.
534 setOperationAction(ISD::FNEG , MVT::f32, Custom);
536 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
538 // Use ANDPS and ORPS to simulate FCOPYSIGN.
539 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
540 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
542 // We don't support sin/cos/fmod
543 setOperationAction(ISD::FSIN , MVT::f32, Expand);
544 setOperationAction(ISD::FCOS , MVT::f32, Expand);
545 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
547 // Special cases we handle for FP constants.
548 addLegalFPImmediate(APFloat(+0.0f)); // xorps
549 addLegalFPImmediate(APFloat(+0.0)); // FLD0
550 addLegalFPImmediate(APFloat(+1.0)); // FLD1
551 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
552 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
554 if (!TM.Options.UnsafeFPMath) {
555 setOperationAction(ISD::FSIN , MVT::f64, Expand);
556 setOperationAction(ISD::FCOS , MVT::f64, Expand);
557 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
560 // f32 and f64 in x87.
561 // Set up the FP register classes.
562 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
563 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
565 for (auto VT : { MVT::f32, MVT::f64 }) {
566 setOperationAction(ISD::UNDEF, VT, Expand);
567 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
569 if (!TM.Options.UnsafeFPMath) {
570 setOperationAction(ISD::FSIN , VT, Expand);
571 setOperationAction(ISD::FCOS , VT, Expand);
572 setOperationAction(ISD::FSINCOS, VT, Expand);
575 addLegalFPImmediate(APFloat(+0.0)); // FLD0
576 addLegalFPImmediate(APFloat(+1.0)); // FLD1
577 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
578 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
579 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
580 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
581 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
582 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
585 // We don't support FMA.
586 setOperationAction(ISD::FMA, MVT::f64, Expand);
587 setOperationAction(ISD::FMA, MVT::f32, Expand);
589 // Long double always uses X87, except f128 in MMX.
591 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
592 addRegisterClass(MVT::f128, &X86::FR128RegClass);
593 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
594 setOperationAction(ISD::FABS , MVT::f128, Custom);
595 setOperationAction(ISD::FNEG , MVT::f128, Custom);
596 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
599 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
600 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
601 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
603 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
604 addLegalFPImmediate(TmpFlt); // FLD0
606 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
609 APFloat TmpFlt2(+1.0);
610 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
612 addLegalFPImmediate(TmpFlt2); // FLD1
613 TmpFlt2.changeSign();
614 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
617 if (!TM.Options.UnsafeFPMath) {
618 setOperationAction(ISD::FSIN , MVT::f80, Expand);
619 setOperationAction(ISD::FCOS , MVT::f80, Expand);
620 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
623 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
624 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
625 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
626 setOperationAction(ISD::FRINT, MVT::f80, Expand);
627 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
628 setOperationAction(ISD::FMA, MVT::f80, Expand);
631 // Always use a library call for pow.
632 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
633 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
634 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
636 setOperationAction(ISD::FLOG, MVT::f80, Expand);
637 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
638 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
639 setOperationAction(ISD::FEXP, MVT::f80, Expand);
640 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
641 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
642 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
644 // Some FP actions are always expanded for vector types.
645 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
646 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
647 setOperationAction(ISD::FSIN, VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 setOperationAction(ISD::FCOS, VT, Expand);
650 setOperationAction(ISD::FREM, VT, Expand);
651 setOperationAction(ISD::FPOWI, VT, Expand);
652 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
653 setOperationAction(ISD::FPOW, VT, Expand);
654 setOperationAction(ISD::FLOG, VT, Expand);
655 setOperationAction(ISD::FLOG2, VT, Expand);
656 setOperationAction(ISD::FLOG10, VT, Expand);
657 setOperationAction(ISD::FEXP, VT, Expand);
658 setOperationAction(ISD::FEXP2, VT, Expand);
661 // First set operation action for all vector types to either promote
662 // (for widening) or expand (for scalarization). Then we will selectively
663 // turn on ones that can be effectively codegen'd.
664 for (MVT VT : MVT::vector_valuetypes()) {
665 setOperationAction(ISD::SDIV, VT, Expand);
666 setOperationAction(ISD::UDIV, VT, Expand);
667 setOperationAction(ISD::SREM, VT, Expand);
668 setOperationAction(ISD::UREM, VT, Expand);
669 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
670 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
671 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
672 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
673 setOperationAction(ISD::FMA, VT, Expand);
674 setOperationAction(ISD::FFLOOR, VT, Expand);
675 setOperationAction(ISD::FCEIL, VT, Expand);
676 setOperationAction(ISD::FTRUNC, VT, Expand);
677 setOperationAction(ISD::FRINT, VT, Expand);
678 setOperationAction(ISD::FNEARBYINT, VT, Expand);
679 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
680 setOperationAction(ISD::MULHS, VT, Expand);
681 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
682 setOperationAction(ISD::MULHU, VT, Expand);
683 setOperationAction(ISD::SDIVREM, VT, Expand);
684 setOperationAction(ISD::UDIVREM, VT, Expand);
685 setOperationAction(ISD::CTPOP, VT, Expand);
686 setOperationAction(ISD::CTTZ, VT, Expand);
687 setOperationAction(ISD::CTLZ, VT, Expand);
688 setOperationAction(ISD::ROTL, VT, Expand);
689 setOperationAction(ISD::ROTR, VT, Expand);
690 setOperationAction(ISD::BSWAP, VT, Expand);
691 setOperationAction(ISD::SETCC, VT, Expand);
692 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
693 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
694 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
695 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
696 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
697 setOperationAction(ISD::TRUNCATE, VT, Expand);
698 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
699 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
700 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
701 setOperationAction(ISD::SELECT_CC, VT, Expand);
702 for (MVT InnerVT : MVT::vector_valuetypes()) {
703 setTruncStoreAction(InnerVT, VT, Expand);
705 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
706 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
708 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
709 // types, we have to deal with them whether we ask for Expansion or not.
710 // Setting Expand causes its own optimisation problems though, so leave
711 // them legal.
712 if (VT.getVectorElementType() == MVT::i1)
713 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
715 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
716 // split/scalarized right now.
717 if (VT.getVectorElementType() == MVT::f16)
718 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
722 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
723 // with -msoft-float, disable use of MMX as well.
724 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
725 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
726 // No operations on x86mmx supported, everything uses intrinsics.
729 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
730 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
731 : &X86::VR128RegClass);
733 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
734 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
735 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
736 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
737 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
738 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
739 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
740 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
741 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
744 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
745 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
746 : &X86::VR128RegClass);
748 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
749 // registers cannot be used even for integer operations.
750 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
751 : &X86::VR128RegClass);
752 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
753 : &X86::VR128RegClass);
754 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
755 : &X86::VR128RegClass);
756 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
757 : &X86::VR128RegClass);
759 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
760 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
761 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
762 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
763 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
764 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
765 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
766 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
767 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
768 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
769 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
770 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
771 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
773 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
774 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
775 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
776 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
778 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
779 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
780 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
781 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
783 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
784 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
785 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
786 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
787 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
789 setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
790 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
791 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
792 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
794 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
795 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
796 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
797 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
799 // Custom lower build_vector, vector_shuffle, vselect, and extract_vector_elt.
800 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
801 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
802 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
803 setOperationAction(ISD::VSELECT, VT, Custom);
804 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
807 // We support custom legalizing of sext and anyext loads for specific
808 // memory vector types which we can load as a scalar (or sequence of
809 // scalars) and extend in-register to a legal 128-bit vector type. For sext
810 // loads these must work with a single scalar load.
811 for (MVT VT : MVT::integer_vector_valuetypes()) {
812 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
813 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
814 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
815 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
816 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
817 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
818 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
819 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
820 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
823 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
824 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
825 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
826 setOperationAction(ISD::VSELECT, VT, Custom);
828 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
831 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
832 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
835 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
836 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
837 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
838 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
839 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
840 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
841 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
844 // Custom lower v2i64 and v2f64 selects.
845 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
846 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
848 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
849 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
851 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
852 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
854 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
855 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
856 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
858 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
859 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
861 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
862 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
864 for (MVT VT : MVT::fp_vector_valuetypes())
865 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
867 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
868 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
869 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
871 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
872 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
873 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
875 for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
876 setOperationAction(ISD::SRL, VT, Custom);
877 setOperationAction(ISD::SHL, VT, Custom);
878 setOperationAction(ISD::SRA, VT, Custom);
881 // In the customized shift lowering, the legal cases in AVX2 will be
883 for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
884 setOperationAction(ISD::SRL, VT, Custom);
885 setOperationAction(ISD::SHL, VT, Custom);
886 setOperationAction(ISD::SRA, VT, Custom);
890 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
891 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
892 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
893 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
894 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
895 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
898 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
899 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
900 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
901 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
902 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
903 setOperationAction(ISD::FRINT, RoundedTy, Legal);
904 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
907 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
908 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
909 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
910 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
911 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
912 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
913 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
914 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
916 // FIXME: Do we need to handle scalar-to-vector here?
917 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
919 // We directly match byte blends in the backend as they match the VSELECT
921 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
923 // SSE41 brings specific instructions for doing vector sign extend even in
924 // cases where we don't have SRA.
925 for (MVT VT : MVT::integer_vector_valuetypes()) {
926 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
927 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
928 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
931 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
932 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
933 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
934 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
935 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
936 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
937 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
939 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
940 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
941 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
942 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
943 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
944 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
946 // i8 vectors are custom because the source register and source
947 // memory operand types are not the same width.
948 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
951 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
952 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
953 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
954 setOperationAction(ISD::ROTL, VT, Custom);
956 // XOP can efficiently perform BITREVERSE with VPPERM.
957 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
958 setOperationAction(ISD::BITREVERSE, VT, Custom);
960 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
961 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
962 setOperationAction(ISD::BITREVERSE, VT, Custom);
965 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
966 bool HasInt256 = Subtarget.hasInt256();
968 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
969 : &X86::VR256RegClass);
970 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
971 : &X86::VR256RegClass);
972 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
973 : &X86::VR256RegClass);
974 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
975 : &X86::VR256RegClass);
976 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
977 : &X86::VR256RegClass);
978 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
979 : &X86::VR256RegClass);
981 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
982 setOperationAction(ISD::FFLOOR, VT, Legal);
983 setOperationAction(ISD::FCEIL, VT, Legal);
984 setOperationAction(ISD::FTRUNC, VT, Legal);
985 setOperationAction(ISD::FRINT, VT, Legal);
986 setOperationAction(ISD::FNEARBYINT, VT, Legal);
987 setOperationAction(ISD::FNEG, VT, Custom);
988 setOperationAction(ISD::FABS, VT, Custom);
989 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
992 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
993 // even though v8i16 is a legal type.
994 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
995 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
996 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
998 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
999 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1000 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1002 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1003 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1005 for (MVT VT : MVT::fp_vector_valuetypes())
1006 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1008 for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
1009 setOperationAction(ISD::SRL, VT, Custom);
1010 setOperationAction(ISD::SHL, VT, Custom);
1011 setOperationAction(ISD::SRA, VT, Custom);
1014 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
1015 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
1016 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
1017 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1020 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1021 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1023 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1024 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
1025 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1026 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1027 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1028 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1029 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1030 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1031 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1032 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1033 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1034 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1035 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1037 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1038 setOperationAction(ISD::CTPOP, VT, Custom);
1039 setOperationAction(ISD::CTTZ, VT, Custom);
1040 setOperationAction(ISD::CTLZ, VT, Custom);
1043 if (Subtarget.hasAnyFMA()) {
1044 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1045 MVT::v2f64, MVT::v4f64 })
1046 setOperationAction(ISD::FMA, VT, Legal);
1049 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1050 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1051 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1055 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1056 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1057 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1059 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1060 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1062 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1063 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1064 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1065 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1067 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1068 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1069 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1070 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1071 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1075 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1076 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1077 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1079 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1080 // when we have a 256bit-wide blend with immediate.
1081 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1083 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1084 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1085 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1086 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1087 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1088 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1089 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1091 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1092 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1093 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1094 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1095 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1096 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1099 // In the customized shift lowering, the legal cases in AVX2 will be
1101 for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1102 setOperationAction(ISD::SRL, VT, Custom);
1103 setOperationAction(ISD::SHL, VT, Custom);
1104 setOperationAction(ISD::SRA, VT, Custom);
1107 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1108 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1109 setOperationAction(ISD::MLOAD, VT, Legal);
1110 setOperationAction(ISD::MSTORE, VT, Legal);
1113 // Extract subvector is special because the value type
1114 // (result) is 128-bit but the source is 256-bit wide.
1115 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1116 MVT::v4f32, MVT::v2f64 }) {
1117 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1120 // Custom lower several nodes for 256-bit types.
1121 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1122 MVT::v8f32, MVT::v4f64 }) {
1123 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1124 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1125 setOperationAction(ISD::VSELECT, VT, Custom);
1126 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1127 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1128 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1129 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1130 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1134 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1136 // Promote v32i8, v16i16, v8i32 load, select, and, or, xor to v4i64.
1137 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1138 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1139 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1140 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1141 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1142 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1146 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1147 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1148 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1149 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1150 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1152 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1153 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1154 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1156 for (MVT VT : MVT::fp_vector_valuetypes())
1157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1159 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1160 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1161 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1162 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1163 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1164 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1165 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1167 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1168 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1169 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1170 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1171 setOperationAction(ISD::XOR, MVT::i1, Legal);
1172 setOperationAction(ISD::OR, MVT::i1, Legal);
1173 setOperationAction(ISD::AND, MVT::i1, Legal);
1174 setOperationAction(ISD::SUB, MVT::i1, Custom);
1175 setOperationAction(ISD::ADD, MVT::i1, Custom);
1176 setOperationAction(ISD::MUL, MVT::i1, Custom);
1178 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1179 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1180 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1181 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1182 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1183 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1184 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1185 setTruncStoreAction(VT, MaskVT, Custom);
1188 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1189 setOperationAction(ISD::FNEG, VT, Custom);
1190 setOperationAction(ISD::FABS, VT, Custom);
1191 setOperationAction(ISD::FMA, VT, Legal);
1192 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1195 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1196 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1197 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1198 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1199 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1200 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1201 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1202 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1203 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1204 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1205 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1206 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1207 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1208 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1209 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1210 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1211 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1212 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1213 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1214 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1215 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1216 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1217 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1218 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1219 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1221 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1222 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1223 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1224 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1225 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1226 if (Subtarget.hasVLX()){
1227 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1228 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1229 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1230 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1231 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1233 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1234 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1235 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1236 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1237 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1239 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1240 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1241 setOperationAction(ISD::MLOAD, VT, Custom);
1242 setOperationAction(ISD::MSTORE, VT, Custom);
1245 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1246 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1247 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1248 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
1249 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
1250 setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
1251 setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
1252 if (Subtarget.hasDQI()) {
1253 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1254 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
1255 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1256 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1257 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
1258 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1259 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1260 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
1261 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1262 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1263 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
1264 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
1266 if (Subtarget.hasVLX()) {
1267 // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1268 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1269 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1270 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1273 if (Subtarget.hasVLX()) {
1274 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1275 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1276 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1277 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1278 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1279 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1280 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1281 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1282 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1284 // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1285 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1286 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1287 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1288 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1289 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1290 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1291 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1292 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1293 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1294 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1297 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1298 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1299 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1300 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1301 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1302 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1303 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1304 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1305 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1306 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1307 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1308 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1309 if (Subtarget.hasDQI()) {
1310 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1311 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1313 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1314 setOperationAction(ISD::FFLOOR, VT, Legal);
1315 setOperationAction(ISD::FCEIL, VT, Legal);
1316 setOperationAction(ISD::FTRUNC, VT, Legal);
1317 setOperationAction(ISD::FRINT, VT, Legal);
1318 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1321 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1322 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1324 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1325 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1326 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1332 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1334 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1335 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1337 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1340 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1341 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1343 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1344 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1345 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1346 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1347 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1348 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1349 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1350 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1352 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1353 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1354 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1355 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1356 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1357 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1358 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1359 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1361 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1362 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1363 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1364 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1365 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1366 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1368 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1370 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1371 setOperationAction(ISD::SRL, VT, Custom);
1372 setOperationAction(ISD::SHL, VT, Custom);
1373 setOperationAction(ISD::SRA, VT, Custom);
1374 setOperationAction(ISD::CTPOP, VT, Custom);
1375 setOperationAction(ISD::CTTZ, VT, Custom);
1378 // Need to promote to 64-bit even though we have 32-bit masked instructions
1379 // because the IR optimizers rearrange bitcasts around logic ops leaving
1380 // too many variations to handle if we don't promote them.
1381 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1382 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1383 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
1385 if (Subtarget.hasCDI()) {
1386 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1387 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1389 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1390 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1391 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1392 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1394 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1395 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1397 if (Subtarget.hasVLX()) {
1398 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1399 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1400 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1401 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1403 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1404 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1405 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1406 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1409 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1410 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1411 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1412 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1413 } // Subtarget.hasCDI()
1415 if (Subtarget.hasDQI()) {
1416 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1417 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1418 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1419 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1422 // Custom lower several nodes.
1423 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1424 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1425 setOperationAction(ISD::MGATHER, VT, Custom);
1426 setOperationAction(ISD::MSCATTER, VT, Custom);
1428 // Extract subvector is special because the value type
1429 // (result) is 256-bit but the source is 512-bit wide.
1430 // 128-bit was made Custom under AVX1.
1431 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1432 MVT::v8f32, MVT::v4f64 })
1433 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1434 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1435 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1436 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1438 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1439 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1440 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1441 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1442 setOperationAction(ISD::VSELECT, VT, Legal);
1443 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1444 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1445 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1446 setOperationAction(ISD::MLOAD, VT, Legal);
1447 setOperationAction(ISD::MSTORE, VT, Legal);
1448 setOperationAction(ISD::MGATHER, VT, Legal);
1449 setOperationAction(ISD::MSCATTER, VT, Custom);
1451 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1452 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1453 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1457 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1458 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1459 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1461 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1462 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1464 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1465 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1466 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1467 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1468 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1469 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1471 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1472 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1473 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1474 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1475 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1476 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1477 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1478 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1480 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1482 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1483 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1484 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1485 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1486 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1488 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1489 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1490 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1492 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1493 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1494 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1495 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1496 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1497 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1499 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1500 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1501 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1502 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1505 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1506 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1507 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1508 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1509 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1510 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1511 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1512 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1513 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1514 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1515 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1516 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1517 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1519 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1520 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1521 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1522 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1523 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1524 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1525 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1526 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1528 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1530 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1531 if (Subtarget.hasVLX()) {
1532 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1533 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1536 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1537 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1538 setOperationAction(ISD::MLOAD, VT, Action);
1539 setOperationAction(ISD::MSTORE, VT, Action);
1542 if (Subtarget.hasCDI()) {
1543 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1544 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1547 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1548 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1549 setOperationAction(ISD::VSELECT, VT, Legal);
1550 setOperationAction(ISD::SRL, VT, Custom);
1551 setOperationAction(ISD::SHL, VT, Custom);
1552 setOperationAction(ISD::SRA, VT, Custom);
1553 setOperationAction(ISD::MLOAD, VT, Legal);
1554 setOperationAction(ISD::MSTORE, VT, Legal);
1555 setOperationAction(ISD::CTPOP, VT, Custom);
1556 setOperationAction(ISD::CTTZ, VT, Custom);
1558 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1559 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1560 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1563 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1564 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1565 if (Subtarget.hasVLX()) {
1566 // FIXME: These commands are available on SSE/AVX2, add relevant patterns.
1567 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1568 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1573 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1574 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1575 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1577 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1578 setOperationAction(ISD::ADD, VT, Expand);
1579 setOperationAction(ISD::SUB, VT, Expand);
1580 setOperationAction(ISD::MUL, VT, Expand);
1581 setOperationAction(ISD::VSELECT, VT, Expand);
1583 setOperationAction(ISD::TRUNCATE, VT, Custom);
1584 setOperationAction(ISD::SETCC, VT, Custom);
1585 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1587 setOperationAction(ISD::SELECT, VT, Custom);
1588 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1589 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1592 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1593 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1594 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1595 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1597 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1598 setOperationAction(ISD::SMAX, VT, Legal);
1599 setOperationAction(ISD::UMAX, VT, Legal);
1600 setOperationAction(ISD::SMIN, VT, Legal);
1601 setOperationAction(ISD::UMIN, VT, Legal);
1605 // We want to custom lower some of our intrinsics.
1606 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1608 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1609 if (!Subtarget.is64Bit()) {
1610 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1611 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1614 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1615 // handle type legalization for these operations here.
1617 // FIXME: We really should do custom legalization for addition and
1618 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1619 // than generic legalization for 64-bit multiplication-with-overflow, though.
1620 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1621 if (VT == MVT::i64 && !Subtarget.is64Bit())
1623 // Add/Sub/Mul with overflow operations are custom lowered.
1624 setOperationAction(ISD::SADDO, VT, Custom);
1625 setOperationAction(ISD::UADDO, VT, Custom);
1626 setOperationAction(ISD::SSUBO, VT, Custom);
1627 setOperationAction(ISD::USUBO, VT, Custom);
1628 setOperationAction(ISD::SMULO, VT, Custom);
1629 setOperationAction(ISD::UMULO, VT, Custom);
1632 if (!Subtarget.is64Bit()) {
1633 // These libcalls are not available in 32-bit.
1634 setLibcallName(RTLIB::SHL_I128, nullptr);
1635 setLibcallName(RTLIB::SRL_I128, nullptr);
1636 setLibcallName(RTLIB::SRA_I128, nullptr);
1639 // Combine sin / cos into one node or libcall if possible.
1640 if (Subtarget.hasSinCos()) {
1641 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1642 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1643 if (Subtarget.isTargetDarwin()) {
1644 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1645 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1646 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1647 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1651 if (Subtarget.isTargetWin64()) {
1652 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1654 setOperationAction(ISD::SREM, MVT::i128, Custom);
1655 setOperationAction(ISD::UREM, MVT::i128, Custom);
1656 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1657 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1660 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1661 // is. We should promote the value to 64-bits to solve this.
1662 // This is what the CRT headers do - `fmodf` is an inline header
1663 // function casting to f64 and calling `fmod`.
1664 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1665 Subtarget.isTargetWindowsItanium()))
1666 for (ISD::NodeType Op :
1667 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1668 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1669 if (isOperationExpand(Op, MVT::f32))
1670 setOperationAction(Op, MVT::f32, Promote);
1672 // We have target-specific dag combine patterns for the following nodes:
1673 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1674 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1675 setTargetDAGCombine(ISD::BITCAST);
1676 setTargetDAGCombine(ISD::VSELECT);
1677 setTargetDAGCombine(ISD::SELECT);
1678 setTargetDAGCombine(ISD::SHL);
1679 setTargetDAGCombine(ISD::SRA);
1680 setTargetDAGCombine(ISD::SRL);
1681 setTargetDAGCombine(ISD::OR);
1682 setTargetDAGCombine(ISD::AND);
1683 setTargetDAGCombine(ISD::ADD);
1684 setTargetDAGCombine(ISD::FADD);
1685 setTargetDAGCombine(ISD::FSUB);
1686 setTargetDAGCombine(ISD::FNEG);
1687 setTargetDAGCombine(ISD::FMA);
1688 setTargetDAGCombine(ISD::FMINNUM);
1689 setTargetDAGCombine(ISD::FMAXNUM);
1690 setTargetDAGCombine(ISD::SUB);
1691 setTargetDAGCombine(ISD::LOAD);
1692 setTargetDAGCombine(ISD::MLOAD);
1693 setTargetDAGCombine(ISD::STORE);
1694 setTargetDAGCombine(ISD::MSTORE);
1695 setTargetDAGCombine(ISD::TRUNCATE);
1696 setTargetDAGCombine(ISD::ZERO_EXTEND);
1697 setTargetDAGCombine(ISD::ANY_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND);
1699 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1700 setTargetDAGCombine(ISD::SINT_TO_FP);
1701 setTargetDAGCombine(ISD::UINT_TO_FP);
1702 setTargetDAGCombine(ISD::SETCC);
1703 setTargetDAGCombine(ISD::MUL);
1704 setTargetDAGCombine(ISD::XOR);
1705 setTargetDAGCombine(ISD::MSCATTER);
1706 setTargetDAGCombine(ISD::MGATHER);
1708 computeRegisterProperties(Subtarget.getRegisterInfo());
1710 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1711 MaxStoresPerMemsetOptSize = 8;
1712 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1713 MaxStoresPerMemcpyOptSize = 4;
1714 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1715 MaxStoresPerMemmoveOptSize = 4;
1716 setPrefLoopAlignment(4); // 2^4 bytes.
1718 // An out-of-order CPU can speculatively execute past a predictable branch,
1719 // but a conditional move could be stalled by an expensive earlier operation.
1720 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1721 EnableExtLdPromotion = true;
1722 setPrefFunctionAlignment(4); // 2^4 bytes.
1724 verifyIntrinsicTables();
/// Returns true only when the subtarget is a MachO target and is 64-bit;
/// every other configuration gets false.
1727 // This has so far only been implemented for 64-bit MachO.
1728 bool X86TargetLowering::useLoadStackGuardNode() const {
1729 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
/// Chooses the type-legalization action for the vector type \p VT.
///
/// Returns TypeWidenVector when ExperimentalVectorWideningLegalization is set,
/// \p VT does not have exactly one element, and its element type is not i1.
/// In every other case the decision is delegated to
/// TargetLoweringBase::getPreferredVectorAction.
1732 TargetLoweringBase::LegalizeTypeAction
1733 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1734 if (ExperimentalVectorWideningLegalization &&
1735 VT.getVectorNumElements() != 1 &&
1736 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1737 return TypeWidenVector;
1739 return TargetLoweringBase::getPreferredVectorAction(VT);
/// Computes the value type produced by a SETCC whose operands have type VT.
///
/// Depending on the subtarget features tested below (AVX512, BWI, VLX) the
/// result is a scalar i1/i8, a vXi1 mask type, or VT with its element type
/// changed to integer.
/// NOTE(review): the end of the parameter list and the heads of the switch
/// statements are not visible here — confirm them against the full file.
1742 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1743 LLVMContext& Context,
// i1 when AVX512 is available, i8 otherwise.
// NOTE(review): the condition guarding this return is not visible here —
// presumably the non-vector case; confirm.
1746 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1748 if (VT.isSimple()) {
1749 MVT VVT = VT.getSimpleVT();
1750 const unsigned NumElts = VVT.getVectorNumElements();
1751 MVT EltVT = VVT.getVectorElementType();
// 512-bit vectors: 32/64-bit elements need AVX512, 8/16-bit elements need
// BWI. The case labels return the matching vXi1 mask type.
// NOTE(review): the switch operand is not visible — presumably NumElts.
1752 if (VVT.is512BitVector()) {
1753 if (Subtarget.hasAVX512())
1754 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1755 EltVT == MVT::f32 || EltVT == MVT::f64)
1757 case 8: return MVT::v8i1;
1758 case 16: return MVT::v16i1;
1760 if (Subtarget.hasBWI())
1761 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1763 case 32: return MVT::v32i1;
1764 case 64: return MVT::v64i1;
// With both BWI and VLX, use an i1 vector with NumElts elements.
1768 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1769 return MVT::getVectorVT(MVT::i1, NumElts);
// If VT is not legal and its type action is integer promotion, continue with
// the element type of the type it is transformed to.
1771 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1772 EVT LegalVT = getTypeToTransformTo(Context, VT);
1773 EltVT = LegalVT.getVectorElementType().getSimpleVT();
// With VLX and elements of at least 32 bits, the case labels below return a
// v2i1/v4i1/v8i1 mask type.
1776 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1778 case 2: return MVT::v2i1;
1779 case 4: return MVT::v4i1;
1780 case 8: return MVT::v8i1;
// Fallback: VT with its element type changed to integer.
1784 return VT.changeVectorElementTypeToInteger();
1787 /// Helper for getByValTypeAlignment to determine
1788 /// the desired ByVal argument alignment.
///
/// Walks \p Ty recursively: the element type of an array and every element of
/// a struct are visited, and \p MaxAlign is raised to the largest alignment
/// any of them reports. \p MaxAlign is an in/out parameter; the visible code
/// only ever increases it.
1789 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1792 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
// NOTE(review): the statement guarded by the 128-bit check is not visible
// here; presumably it records a 16-byte alignment — confirm.
1793 if (VTy->getBitWidth() == 128)
1795 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
// Arrays: the alignment is that of the element type, computed into a fresh
// local so the caller's value is only raised, never replaced by less.
1796 unsigned EltAlign = 0;
1797 getMaxByValAlign(ATy->getElementType(), EltAlign);
1798 if (EltAlign > MaxAlign)
1799 MaxAlign = EltAlign;
1800 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
// Structs: take the maximum over all element types.
1801 for (auto *EltTy : STy->elements()) {
1802 unsigned EltAlign = 0;
1803 getMaxByValAlign(EltTy, EltAlign);
1804 if (EltAlign > MaxAlign)
1805 MaxAlign = EltAlign;
1812 /// Return the desired alignment for ByVal aggregate
1813 /// function arguments in the caller parameter area. For X86, aggregates
1814 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1815 /// are at 4-byte boundaries.
///
/// \param Ty Type of the byval argument.
/// \param DL Data layout; queried for the ABI alignment of \p Ty on 64-bit
///           subtargets.
1816 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1817 const DataLayout &DL) const {
1818 if (Subtarget.is64Bit()) {
1819 // Max of 8 and alignment of type.
1820 unsigned TyAlign = DL.getABITypeAlignment(Ty);
// With SSE1 available, getMaxByValAlign may raise Align based on what Ty
// contains.
// NOTE(review): the declaration and initial value of Align are not visible
// here — confirm.
1827 if (Subtarget.hasSSE1())
1828 getMaxByValAlign(Ty, Align);
1832 /// Returns the target specific optimal type for load
1833 /// and store operations as a result of memset, memcpy, and memmove
1834 /// lowering. If DstAlign is zero that means the destination
1835 /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
1836 /// means there isn't a need to check it against alignment requirement,
1837 /// probably because the source does not need to be loaded. If 'IsMemset' is
1838 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1839 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1840 /// source is constant so it does not need to be loaded.
1841 /// It returns EVT::Other if the type should be determined using generic
1842 /// target-independent logic.
///
/// NOTE(review): the return type, one parameter line, and the return
/// statements of each branch are not visible here — confirm the concrete
/// types chosen against the full file.
1844 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1845 unsigned DstAlign, unsigned SrcAlign,
1846 bool IsMemset, bool ZeroMemset,
1848 MachineFunction &MF) const {
1849 const Function *F = MF.getFunction();
// The SSE/AVX-based choices below are only considered when the function does
// not carry the NoImplicitFloat attribute.
1850 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1852 (!Subtarget.isUnalignedMem16Slow() ||
1853 ((DstAlign == 0 || DstAlign >= 16) &&
1854 (SrcAlign == 0 || SrcAlign >= 16)))) {
1855 // FIXME: Check if unaligned 32-byte accesses are slow.
1856 if (Size >= 32 && Subtarget.hasAVX()) {
1857 // Although this isn't a well-supported type for AVX1, we'll let
1858 // legalization and shuffle lowering produce the optimal codegen. If we
1859 // choose an optimal type with a vector element larger than a byte,
1860 // getMemsetStores() may create an intermediate splat (using an integer
1861 // multiply) before we splat as a vector.
1864 if (Subtarget.hasSSE2())
1866 // TODO: Can SSE1 handle a byte vector?
1867 if (Subtarget.hasSSE1())
1869 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1870 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1871 // Do not use f64 to lower memcpy if source is string constant. It's
1872 // better to use i32 to avoid the loads.
1873 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1874 // The gymnastics of splatting a byte value into an XMM register and then
1875 // only using 8-byte stores (because this is a CPU with slow unaligned
1876 // 16-byte accesses) makes that a loser.
1880 // This is a compromise. If we reach here, unaligned accesses may be slow on
1881 // this target. However, creating smaller, aligned accesses could be even
1882 // slower and would certainly be a lot more code.
1883 if (Subtarget.is64Bit() && Size >= 8)
/// Reports whether \p VT is safe to use as a memory-operation type.
///
/// For f64 the answer is X86ScalarSSEf64.
/// NOTE(review): the condition guarding the X86ScalarSSEf32 return and the
/// result for all other types are not visible here — presumably f32 and
/// `true` respectively; confirm.
1888 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1890 return X86ScalarSSEf32;
1891 else if (VT == MVT::f64)
1892 return X86ScalarSSEf64;
/// Reports through \p Fast whether a misaligned access of type \p VT is
/// expected to be fast, keyed on the size of \p VT in bits.
/// NOTE(review): only part of the signature and of the switch is visible
/// here; the case labels, any null check on \p Fast, and the return value
/// should be confirmed against the full file.
1897 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1902 switch (VT.getSizeInBits()) {
1904 // 8-byte and under are always assumed to be fast.
// Fast unless the subtarget reports slow unaligned 16-byte accesses.
1908 *Fast = !Subtarget.isUnalignedMem16Slow();
// Fast unless the subtarget reports slow unaligned 32-byte accesses.
1911 *Fast = !Subtarget.isUnalignedMem32Slow();
1913 // TODO: What about AVX-512 (512-bit) accesses?
1916 // Misaligned accesses of any size are always allowed.
1920 /// Return the entry encoding for a jump table in the
1921 /// current function. The returned value is a member of the
1922 /// MachineJumpTableInfo::JTEntryKind enum.
///
/// EK_Custom32 is returned when generating position-independent code with the
/// GOT PIC style; otherwise the TargetLowering default is used.
1923 unsigned X86TargetLowering::getJumpTableEncoding() const {
1924 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1926 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1927 return MachineJumpTableInfo::EK_Custom32;
1929 // Otherwise, use the normal jump table encoding heuristics.
1930 return TargetLowering::getJumpTableEncoding();
/// Forwards the subtarget's soft-float setting.
1933 bool X86TargetLowering::useSoftFloat() const {
1934 return Subtarget.useSoftFloat();
/// Builds the custom jump-table entry for \p MBB: a symbol reference to the
/// block's symbol with the VK_GOTOFF variant kind, created in \p Ctx.
///
/// Asserts that position-independent code with the GOT PIC style is in use.
/// \p MJTI and \p uid are not referenced by the visible body.
1938 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1939 const MachineBasicBlock *MBB,
1940 unsigned uid,MCContext &Ctx) const{
1941 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1942 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1944 return MCSymbolRefExpr::create(MBB->getSymbol(),
1945 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1948 /// Returns relocation base for the given PIC jumptable.
///
/// On non-64-bit subtargets this is an X86ISD::GlobalBaseReg node of pointer
/// type, built with an empty SDLoc.
/// NOTE(review): the 64-bit path is not visible here — presumably \p Table is
/// returned unchanged; confirm.
1949 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1950 SelectionDAG &DAG) const {
1951 if (!Subtarget.is64Bit())
1952 // This doesn't have SDLoc associated with it, but is not really the
1953 // same as a Register.
1954 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1955 getPointerTy(DAG.getDataLayout()));
1959 /// This returns the relocation base for the given PIC jumptable,
1960 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
///
/// With the RIP-relative PIC style the TargetLowering default is used;
/// otherwise the result is a symbol reference to the function's PIC base
/// symbol, created in \p Ctx.
1961 const MCExpr *X86TargetLowering::
1962 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1963 MCContext &Ctx) const {
1964 // X86-64 uses RIP relative addressing based on the jump table label.
1965 if (Subtarget.isPICStyleRIPRel())
1966 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1968 // Otherwise, the reference is relative to the PIC base.
1969 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
/// Picks a representative register class for \p VT and pairs it with a cost.
///
/// Scalar integer types map to GR64 on 64-bit subtargets and GR32 otherwise;
/// scalar FP types and the 128/256/512-bit vector types listed below map to
/// VR128X. Types not handled by the switch are delegated to
/// TargetLowering::findRepresentativeClass.
/// NOTE(review): the rest of the parameter list, the declaration of Cost,
/// the `default:` label and the case label selecting VR64 are not visible
/// here — confirm against the full file.
1972 std::pair<const TargetRegisterClass *, uint8_t>
1973 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1975 const TargetRegisterClass *RRC = nullptr;
1977 switch (VT.SimpleTy) {
1979 return TargetLowering::findRepresentativeClass(TRI, VT);
1980 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1981 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1984 RRC = &X86::VR64RegClass;
1986 case MVT::f32: case MVT::f64:
1987 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1988 case MVT::v4f32: case MVT::v2f64:
1989 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1990 case MVT::v8f32: case MVT::v4f64:
1991 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1992 case MVT::v16f32: case MVT::v8f64:
1993 RRC = &X86::VR128XRegClass;
1996 return std::make_pair(RRC, Cost);
/// Selects the address-space number used for the fixed TLS slots that
/// getIRStackGuard and getSafeStackPointerLocation point into.
///
/// On 64-bit subtargets: 256 with the Kernel code model, 257 otherwise.
/// NOTE(review): the value returned for non-64-bit subtargets is not visible
/// here — confirm.
1999 unsigned X86TargetLowering::getAddressSpace() const {
2000 if (Subtarget.is64Bit())
2001 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
/// Returns the IR value that designates the stack guard.
///
/// Non-glibc targets defer to TargetLowering::getIRStackGuard. For glibc the
/// result is a constant pointer built from a fixed offset (0x28 on 64-bit,
/// 0x14 otherwise) in the address space chosen by getAddressSpace().
2005 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2006 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
2007 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
2008 if (!Subtarget.isTargetGlibc())
2009 return TargetLowering::getIRStackGuard(IRB);
2011 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
2013 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2014 unsigned AddressSpace = getAddressSpace();
// The offset is materialised as an i32 constant and converted to a pointer
// to i8* in the selected address space.
2015 return ConstantExpr::getIntToPtr(
2016 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2017 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
/// Inserts into \p M the declarations needed for stack protection.
///
/// For MSVCRT targets this gets or inserts the "__security_cookie" global
/// (i8*) and the "__security_check_cookie" function (void return, one i8*
/// parameter), giving the latter the X86_FastCall calling convention and the
/// InReg attribute at attribute index 1. Otherwise, unless the glibc check
/// applies, the TargetLowering default is used.
/// NOTE(review): the statements that end the MSVCRT branch and that are
/// guarded by the glibc check are not visible here — presumably early
/// returns; confirm.
2020 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2021 // MSVC CRT provides functionalities for stack protection.
2022 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2023 // MSVC CRT has a global variable holding security cookie.
2024 M.getOrInsertGlobal("__security_cookie",
2025 Type::getInt8PtrTy(M.getContext()));
2027 // MSVC CRT has a function to validate security cookie.
2028 auto *SecurityCheckCookie = cast<Function>(
2029 M.getOrInsertFunction("__security_check_cookie",
2030 Type::getVoidTy(M.getContext()),
2031 Type::getInt8PtrTy(M.getContext()), nullptr));
2032 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2033 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2036 // glibc has a special slot for the stack guard.
2037 if (Subtarget.isTargetGlibc())
2039 TargetLowering::insertSSPDeclarations(M);
/// Returns the stack-guard value looked up in \p M: the "__security_cookie"
/// global variable for MSVCRT targets, otherwise whatever
/// TargetLowering::getSDagStackGuard returns.
2042 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2043 // MSVC CRT has a global variable holding security cookie.
2044 if (Subtarget.getTargetTriple().isOSMSVCRT())
2045 return M.getGlobalVariable("__security_cookie");
2046 return TargetLowering::getSDagStackGuard(M);
/// Returns the stack-guard check function looked up in \p M: the
/// "__security_check_cookie" function for MSVCRT targets, otherwise whatever
/// TargetLowering::getSSPStackGuardCheck returns.
2049 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2050 // MSVC CRT has a function to validate security cookie.
2051 if (Subtarget.getTargetTriple().isOSMSVCRT())
2052 return M.getFunction("__security_check_cookie");
2053 return TargetLowering::getSSPStackGuardCheck(M);
/// Returns the IR value that designates where the SafeStack pointer lives.
///
/// Contiki targets use getDefaultSafeStackPointerLocation (second argument
/// false); non-Android targets defer to TargetLowering. For Android the
/// result is a constant pointer built from a fixed offset (0x48 on 64-bit,
/// 0x24 otherwise) in the address space chosen by getAddressSpace().
2056 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2057 if (Subtarget.getTargetTriple().isOSContiki())
2058 return getDefaultSafeStackPointerLocation(IRB, false);
2060 if (!Subtarget.isTargetAndroid())
2061 return TargetLowering::getSafeStackPointerLocation(IRB);
2063 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2064 // definition of TLS_SLOT_SAFESTACK in
2065 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2066 unsigned AddressSpace, Offset;
2068 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2070 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2071 AddressSpace = getAddressSpace();
// The offset is materialised as an i32 constant and converted to a pointer
// to i8* in the selected address space.
2072 return ConstantExpr::getIntToPtr(
2073 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2074 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
/// Reports whether a cast from address space \p SrcAS to \p DestAS is a
/// no-op: true exactly when both address-space numbers are below 256.
/// Callers must pass two different address spaces (asserted).
2077 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2078 unsigned DestAS) const {
2079 assert(SrcAS != DestAS && "Expected different address spaces!");
2081 return SrcAS < 256 && DestAS < 256;
2084 //===----------------------------------------------------------------------===//
2085 // Return Value Calling Convention Implementation
2086 //===----------------------------------------------------------------------===//
2088 #include "X86GenCallingConv.inc"
/// Reports whether the return values described by \p Outs can be lowered:
/// builds a CCState for \p CallConv / \p isVarArg and returns the result of
/// checking \p Outs against RetCC_X86.
2090 bool X86TargetLowering::CanLowerReturn(
2091 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2092 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2093 SmallVector<CCValAssign, 16> RVLocs;
2094 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2095 return CCInfo.CheckReturn(Outs, RetCC_X86);
/// Defines the scratch-register list: a static array holding R11 followed by
/// a 0 entry. The calling-convention argument is unnamed and unused.
/// NOTE(review): the return statement is not visible here — presumably the
/// array itself is returned; confirm.
2098 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2099 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2103 /// Lowers mask values (v*i1) to the local register values
2104 /// \returns DAG node after lowering to register type
///
/// \param ValArg Mask value to lower.
/// \param ValLoc Type of the location the value is lowered to.
/// \param Dl     Debug location attached to any extend node created.
/// \param DAG    DAG in which the new nodes are created.
///
/// v8i1/v16i1 are bitcast to i8/i16 and any-extended when \p ValLoc is i32;
/// v32i1/v64i1 are bitcast straight to i32/i64; every other combination is
/// sign-extended to \p ValLoc.
2105 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2106 const SDLoc &Dl, SelectionDAG &DAG) {
2107 EVT ValVT = ValArg.getValueType();
2109 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2110 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2111 // Two stage lowering might be required
2112 // bitcast: v8i1 -> i8 / v16i1 -> i16
2113 // anyextend: i8 -> i32 / i16 -> i32
2114 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2115 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2116 if (ValLoc == MVT::i32)
2117 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
// NOTE(review): the statement that returns ValToCopy from this branch is not
// visible here — confirm.
2119 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2120 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2121 // One stage lowering is required
2122 // bitcast: v32i1 -> i32 / v64i1 -> i64
2123 return DAG.getBitcast(ValLoc, ValArg);
2125 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2128 /// Breaks v64i1 value into two registers and adds the new node to the DAG
///
/// \param Dl         Debug location for the created nodes.
/// \param DAG        DAG in which the nodes are created.
/// \param Chain      Not referenced by the visible body.
/// \param Arg        Value to split; overwritten with its i64 bitcast.
/// \param RegsToPass Receives two (register, value) pairs: the low half for
///                   \p VA first, then the high half for \p NextVA.
/// \param VA         Register location for the low 32 bits.
/// \param NextVA     Register location for the high 32 bits.
/// \param Subtarget  Must be a 32-bit subtarget with the asserted features.
2129 static void Passv64i1ArgInRegs(
2130 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2131 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2132 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
// NOTE(review): the check below calls hasBMI() while its message mentions
// AVX512BMI — confirm this is the intended feature test.
2133 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2134 "Expected AVX512BW or AVX512BMI target!");
2135 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2136 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2137 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2138 "The value should reside in two registers");
2140 // Before splitting the value we cast it to i64
2141 Arg = DAG.getBitcast(MVT::i64, Arg);
2143 // Splitting the value into two i32 types
// EXTRACT_ELEMENT index 0 yields Lo and index 1 yields Hi.
// NOTE(review): the declaration of Lo and Hi is not visible here — confirm.
2145 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2146 DAG.getConstant(0, Dl, MVT::i32));
2147 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2148 DAG.getConstant(1, Dl, MVT::i32));
2150 // Attach the two i32 types into corresponding registers
2151 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2152 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2156 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2158 const SmallVectorImpl<ISD::OutputArg> &Outs,
2159 const SmallVectorImpl<SDValue> &OutVals,
2160 const SDLoc &dl, SelectionDAG &DAG) const {
2161 MachineFunction &MF = DAG.getMachineFunction();
2162 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2164 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2165 report_fatal_error("X86 interrupts may not return any value");
2167 SmallVector<CCValAssign, 16> RVLocs;
2168 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2169 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2172 SmallVector<SDValue, 6> RetOps;
2173 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2174 // Operand #1 = Bytes To Pop
2175 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2178 // Copy the result values into the output registers.
2179 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2181 CCValAssign &VA = RVLocs[I];
2182 assert(VA.isRegLoc() && "Can only return in registers!");
2183 SDValue ValToCopy = OutVals[OutsIndex];
2184 EVT ValVT = ValToCopy.getValueType();
2186 // Promote values to the appropriate types.
2187 if (VA.getLocInfo() == CCValAssign::SExt)
2188 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2189 else if (VA.getLocInfo() == CCValAssign::ZExt)
2190 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2191 else if (VA.getLocInfo() == CCValAssign::AExt) {
2192 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2193 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2195 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2197 else if (VA.getLocInfo() == CCValAssign::BCvt)
2198 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2200 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2201 "Unexpected FP-extend for return value.");
2203 // If this is x86-64, and we disabled SSE, we can't return FP values,
2204 // or SSE or MMX vectors.
2205 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2206 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2207 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2208 report_fatal_error("SSE register return with SSE disabled");
2210 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2211 // llvm-gcc has never done it right and no one has noticed, so this
2212 // should be OK for now.
2213 if (ValVT == MVT::f64 &&
2214 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2215 report_fatal_error("SSE2 register return with SSE2 disabled");
2217 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2218 // the RET instruction and handled by the FP Stackifier.
2219 if (VA.getLocReg() == X86::FP0 ||
2220 VA.getLocReg() == X86::FP1) {
2221 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2222 // change the value to the FP stack register class.
2223 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2224 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2225 RetOps.push_back(ValToCopy);
2226 // Don't emit a copytoreg.
2230 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2231 // which is returned in RAX / RDX.
2232 if (Subtarget.is64Bit()) {
2233 if (ValVT == MVT::x86mmx) {
2234 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2235 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2236 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2238 // If we don't have SSE2 available, convert to v4f32 so the generated
2239 // register is legal.
2240 if (!Subtarget.hasSSE2())
2241 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2246 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2248 if (VA.needsCustom()) {
2249 assert(VA.getValVT() == MVT::v64i1 &&
2250 "Currently the only custom case is when we split v64i1 to 2 regs");
2252 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2255 assert(2 == RegsToPass.size() &&
2256 "Expecting two registers after Pass64BitArgInRegs");
2258 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2261 // Add nodes to the DAG and add the values into the RetOps list
2262 for (auto &Reg : RegsToPass) {
2263 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2264 Flag = Chain.getValue(1);
2265 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2269 // Swift calling convention does not require we copy the sret argument
2270 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2272 // All x86 ABIs require that for returning structs by value we copy
2273 // the sret argument into %rax/%eax (depending on ABI) for the return.
2274 // We saved the argument into a virtual register in the entry block,
2275 // so now we copy the value out and into %rax/%eax.
2277 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2278 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2279 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2280 // either case FuncInfo->setSRetReturnReg() will have been called.
2281 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2282 // When we have both sret and another return value, we should use the
2283 // original Chain stored in RetOps[0], instead of the current Chain updated
2284 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2286 // For the case of sret and another return value, we have
2287 // Chain_0 at the function entry
2288 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2289 // If we use Chain_1 in getCopyFromReg, we will have
2290 // Val = getCopyFromReg(Chain_1)
2291 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2293 // getCopyToReg(Chain_0) will be glued together with
2294 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2295 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2296 // Data dependency from Unit B to Unit A due to usage of Val in
2297 // getCopyToReg(Chain_1, Val)
2298 // Chain dependency from Unit A to Unit B
2300 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2301 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2302 getPointerTy(MF.getDataLayout()));
2305 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2306 X86::RAX : X86::EAX;
2307 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2308 Flag = Chain.getValue(1);
2310 // RAX/EAX now acts like a return value.
2312 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2315 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2316 const MCPhysReg *I =
2317 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2320 if (X86::GR64RegClass.contains(*I))
2321 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2323 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2327 RetOps[0] = Chain; // Update chain.
2329 // Add the flag if we have it.
2331 RetOps.push_back(Flag);
2333 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2334 if (CallConv == CallingConv::X86_INTR)
2335 opcode = X86ISD::IRET;
2336 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2339 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2340 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2343 SDValue TCChain = Chain;
2344 SDNode *Copy = *N->use_begin();
2345 if (Copy->getOpcode() == ISD::CopyToReg) {
2346 // If the copy has a glue operand, we conservatively assume it isn't safe to
2347 // perform a tail call.
2348 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2350 TCChain = Copy->getOperand(0);
2351 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2354 bool HasRet = false;
2355 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2357 if (UI->getOpcode() != X86ISD::RET_FLAG)
2359 // If we are returning more than one value, we can definitely
2360 // not make a tail call see PR19530
2361 if (UI->getNumOperands() > 4)
2363 if (UI->getNumOperands() == 4 &&
2364 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2376 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2377 ISD::NodeType ExtendKind) const {
2378 MVT ReturnMVT = MVT::i32;
2380 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2381 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2382 // The ABI does not require i1, i8 or i16 to be extended.
2384 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2385 // always extending i8/i16 return values, so keep doing that for now.
2387 ReturnMVT = MVT::i8;
2390 EVT MinVT = getRegisterType(Context, ReturnMVT);
2391 return VT.bitsLT(MinVT) ? MinVT : VT;
2394 /// Reads two 32 bit registers and creates a 64 bit mask value.
2395 /// \param VA The current 32 bit value that need to be assigned.
2396 /// \param NextVA The next 32 bit value that need to be assigned.
2397 /// \param Root The parent DAG node.
2398 /// \param [in,out] InFlag Represents SDvalue in the parent DAG node for
2399 /// glue purposes. In the case the DAG is already using
2400 /// physical register instead of virtual, we should glue
2401 /// our new SDValue to InFlag SDvalue.
2402 /// \return a new SDvalue of size 64bit.
2403 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2404 SDValue &Root, SelectionDAG &DAG,
2405 const SDLoc &Dl, const X86Subtarget &Subtarget,
2406 SDValue *InFlag = nullptr) {
2407 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2408 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2409 assert(VA.getValVT() == MVT::v64i1 &&
2410 "Expecting first location of 64 bit width type");
2411 assert(NextVA.getValVT() == VA.getValVT() &&
2412 "The locations should have the same type");
2413 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2414 "The values should reside in two registers");
2418 SDValue ArgValueLo, ArgValueHi;
2420 MachineFunction &MF = DAG.getMachineFunction();
2421 const TargetRegisterClass *RC = &X86::GR32RegClass;
2423 // Read a 32 bit value from the registers
2424 if (nullptr == InFlag) {
2425 // When no physical register is present,
2426 // create an intermediate virtual register
2427 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2428 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2429 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2430 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2432 // When a physical register is available read the value from it and glue
2433 // the reads together.
2435 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2436 *InFlag = ArgValueLo.getValue(2);
2438 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2439 *InFlag = ArgValueHi.getValue(2);
2442 // Convert the i32 type into v32i1 type
2443 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2445 // Convert the i32 type into v32i1 type
2446 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2448 // Concantenate the two values together
2449 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2452 /// The function will lower a register of various sizes (8/16/32/64)
2453 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2454 /// \returns a DAG node contains the operand after lowering to mask type.
2455 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2456 const EVT &ValLoc, const SDLoc &Dl,
2457 SelectionDAG &DAG) {
2458 SDValue ValReturned = ValArg;
2460 if (ValVT == MVT::v64i1) {
2461 // In 32 bit machine, this case is handled by getv64i1Argument
2462 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2463 // In 64 bit machine, There is no need to truncate the value only bitcast
2466 switch (ValVT.getSimpleVT().SimpleTy) {
2477 llvm_unreachable("Expecting a vector of i1 types");
2480 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2483 return DAG.getBitcast(ValVT, ValReturned);
2486 /// Lower the result values of a call into the
2487 /// appropriate copies out of appropriate physical registers.
2489 SDValue X86TargetLowering::LowerCallResult(
2490 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2491 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2492 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2494 // Assign locations to each value returned by this call.
2495 SmallVector<CCValAssign, 16> RVLocs;
2496 bool Is64Bit = Subtarget.is64Bit();
2497 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2499 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2501 // Copy all of the result registers out of their specified physreg.
2502 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2504 CCValAssign &VA = RVLocs[I];
2505 EVT CopyVT = VA.getLocVT();
2507 // If this is x86-64, and we disabled SSE, we can't return FP values
2508 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2509 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2510 report_fatal_error("SSE register return with SSE disabled");
2513 // If we prefer to use the value in xmm registers, copy it out as f80 and
2514 // use a truncate to move it from fp stack reg to xmm reg.
2515 bool RoundAfterCopy = false;
2516 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2517 isScalarFPTypeInSSEReg(VA.getValVT())) {
2518 if (!Subtarget.hasX87())
2519 report_fatal_error("X87 register return with X87 disabled");
2521 RoundAfterCopy = (CopyVT != VA.getLocVT());
2525 if (VA.needsCustom()) {
2526 assert(VA.getValVT() == MVT::v64i1 &&
2527 "Currently the only custom case is when we split v64i1 to 2 regs");
2529 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2531 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2533 Val = Chain.getValue(0);
2534 InFlag = Chain.getValue(2);
2538 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2539 // This truncation won't change the value.
2540 DAG.getIntPtrConstant(1, dl));
2542 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2543 if (VA.getValVT().isVector() &&
2544 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2545 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2546 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2547 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2549 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2552 InVals.push_back(Val);
//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  The StdCall calling convention is the standard for many Windows API
//  routines. It differs from the C calling convention only slightly: the
//  callee, not the caller, cleans up the stack, and symbols are decorated
//  differently. It doesn't support any vector arguments.
//  For info on the fast calling convention see the Fast Calling Convention
//  (tail call) implementation, LowerX86_32FastCCCallTo.
/// Classifies whether, and how, a call or function uses struct return (sret)
/// semantics.
enum StructReturnType {
  NotStructReturn,  ///< No sret argument is present.
  RegStructReturn,  ///< The sret argument is flagged inreg, or the target is MCU.
  StackStructReturn ///< Any other sret argument.
};
2575 static StructReturnType
2576 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2578 return NotStructReturn;
2580 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2581 if (!Flags.isSRet())
2582 return NotStructReturn;
2583 if (Flags.isInReg() || IsMCU)
2584 return RegStructReturn;
2585 return StackStructReturn;
2588 /// Determines whether a function uses struct return semantics.
2589 static StructReturnType
2590 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2592 return NotStructReturn;
2594 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2595 if (!Flags.isSRet())
2596 return NotStructReturn;
2597 if (Flags.isInReg() || IsMCU)
2598 return RegStructReturn;
2599 return StackStructReturn;
2602 /// Make a copy of an aggregate at address specified by "Src" to address
2603 /// "Dst" with size and alignment information specified by the specific
2604 /// parameter attribute. The copy will be passed as a byval function parameter.
2605 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2606 SDValue Chain, ISD::ArgFlagsTy Flags,
2607 SelectionDAG &DAG, const SDLoc &dl) {
2608 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2610 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2611 /*isVolatile*/false, /*AlwaysInline=*/true,
2612 /*isTailCall*/false,
2613 MachinePointerInfo(), MachinePointerInfo());
2616 /// Return true if the calling convention is one that we can guarantee TCO for.
2617 static bool canGuaranteeTCO(CallingConv::ID CC) {
2618 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2619 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2620 CC == CallingConv::HHVM);
2623 /// Return true if we might ever do TCO for calls with this calling convention.
2624 static bool mayTailCallThisCC(CallingConv::ID CC) {
2626 // C calling conventions:
2627 case CallingConv::C:
2628 case CallingConv::X86_64_Win64:
2629 case CallingConv::X86_64_SysV:
2630 // Callee pop conventions:
2631 case CallingConv::X86_ThisCall:
2632 case CallingConv::X86_StdCall:
2633 case CallingConv::X86_VectorCall:
2634 case CallingConv::X86_FastCall:
2637 return canGuaranteeTCO(CC);
2641 /// Return true if the function is being made into a tailcall target by
2642 /// changing its ABI.
2643 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2644 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2647 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2649 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2650 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2654 CallingConv::ID CalleeCC = CS.getCallingConv();
2655 if (!mayTailCallThisCC(CalleeCC))
2662 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2663 const SmallVectorImpl<ISD::InputArg> &Ins,
2664 const SDLoc &dl, SelectionDAG &DAG,
2665 const CCValAssign &VA,
2666 MachineFrameInfo &MFI, unsigned i) const {
2667 // Create the nodes corresponding to a load from this parameter slot.
2668 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2669 bool AlwaysUseMutable = shouldGuaranteeTCO(
2670 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2671 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2674 // If value is passed by pointer we have address passed instead of the value
2675 // itself. No need to extend if the mask value and location share the same
2677 bool ExtendedInMem =
2678 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2679 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2681 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2682 ValVT = VA.getLocVT();
2684 ValVT = VA.getValVT();
2686 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2687 // taken by a return address.
2689 if (CallConv == CallingConv::X86_INTR) {
2690 const X86Subtarget& Subtarget =
2691 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2692 // X86 interrupts may take one or two arguments.
2693 // On the stack there will be no return address as in regular call.
2694 // Offset of last argument need to be set to -4/-8 bytes.
2695 // Where offset of the first argument out of two, should be set to 0 bytes.
2696 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2699 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2700 // changed with more analysis.
2701 // In case of tail call optimization mark all arguments mutable. Since they
2702 // could be overwritten by lowering of arguments in case of a tail call.
2703 if (Flags.isByVal()) {
2704 unsigned Bytes = Flags.getByValSize();
2705 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2706 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2707 // Adjust SP offset of interrupt parameter.
2708 if (CallConv == CallingConv::X86_INTR) {
2709 MFI.setObjectOffset(FI, Offset);
2711 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2713 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2714 VA.getLocMemOffset(), isImmutable);
2716 // Set SExt or ZExt flag.
2717 if (VA.getLocInfo() == CCValAssign::ZExt) {
2718 MFI.setObjectZExt(FI, true);
2719 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2720 MFI.setObjectSExt(FI, true);
2723 // Adjust SP offset of interrupt parameter.
2724 if (CallConv == CallingConv::X86_INTR) {
2725 MFI.setObjectOffset(FI, Offset);
2728 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2729 SDValue Val = DAG.getLoad(
2730 ValVT, dl, Chain, FIN,
2731 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2732 return ExtendedInMem ?
2733 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2737 // FIXME: Get this from tablegen.
2738 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2739 const X86Subtarget &Subtarget) {
2740 assert(Subtarget.is64Bit());
2742 if (Subtarget.isCallingConvWin64(CallConv)) {
2743 static const MCPhysReg GPR64ArgRegsWin64[] = {
2744 X86::RCX, X86::RDX, X86::R8, X86::R9
2746 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2749 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2750 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2752 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2755 // FIXME: Get this from tablegen.
2756 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2757 CallingConv::ID CallConv,
2758 const X86Subtarget &Subtarget) {
2759 assert(Subtarget.is64Bit());
2760 if (Subtarget.isCallingConvWin64(CallConv)) {
2761 // The XMM registers which might contain var arg parameters are shadowed
2762 // in their paired GPR. So we only need to save the GPR to their home
2764 // TODO: __vectorcall will change this.
2768 const Function *Fn = MF.getFunction();
2769 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2770 bool isSoftFloat = Subtarget.useSoftFloat();
2771 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2772 "SSE register cannot be used when SSE is disabled!");
2773 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2774 // Kernel mode asks for SSE to be disabled, so there are no XMM argument
2778 static const MCPhysReg XMMArgRegs64Bit[] = {
2779 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2780 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2782 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2785 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2786 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2787 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2788 return A.getValNo() < B.getValNo();
2792 SDValue X86TargetLowering::LowerFormalArguments(
2793 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2794 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2795 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2796 MachineFunction &MF = DAG.getMachineFunction();
2797 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2798 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2800 const Function *Fn = MF.getFunction();
2801 if (Fn->hasExternalLinkage() &&
2802 Subtarget.isTargetCygMing() &&
2803 Fn->getName() == "main")
2804 FuncInfo->setForceFramePointer(true);
2806 MachineFrameInfo &MFI = MF.getFrameInfo();
2807 bool Is64Bit = Subtarget.is64Bit();
2808 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2811 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2812 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2814 if (CallConv == CallingConv::X86_INTR) {
2815 bool isLegal = Ins.size() == 1 ||
2816 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2817 (!Is64Bit && Ins[1].VT == MVT::i32)));
2819 report_fatal_error("X86 interrupts may take one or two arguments");
2822 // Assign locations to all of the incoming arguments.
2823 SmallVector<CCValAssign, 16> ArgLocs;
2824 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2826 // Allocate shadow area for Win64.
2828 CCInfo.AllocateStack(32, 8);
2830 CCInfo.AnalyzeArguments(Ins, CC_X86);
2832 // In vectorcall calling convention a second pass is required for the HVA
2834 if (CallingConv::X86_VectorCall == CallConv) {
2835 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2838 // The next loop assumes that the locations are in the same order of the
2840 if (!isSortedByValueNo(ArgLocs))
2841 llvm_unreachable("Argument Location list must be sorted before lowering");
2844 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2846 assert(InsIndex < Ins.size() && "Invalid Ins index");
2847 CCValAssign &VA = ArgLocs[I];
2849 if (VA.isRegLoc()) {
2850 EVT RegVT = VA.getLocVT();
2851 if (VA.needsCustom()) {
2853 VA.getValVT() == MVT::v64i1 &&
2854 "Currently the only custom case is when we split v64i1 to 2 regs");
2856 // v64i1 values, in regcall calling convention, that are
2857 // compiled to 32 bit arch, are splited up into two registers.
2859 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2861 const TargetRegisterClass *RC;
2862 if (RegVT == MVT::i32)
2863 RC = &X86::GR32RegClass;
2864 else if (Is64Bit && RegVT == MVT::i64)
2865 RC = &X86::GR64RegClass;
2866 else if (RegVT == MVT::f32)
2867 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2868 else if (RegVT == MVT::f64)
2869 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2870 else if (RegVT == MVT::f80)
2871 RC = &X86::RFP80RegClass;
2872 else if (RegVT == MVT::f128)
2873 RC = &X86::FR128RegClass;
2874 else if (RegVT.is512BitVector())
2875 RC = &X86::VR512RegClass;
2876 else if (RegVT.is256BitVector())
2877 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2878 else if (RegVT.is128BitVector())
2879 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2880 else if (RegVT == MVT::x86mmx)
2881 RC = &X86::VR64RegClass;
2882 else if (RegVT == MVT::i1)
2883 RC = &X86::VK1RegClass;
2884 else if (RegVT == MVT::v8i1)
2885 RC = &X86::VK8RegClass;
2886 else if (RegVT == MVT::v16i1)
2887 RC = &X86::VK16RegClass;
2888 else if (RegVT == MVT::v32i1)
2889 RC = &X86::VK32RegClass;
2890 else if (RegVT == MVT::v64i1)
2891 RC = &X86::VK64RegClass;
2893 llvm_unreachable("Unknown argument type!");
2895 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2896 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2899 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2900 // bits. Insert an assert[sz]ext to capture this, then truncate to the
2902 if (VA.getLocInfo() == CCValAssign::SExt)
2903 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2904 DAG.getValueType(VA.getValVT()));
2905 else if (VA.getLocInfo() == CCValAssign::ZExt)
2906 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2907 DAG.getValueType(VA.getValVT()));
2908 else if (VA.getLocInfo() == CCValAssign::BCvt)
2909 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2911 if (VA.isExtInLoc()) {
2912 // Handle MMX values passed in XMM regs.
2913 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2914 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2915 else if (VA.getValVT().isVector() &&
2916 VA.getValVT().getScalarType() == MVT::i1 &&
2917 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2918 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2919 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2920 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2922 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2925 assert(VA.isMemLoc());
2927 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2930 // If value is passed via pointer - do a load.
2931 if (VA.getLocInfo() == CCValAssign::Indirect)
2933 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2935 InVals.push_back(ArgValue);
2938 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
2939 // Swift calling convention does not require we copy the sret argument
2940 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2941 if (CallConv == CallingConv::Swift)
2944 // All x86 ABIs require that for returning structs by value we copy the
2945 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2946 // the argument into a virtual register so that we can access it from the
2948 if (Ins[I].Flags.isSRet()) {
2949 unsigned Reg = FuncInfo->getSRetReturnReg();
2951 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2952 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2953 FuncInfo->setSRetReturnReg(Reg);
2955 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
2956 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2961 unsigned StackSize = CCInfo.getNextStackOffset();
2962 // Align stack specially for tail calls.
2963 if (shouldGuaranteeTCO(CallConv,
2964 MF.getTarget().Options.GuaranteedTailCallOpt))
2965 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2967 // If the function takes variable number of arguments, make a frame index for
2968 // the start of the first vararg value... for expansion of llvm.va_start. We
2969 // can skip this if there are no va_start calls.
2970 if (MFI.hasVAStart() &&
2971 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2972 CallConv != CallingConv::X86_ThisCall))) {
2973 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
2976 // Figure out if XMM registers are in use.
2977 assert(!(Subtarget.useSoftFloat() &&
2978 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2979 "SSE register cannot be used when SSE is disabled!");
2981 // 64-bit calling conventions support varargs and register parameters, so we
2982 // have to do extra work to spill them in the prologue.
2983 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
2984 // Find the first unallocated argument registers.
2985 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2986 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2987 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2988 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2989 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2990 "SSE register cannot be used when SSE is disabled!");
2992 // Gather all the live in physical registers.
2993 SmallVector<SDValue, 6> LiveGPRs;
2994 SmallVector<SDValue, 8> LiveXMMRegs;
2996 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2997 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2999 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3001 if (!ArgXMMs.empty()) {
3002 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3003 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3004 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3005 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3006 LiveXMMRegs.push_back(
3007 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3012 // Get to the caller-allocated home save location. Add 8 to account
3013 // for the return address.
3014 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3015 FuncInfo->setRegSaveFrameIndex(
3016 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3017 // Fixup to set vararg frame on shadow area (4 x i64).
3019 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3021 // For X86-64, if there are vararg parameters that are passed via
3022 // registers, then we must store them to their spots on the stack so
3023 // they may be loaded by dereferencing the result of va_next.
3024 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3025 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3026 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3027 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3030 // Store the integer parameter registers.
3031 SmallVector<SDValue, 8> MemOps;
3032 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3033 getPointerTy(DAG.getDataLayout()));
3034 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3035 for (SDValue Val : LiveGPRs) {
3036 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3037 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3039 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3040 MachinePointerInfo::getFixedStack(
3041 DAG.getMachineFunction(),
3042 FuncInfo->getRegSaveFrameIndex(), Offset));
3043 MemOps.push_back(Store);
3047 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3048 // Now store the XMM (fp + vector) parameter registers.
3049 SmallVector<SDValue, 12> SaveXMMOps;
3050 SaveXMMOps.push_back(Chain);
3051 SaveXMMOps.push_back(ALVal);
3052 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3053 FuncInfo->getRegSaveFrameIndex(), dl));
3054 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3055 FuncInfo->getVarArgsFPOffset(), dl));
3056 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3058 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3059 MVT::Other, SaveXMMOps));
3062 if (!MemOps.empty())
3063 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3066 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3067 // Find the largest legal vector type.
3068 MVT VecVT = MVT::Other;
3069 // FIXME: Only some x86_32 calling conventions support AVX512.
3070 if (Subtarget.hasAVX512() &&
3071 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3072 CallConv == CallingConv::Intel_OCL_BI)))
3073 VecVT = MVT::v16f32;
3074 else if (Subtarget.hasAVX())
3076 else if (Subtarget.hasSSE2())
3079 // We forward some GPRs and some vector types.
3080 SmallVector<MVT, 2> RegParmTypes;
3081 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3082 RegParmTypes.push_back(IntVT);
3083 if (VecVT != MVT::Other)
3084 RegParmTypes.push_back(VecVT);
3086 // Compute the set of forwarded registers. The rest are scratch.
3087 SmallVectorImpl<ForwardedRegister> &Forwards =
3088 FuncInfo->getForwardedMustTailRegParms();
3089 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3091 // Conservatively forward AL on x86_64, since it might be used for varargs.
3092 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3093 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3094 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3097 // Copy all forwards from physical to virtual registers.
3098 for (ForwardedRegister &F : Forwards) {
3099 // FIXME: Can we use a less constrained schedule?
3100 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3101 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3102 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3106 // Some CCs need callee pop.
3107 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3108 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3109 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3110 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3111 // X86 interrupts must pop the error code if present
3112 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
3114 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3115 // If this is an sret function, the return should pop the hidden pointer.
3116 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3117 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3118 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3119 FuncInfo->setBytesToPopOnReturn(4);
3123 // RegSaveFrameIndex is X86-64 only.
3124 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3125 if (CallConv == CallingConv::X86_FastCall ||
3126 CallConv == CallingConv::X86_ThisCall)
3127 // fastcc functions can't have varargs.
3128 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3131 FuncInfo->setArgumentStackSize(StackSize);
3133 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3134 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3135 if (Personality == EHPersonality::CoreCLR) {
3137 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3138 // that we'd prefer this slot be allocated towards the bottom of the frame
3139 // (i.e. near the stack pointer after allocating the frame). Every
3140 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3141 // offset from the bottom of this and each funclet's frame must be the
3142 // same, so the size of funclets' (mostly empty) frames is dictated by
3143 // how far this slot is from the bottom (since they allocate just enough
3144 // space to accommodate holding this slot at the correct offset).
3145 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3146 EHInfo->PSPSymFrameIdx = PSPSymFI;
// Lowers one outgoing call argument whose assigned location VA is a memory
// slot. Chain is the incoming token chain, Arg the value being passed and
// Flags its argument flags. A byval argument is copied to the slot address
// through CreateCopyOfByValArgument; every other argument is written with a
// plain store. The node built by whichever path is taken is returned.
// NOTE(review): the declaration of the DAG parameter and the operands of the
// ISD::ADD below are not visible in this listing - presumably the address is
// StackPtr + LocMemOffset; confirm against the full source.
3153 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3154 SDValue Arg, const SDLoc &dl,
3156 const CCValAssign &VA,
3157 ISD::ArgFlagsTy Flags) const {
// Offset of the argument's slot as recorded in its location assignment.
3158 unsigned LocMemOffset = VA.getLocMemOffset();
3159 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3160 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
// Byval arguments are copied rather than stored as a single value.
3162 if (Flags.isByVal())
3163 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
// The pointer info is built from the same offset that formed the address.
3165 return DAG.getStore(
3166 Chain, dl, Arg, PtrOff,
3167 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3170 /// Emit a load of return address if tail call
3171 /// optimization is performed and it is required.
/// On exit OutRetAddr holds the load node created here. The returned value is
/// result number 1 of that same node, which LowerCall assigns to its Chain.
/// IsTailCall, Is64Bit and FPDiff are not read on any line visible in this
/// listing.
3172 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3173 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3174 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3175 // Adjust the Return address stack slot.
3176 EVT VT = getPointerTy(DAG.getDataLayout());
// OutRetAddr temporarily holds the frame index that the load below reads from.
3177 OutRetAddr = getReturnAddressFrameIndex(DAG);
3179 // Load the "old" Return address.
3180 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3181 return SDValue(OutRetAddr.getNode(), 1);
3184 /// Emit a store of the return address if tail call
3185 /// optimization is performed and it is required (FPDiff!=0).
/// RetAddrFrIdx is the value that gets stored, PtrVT the type used for the new
/// frame index node, and SlotSize the size of the fixed object created for the
/// relocated return address. When FPDiff is zero Chain is returned untouched.
/// NOTE(review): the last argument of CreateFixedObject and the final return
/// statement are not visible in this listing; presumably the updated Chain is
/// returned - confirm against the full source.
3186 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3187 SDValue Chain, SDValue RetAddrFrIdx,
3188 EVT PtrVT, unsigned SlotSize,
3189 int FPDiff, const SDLoc &dl) {
3190 // Store the return address to the appropriate stack slot.
3191 if (!FPDiff) return Chain;
3192 // Calculate the new stack slot for the return address.
// The fixed object is SlotSize bytes wide and placed at offset FPDiff - SlotSize.
3193 int NewReturnAddrFI =
3194 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3196 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3197 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3198 MachinePointerInfo::getFixedStack(
3199 DAG.getMachineFunction(), NewReturnAddrFI));
3203 /// Returns a vector_shuffle mask for an movs{s|d}, movd
3204 /// operation of specified width.
/// The result is the shuffle node built over V1 and V2 with that mask, not the
/// mask itself. The first mask entry is NumElems, the element count of VT.
/// NOTE(review): the declaration of V2 and the body of the loop below are not
/// visible in this listing; presumably the loop appends i for each remaining
/// lane - confirm against the full source.
3205 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3207 unsigned NumElems = VT.getVectorNumElements();
3208 SmallVector<int, 8> Mask;
3209 Mask.push_back(NumElems);
3210 for (unsigned i = 1; i != NumElems; ++i)
3212 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
// Lowers the outgoing call described by CLI into SelectionDAG nodes.
// Visible steps, in order:
//   1. Decide whether the call may be emitted as a tail call or sibcall
//      (CLI.IsTailCall is bound by reference, so the decision is written back).
//   2. Assign a location to every outgoing argument with CC_X86.
//   3. Open the call sequence and place each argument either in RegsToPass or
//      in a stack store (MemOpChains, or MemOpChains2 for non-sibcall tail
//      calls).
//   4. Rewrite the callee into a target global address / external symbol node.
//   5. Emit X86ISD::TC_RETURN for tail calls, or X86ISD::CALL followed by
//      CALLSEQ_END and LowerCallResult, which is handed InVals.
// NOTE(review): this listing omits a number of short source lines (closing
// braces, `else`, brief assignments and declarations), so several conditionals
// below appear without a body; consult the full source before changing logic.
3216 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3217 SmallVectorImpl<SDValue> &InVals) const {
3218 SelectionDAG &DAG = CLI.DAG;
3220 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3221 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3222 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3223 SDValue Chain = CLI.Chain;
3224 SDValue Callee = CLI.Callee;
3225 CallingConv::ID CallConv = CLI.CallConv;
3226 bool &isTailCall = CLI.IsTailCall;
3227 bool isVarArg = CLI.IsVarArg;
3229 MachineFunction &MF = DAG.getMachineFunction();
3230 bool Is64Bit = Subtarget.is64Bit();
3231 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3232 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3233 bool IsSibcall = false;
3234 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3235 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
// A function using the X86 interrupt convention is never a valid call target.
3237 if (CallConv == CallingConv::X86_INTR)
3238 report_fatal_error("X86 interrupts may not be called directly");
3240 if (Attr.getValueAsString() == "true")
3243 if (Subtarget.isPICStyleGOT() &&
3244 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3245 // If we are using a GOT, disable tail calls to external symbols with
3246 // default visibility. Tail calling such a symbol requires using a GOT
3247 // relocation, which forces early binding of the symbol. This breaks code
3248 // that require lazy function symbol resolution. Using musttail or
3249 // GuaranteedTailCallOpt will override this.
3250 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3251 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3252 G->getGlobal()->hasDefaultVisibility()))
3256 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3258 // Force this to be a tail call. The verifier rules are enough to ensure
3259 // that we can lower this successfully without moving the return address
3262 } else if (isTailCall) {
3263 // Check if it's really possible to do a tail call.
3264 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3265 isVarArg, SR != NotStructReturn,
3266 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3267 Outs, OutVals, Ins, DAG);
3269 // Sibcalls are automatically detected tailcalls which do not require
3271 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3278 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3279 "Var args not supported with calling convention fastcc, ghc or hipe");
3281 // Analyze operands of the call, assigning locations to each operand.
3282 SmallVector<CCValAssign, 16> ArgLocs;
3283 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3285 // Allocate shadow area for Win64.
3287 CCInfo.AllocateStack(32, 8);
3289 CCInfo.AnalyzeArguments(Outs, CC_X86);
3291 // In vectorcall calling convention a second pass is required for the HVA
3293 if (CallingConv::X86_VectorCall == CallConv) {
3294 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3297 // Get a count of how many bytes are to be pushed on the stack.
3298 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3300 // This is a sibcall. The memory operands are available in caller's
3301 // own caller's stack.
3303 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3304 canGuaranteeTCO(CallConv))
3305 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
// For a tail call that is neither a sibcall nor musttail, FPDiff is the number
// of bytes this function pops on return minus the bytes the callee needs.
3308 if (isTailCall && !IsSibcall && !IsMustTail) {
3309 // Lower arguments at fp - stackoffset + fpdiff.
3310 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3312 FPDiff = NumBytesCallerPushed - NumBytes;
3314 // Set the delta of movement of the returnaddr stackslot.
3315 // But only set if delta is greater than previous delta.
3316 if (FPDiff < X86Info->getTCReturnAddrDelta())
3317 X86Info->setTCReturnAddrDelta(FPDiff);
// Byte counts handed to CALLSEQ_START and CALLSEQ_END respectively.
3320 unsigned NumBytesToPush = NumBytes;
3321 unsigned NumBytesToPop = NumBytes;
3323 // If we have an inalloca argument, all stack space has already been allocated
3324 // for us and be right at the top of the stack. We don't support multiple
3325 // arguments passed in memory when using inalloca.
3326 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3328 if (!ArgLocs.back().isMemLoc())
3329 report_fatal_error("cannot use inalloca attribute on a register "
3331 if (ArgLocs.back().getLocMemOffset() != 0)
3332 report_fatal_error("any parameter with the inalloca attribute must be "
3333 "the only memory argument");
3337 Chain = DAG.getCALLSEQ_START(
3338 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3340 SDValue RetAddrFrIdx;
3341 // Load return address for tail calls.
3342 if (isTailCall && FPDiff)
3343 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3344 Is64Bit, FPDiff, dl);
// RegsToPass collects (physical register, value) pairs; MemOpChains collects
// the stores created for arguments passed in memory.
3346 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3347 SmallVector<SDValue, 8> MemOpChains;
3350 // The next loop assumes that the locations are in the same order of the
3352 if (!isSortedByValueNo(ArgLocs))
3353 llvm_unreachable("Argument Location list must be sorted before lowering");
3355 // Walk the register/memloc assignments, inserting copies/loads. In the case
3356 // of tail call optimization arguments are handle later.
3357 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3358 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3360 assert(OutIndex < Outs.size() && "Invalid Out index");
3361 // Skip inalloca arguments, they have already been written.
3362 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3363 if (Flags.isInAlloca())
3366 CCValAssign &VA = ArgLocs[I];
3367 EVT RegVT = VA.getLocVT();
3368 SDValue Arg = OutVals[OutIndex];
3369 bool isByVal = Flags.isByVal();
3371 // Promote the value if needed.
3372 switch (VA.getLocInfo()) {
3373 default: llvm_unreachable("Unknown loc info!");
3374 case CCValAssign::Full: break;
3375 case CCValAssign::SExt:
3376 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3378 case CCValAssign::ZExt:
3379 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3381 case CCValAssign::AExt:
3382 if (Arg.getValueType().isVector() &&
3383 Arg.getValueType().getVectorElementType() == MVT::i1)
3384 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3385 else if (RegVT.is128BitVector()) {
3386 // Special case: passing MMX values in XMM registers.
3387 Arg = DAG.getBitcast(MVT::i64, Arg);
3388 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3389 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3391 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3393 case CCValAssign::BCvt:
3394 Arg = DAG.getBitcast(RegVT, Arg);
3396 case CCValAssign::Indirect: {
3397 // Store the argument.
3398 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3399 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3400 Chain = DAG.getStore(
3401 Chain, dl, Arg, SpillSlot,
3402 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3408 if (VA.needsCustom()) {
3409 assert(VA.getValVT() == MVT::v64i1 &&
3410 "Currently the only custom case is when we split v64i1 to 2 regs");
3411 // Split v64i1 value into two registers
// Note that ++I consumes the following location as the second register.
3412 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3414 } else if (VA.isRegLoc()) {
3415 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3416 if (isVarArg && IsWin64) {
3417 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3418 // shadow reg if callee is a varargs function.
3419 unsigned ShadowReg = 0;
3420 switch (VA.getLocReg()) {
3421 case X86::XMM0: ShadowReg = X86::RCX; break;
3422 case X86::XMM1: ShadowReg = X86::RDX; break;
3423 case X86::XMM2: ShadowReg = X86::R8; break;
3424 case X86::XMM3: ShadowReg = X86::R9; break;
3427 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
// Memory arguments are stored here only for ordinary calls, or for byval
// arguments of non-sibcall tail calls.
3429 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3430 assert(VA.isMemLoc());
3431 if (!StackPtr.getNode())
3432 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3433 getPointerTy(DAG.getDataLayout()));
3434 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3435 dl, DAG, VA, Flags));
// Merge all argument stores into a single chain value.
3439 if (!MemOpChains.empty())
3440 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3442 if (Subtarget.isPICStyleGOT()) {
3443 // ELF / PIC requires GOT in the EBX register before function calls via PLT
3446 RegsToPass.push_back(std::make_pair(
3447 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3448 getPointerTy(DAG.getDataLayout()))));
3450 // If we are tail calling and generating PIC/GOT style code load the
3451 // address of the callee into ECX. The value in ecx is used as target of
3452 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3453 // for tail calls on PIC/GOT architectures. Normally we would just put the
3454 // address of GOT into ebx and then call target@PLT. But for tail calls
3455 // ebx would be restored (since ebx is callee saved) before jumping to the
3458 // Note: The actual moving to ECX is done further down.
3459 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3460 if (G && !G->getGlobal()->hasLocalLinkage() &&
3461 G->getGlobal()->hasDefaultVisibility())
3462 Callee = LowerGlobalAddress(Callee, DAG);
3463 else if (isa<ExternalSymbolSDNode>(Callee))
3464 Callee = LowerExternalSymbol(Callee, DAG);
3468 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3469 // From AMD64 ABI document:
3470 // For calls that may call functions that use varargs or stdargs
3471 // (prototype-less calls or calls to functions containing ellipsis (...) in
3472 // the declaration) %al is used as hidden argument to specify the number
3473 // of SSE registers used. The contents of %al do not need to match exactly
3474 // the number of registers, but must be an ubound on the number of SSE
3475 // registers used and is in the range 0 - 8 inclusive.
3477 // Count the number of XMM registers allocated.
3478 static const MCPhysReg XMMArgRegs[] = {
3479 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3480 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3482 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3483 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3484 && "SSE registers cannot be used when SSE is disabled");
3486 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3487 DAG.getConstant(NumXMMRegs, dl,
// A variadic musttail call re-passes every register recorded in
// getForwardedMustTailRegParms(), reading each one back from its virtual
// register.
3491 if (isVarArg && IsMustTail) {
3492 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3493 for (const auto &F : Forwards) {
3494 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3495 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3499 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3500 // don't need this because the eligibility check rejects calls that require
3501 // shuffling arguments passed in memory.
3502 if (!IsSibcall && isTailCall) {
3503 // Force all the incoming stack arguments to be loaded from the stack
3504 // before any new outgoing arguments are stored to the stack, because the
3505 // outgoing stack slots may alias the incoming argument stack slots, and
3506 // the alias isn't otherwise explicit. This is slightly more conservative
3507 // than necessary, because it means that each store effectively depends
3508 // on every argument instead of just those arguments it would clobber.
3509 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3511 SmallVector<SDValue, 8> MemOpChains2;
3514 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3516 CCValAssign &VA = ArgLocs[I];
3518 if (VA.isRegLoc()) {
3519 if (VA.needsCustom()) {
3520 assert((CallConv == CallingConv::X86_RegCall) &&
3521 "Expecting custome case only in regcall calling convention");
3522 // This means that we are in special case where one argument was
3523 // passed through two register locations - Skip the next location
3530 assert(VA.isMemLoc());
3531 SDValue Arg = OutVals[OutsIndex];
3532 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3533 // Skip inalloca arguments. They don't require any work.
3534 if (Flags.isInAlloca())
3536 // Create frame index.
// The slot offset is the assigned location offset shifted by FPDiff; the slot
// size is the location type's bit width rounded up to whole bytes.
3537 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3538 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3539 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3540 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3542 if (Flags.isByVal()) {
3543 // Copy relative to framepointer.
3544 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3545 if (!StackPtr.getNode())
3546 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3547 getPointerTy(DAG.getDataLayout()));
3548 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3551 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3555 // Store relative to framepointer.
3556 MemOpChains2.push_back(DAG.getStore(
3557 ArgChain, dl, Arg, FIN,
3558 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3562 if (!MemOpChains2.empty())
3563 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3565 // Store the return address to the appropriate stack slot.
3566 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3567 getPointerTy(DAG.getDataLayout()),
3568 RegInfo->getSlotSize(), FPDiff, dl);
3571 // Build a sequence of copy-to-reg nodes chained together with token chain
3572 // and flag operands which copy the outgoing args into registers.
3574 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3575 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3576 RegsToPass[i].second, InFlag);
3577 InFlag = Chain.getValue(1);
3580 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3581 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3582 // In the 64-bit large code model, we have to make all calls
3583 // through a register, since the call instruction's 32-bit
3584 // pc-relative offset may not be large enough to hold the whole
3586 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3587 // If the callee is a GlobalAddress node (quite common, every direct call
3588 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack
3590 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3592 // We should use extra load for direct calls to dllimported functions in
3594 const GlobalValue *GV = G->getGlobal();
3595 if (!GV->hasDLLImportStorageClass()) {
3596 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3598 Callee = DAG.getTargetGlobalAddress(
3599 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3601 if (OpFlags == X86II::MO_GOTPCREL) {
3603 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3604 getPointerTy(DAG.getDataLayout()), Callee);
3605 // Add extra indirection
3606 Callee = DAG.getLoad(
3607 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3608 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3611 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3612 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3613 unsigned char OpFlags =
3614 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3616 Callee = DAG.getTargetExternalSymbol(
3617 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3618 } else if (Subtarget.isTarget64BitILP32() &&
3619 Callee->getValueType(0) == MVT::i32) {
3620 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
3621 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3624 // Returns a chain & a flag for retval copy to use.
3625 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3626 SmallVector<SDValue, 8> Ops;
// A non-sibcall tail call closes its call sequence before the call node is
// built, with a callee-pop amount of zero.
3628 if (!IsSibcall && isTailCall) {
3629 Chain = DAG.getCALLSEQ_END(Chain,
3630 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3631 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3632 InFlag = Chain.getValue(1);
// Operand order of the call node: chain, callee, (FPDiff), argument registers,
// register mask, and finally the glue value when one exists.
3635 Ops.push_back(Chain);
3636 Ops.push_back(Callee);
3639 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3641 // Add argument registers to the end of the list so that they are known live
3643 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3644 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3645 RegsToPass[i].second.getValueType()));
3647 // Add a register mask operand representing the call-preserved registers.
3648 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3649 assert(Mask && "Missing call preserved mask for calling convention");
3651 // If this is an invoke in a 32-bit function using a funclet-based
3652 // personality, assume the function clobbers all registers. If an exception
3653 // is thrown, the runtime will not restore CSRs.
3654 // FIXME: Model this more precisely so that we can register allocate across
3655 // the normal edge and spill and fill across the exceptional edge.
3656 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3657 const Function *CallerFn = MF.getFunction();
3658 EHPersonality Pers =
3659 CallerFn->hasPersonalityFn()
3660 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3661 : EHPersonality::Unknown;
3662 if (isFuncletEHPersonality(Pers))
3663 Mask = RegInfo->getNoPreservedMask();
3666 Ops.push_back(DAG.getRegisterMask(Mask));
3668 if (InFlag.getNode())
3669 Ops.push_back(InFlag);
3673 //// If this is the first return lowered for this function, add the regs
3674 //// to the liveout set for the function.
3675 // This isn't right, although it's probably harmless on x86; liveouts
3676 // should be computed from returns not tail calls. Consider a void
3677 // function making a tail call to a function returning int.
3678 MF.getFrameInfo().setHasTailCall();
3679 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3682 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3683 InFlag = Chain.getValue(1);
3685 // Create the CALLSEQ_END node.
3686 unsigned NumBytesForCalleeToPop;
3687 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3688 DAG.getTarget().Options.GuaranteedTailCallOpt))
3689 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3690 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3691 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3692 SR == StackStructReturn)
3693 // If this is a call to a struct-return function, the callee
3694 // pops the hidden struct pointer, so we have to push it back.
3695 // This is common for Darwin/X86, Linux & Mingw32 targets.
3696 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3697 NumBytesForCalleeToPop = 4;
3699 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3701 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3702 // No need to reset the stack after the call if the call doesn't return. To
3703 // make the MI verify, we'll pretend the callee does it for us.
3704 NumBytesForCalleeToPop = NumBytes;
3707 // Returns a flag for retval copy to use.
3709 Chain = DAG.getCALLSEQ_END(Chain,
3710 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3711 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3714 InFlag = Chain.getValue(1);
3717 // Handle result values, copying them out of physregs into vregs that we
3719 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3720 Ins, dl, DAG, InVals);
3723 //===----------------------------------------------------------------------===//
3724 // Fast Calling Convention (tail call) implementation
3725 //===----------------------------------------------------------------------===//
3727 // Like stdcall, the callee cleans up the arguments, except that ECX is
3728 // reserved for storing the tail called function address. Only 2 registers are
3729 // free for argument passing (inreg). Tail call optimization is performed when:
3731 // * tailcallopt is enabled
3732 // * caller/callee are fastcc
3733 // On X86_64 architecture with GOT-style position independent code only local
3734 // (within module) calls are supported at the moment.
3735 // To keep the stack aligned according to the platform ABI, the function
3736 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3737 // multiple of the stack alignment. (Dynamic linkers, e.g. Darwin's dyld, need this.)
3738 // If a tail-called callee has more arguments than the caller, the
3739 // caller needs to make sure that there is room to move the RETADDR to. This is
3740 // achieved by reserving an area the size of the argument delta right after the
3741 // original RETADDR, but before the saved framepointer or the spilled registers
3742 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
3754 /// Make the stack size align e.g 16n + 12 aligned for a 16-byte align
/// More generally, both assignments below leave Offset congruent to
/// (StackAlignment - SlotSize) modulo StackAlignment, so that adding one slot
/// of SlotSize bytes brings the total to a multiple of the stack alignment.
/// StackSize is the unpadded size. DAG is not read on any visible line.
/// NOTE(review): the `else` separating the two assignments and the final
/// return are not visible in this listing; presumably Offset is returned -
/// confirm against the full source.
3757 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3758 SelectionDAG& DAG) const {
3759 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3760 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3761 unsigned StackAlignment = TFI.getStackAlignment();
// The mask arithmetic relies on StackAlignment being a power of two
// (assumption - not checked here).
3762 uint64_t AlignMask = StackAlignment - 1;
3763 int64_t Offset = StackSize;
3764 unsigned SlotSize = RegInfo->getSlotSize();
3765 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3766 // Number smaller than 12 so just add the difference.
3767 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3769 // Mask out lower bits, add stackalignment once plus the 12 bytes.
3770 Offset = ((~AlignMask) & Offset) + StackAlignment +
3771 (StackAlignment-SlotSize);
3776 /// Return true if the given stack call argument is already available in the
3777 /// same position (relatively) of the caller's incoming argument stack.
/// Arg is the outgoing value, Offset the stack offset assigned to it for the
/// call, Flags its argument flags and VA its assigned location. The function
/// tries to trace Arg back to a frame index FI, then requires FI to be a fixed
/// object whose offset equals Offset and whose size equals Bytes. When the
/// location type is wider than the value, the object's zext/sext markers must
/// also agree with Flags.
/// NOTE(review): most early-exit lines are not visible in this listing;
/// presumably each failed check returns false - confirm against the full
/// source.
3779 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3780 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3781 const X86InstrInfo *TII, const CCValAssign &VA) {
// Default size to compare: the value's width in bytes; byval paths override it.
3782 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3785 // Look through nodes that don't alter the bits of the incoming value.
3786 unsigned Op = Arg.getOpcode();
3787 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3788 Arg = Arg.getOperand(0);
// A truncate is looked through only when its input is an AssertZext to exactly
// the truncated type.
3791 if (Op == ISD::TRUNCATE) {
3792 const SDValue &TruncInput = Arg.getOperand(0);
3793 if (TruncInput.getOpcode() == ISD::AssertZext &&
3794 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3795 Arg.getValueType()) {
3796 Arg = TruncInput.getOperand(0);
// Case 1: the value is copied from a virtual register; inspect the instruction
// that defines it.
3804 if (Arg.getOpcode() == ISD::CopyFromReg) {
3805 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3806 if (!TargetRegisterInfo::isVirtualRegister(VR))
3808 MachineInstr *Def = MRI->getVRegDef(VR);
3811 if (!Flags.isByVal()) {
3812 if (!TII->isLoadFromStackSlot(*Def, FI))
// For byval, the defining instruction is accepted when it is an LEA whose
// operand 1 is a frame index.
3815 unsigned Opcode = Def->getOpcode();
3816 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3817 Opcode == X86::LEA64_32r) &&
3818 Def->getOperand(1).isFI()) {
3819 FI = Def->getOperand(1).getIndex();
3820 Bytes = Flags.getByValSize();
// Case 2: the value is a load; its base pointer is checked for a frame index.
3824 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3825 if (Flags.isByVal())
3826 // ByVal argument is passed in as a pointer but it's now being
3827 // dereferenced. e.g.
3828 // define @foo(%struct.X* %A) {
3829 // tail call @bar(%struct.X* byval %A)
3832 SDValue Ptr = Ld->getBasePtr();
3833 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3836 FI = FINode->getIndex();
// Case 3: a byval argument given directly as a frame index.
3837 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3838 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3839 FI = FINode->getIndex();
3840 Bytes = Flags.getByValSize();
3844 assert(FI != INT_MAX);
3845 if (!MFI.isFixedObjectIndex(FI))
3848 if (Offset != MFI.getObjectOffset(FI))
3851 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3852 // If the argument location is wider than the argument type, check that any
3853 // extension flags match.
3854 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3855 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3860 return Bytes == MFI.getObjectSize(FI);
3863 /// Check whether the call is eligible for tail call optimization. Targets
3864 /// that want to do tail call optimization should implement this function.
/// Callee, CalleeCC and isVarArg describe the call target. isCalleeStructRet
/// and isCallerStructRet flag struct-return semantics on either side. RetTy is
/// the call's return type. Outs/OutVals are the outgoing arguments and Ins the
/// values produced by the call.
/// The visible checks cover: the callee's convention, x86_fp80 return
/// mismatch, Win64 caller/callee mismatch, the GuaranteedTailCallOpt path,
/// stack realignment, struct return, vararg calls with non-register arguments,
/// unused x87 stack results, result and preserved-register compatibility,
/// stack arguments that are not already in place, register pressure for the
/// call address on 32-bit targets, and callee-pop agreement.
/// NOTE(review): most `return false` / `return true` lines are not visible in
/// this listing; the outcome of each check is therefore inferred from the
/// surrounding comments - confirm against the full source.
3865 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3866 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3867 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3868 const SmallVectorImpl<ISD::OutputArg> &Outs,
3869 const SmallVectorImpl<SDValue> &OutVals,
3870 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3871 if (!mayTailCallThisCC(CalleeCC))
3874 // If -tailcallopt is specified, make fastcc functions tail-callable.
3875 MachineFunction &MF = DAG.getMachineFunction();
3876 const Function *CallerF = MF.getFunction();
3878 // If the function return type is x86_fp80 and the callee return type is not,
3879 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3880 // perform a tailcall optimization here.
3881 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3884 CallingConv::ID CallerCC = CallerF->getCallingConv();
3885 bool CCMatch = CallerCC == CalleeCC;
3886 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3887 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3889 // Win64 functions have extra shadow space for argument homing. Don't do the
3890 // sibcall if the caller and callee have mismatched expectations for this
3892 if (IsCalleeWin64 != IsCallerWin64)
// With GuaranteedTailCallOpt the decision rests on the convention supporting
// guaranteed TCO and on caller and callee using the same convention.
3895 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3896 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3901 // Look for obvious safe cases to perform tail call optimization that do not
3902 // require ABI changes. This is what gcc calls sibcall.
3904 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3905 // emit a special epilogue.
3906 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3907 if (RegInfo->needsStackRealignment(MF))
3910 // Also avoid sibcall optimization if either caller or callee uses struct
3911 // return semantics.
3912 if (isCalleeStructRet || isCallerStructRet)
3915 // Do not sibcall optimize vararg calls unless all arguments are passed via
3917 LLVMContext &C = *DAG.getContext();
3918 if (isVarArg && !Outs.empty()) {
3919 // Optimizing for varargs on Win64 is unlikely to be safe without
3920 // additional testing.
3921 if (IsCalleeWin64 || IsCallerWin64)
3924 SmallVector<CCValAssign, 16> ArgLocs;
3925 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3927 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3928 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3929 if (!ArgLocs[i].isRegLoc())
3933 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3934 // stack. Therefore, if it's not used by the call it is not safe to optimize
3935 // this into a sibcall.
3936 bool Unused = false;
3937 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3944 SmallVector<CCValAssign, 16> RVLocs;
3945 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3946 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3947 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3948 CCValAssign &VA = RVLocs[i];
3949 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3954 // Check that the call results are passed in the same way.
3955 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3956 RetCC_X86, RetCC_X86))
3958 // The callee has to preserve all registers the caller needs to preserve.
3959 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3960 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3962 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3963 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
// Number of stack bytes the callee's arguments occupy; stays 0 when there are
// no outgoing arguments.
3967 unsigned StackArgsSize = 0;
3969 // If the callee takes no arguments then go on to check the results of the
3971 if (!Outs.empty()) {
3972 // Check if stack adjustment is needed. For now, do not do this if any
3973 // argument is passed on the stack.
3974 SmallVector<CCValAssign, 16> ArgLocs;
3975 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3977 // Allocate shadow area for Win64
3979 CCInfo.AllocateStack(32, 8);
3981 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3982 StackArgsSize = CCInfo.getNextStackOffset();
3984 if (CCInfo.getNextStackOffset()) {
3985 // Check if the arguments are already laid out in the right way as
3986 // the caller's fixed stack objects.
3987 MachineFrameInfo &MFI = MF.getFrameInfo();
3988 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3989 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3990 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3991 CCValAssign &VA = ArgLocs[i];
3992 SDValue Arg = OutVals[i];
3993 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3994 if (VA.getLocInfo() == CCValAssign::Indirect)
3996 if (!VA.isRegLoc()) {
3997 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4004 bool PositionIndependent = isPositionIndependent();
4005 // If the tailcall address may be in a register, then make sure it's
4006 // possible to register allocate for it. In 32-bit, the call address can
4007 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4008 // callee-saved registers are restored. These happen to be the same
4009 // registers used to pass 'inreg' arguments so watch out for those.
4010 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4011 !isa<ExternalSymbolSDNode>(Callee)) ||
4012 PositionIndependent)) {
4013 unsigned NumInRegs = 0;
4014 // In PIC we need an extra register to formulate the address computation
4016 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4018 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4019 CCValAssign &VA = ArgLocs[i];
4022 unsigned Reg = VA.getLocReg();
4025 case X86::EAX: case X86::EDX: case X86::ECX:
4026 if (++NumInRegs == MaxInRegs)
4033 const MachineRegisterInfo &MRI = MF.getRegInfo();
4034 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
// Whether the callee pops its own stack arguments must agree with what this
// function itself pops on return.
4038 bool CalleeWillPop =
4039 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4040 MF.getTarget().Options.GuaranteedTailCallOpt);
4042 if (unsigned BytesToPop =
4043 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4044 // If we have bytes to pop, the callee must pop them.
4045 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4046 if (!CalleePopMatches)
4048 } else if (CalleeWillPop && StackArgsSize > 0) {
4049 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4057 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4058 const TargetLibraryInfo *libInfo) const {
// Thin forwarder: hands both arguments to the free function
// X86::createFastISel and returns whatever it produces.
4059 return X86::createFastISel(funcInfo, libInfo);
4062 //===----------------------------------------------------------------------===//
4063 // Other Lowering Hooks
4064 //===----------------------------------------------------------------------===//
4066 static bool MayFoldLoad(SDValue Op) {
4067 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4070 static bool MayFoldIntoStore(SDValue Op) {
4071 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4074 static bool MayFoldIntoZeroExtend(SDValue Op) {
4075 if (Op.hasOneUse()) {
4076 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4077 return (ISD::ZERO_EXTEND == Opcode);
/// Classify \p Opcode against a fixed list of X86ISD shuffle opcodes.
/// Any opcode without a case label takes the default label and yields false.
/// NOTE(review): the value returned for the listed cases is presumably true
/// (they are all X86ISD shuffle-style nodes) - confirm against the full
/// switch.
4082 static bool isTargetShuffle(unsigned Opcode) {
4084 default: return false;
4085 case X86ISD::BLENDI:
4086 case X86ISD::PSHUFB:
4087 case X86ISD::PSHUFD:
4088 case X86ISD::PSHUFHW:
4089 case X86ISD::PSHUFLW:
4091 case X86ISD::INSERTPS:
4092 case X86ISD::PALIGNR:
4093 case X86ISD::VSHLDQ:
4094 case X86ISD::VSRLDQ:
4095 case X86ISD::MOVLHPS:
4096 case X86ISD::MOVLHPD:
4097 case X86ISD::MOVHLPS:
4098 case X86ISD::MOVLPS:
4099 case X86ISD::MOVLPD:
4100 case X86ISD::MOVSHDUP:
4101 case X86ISD::MOVSLDUP:
4102 case X86ISD::MOVDDUP:
4105 case X86ISD::UNPCKL:
4106 case X86ISD::UNPCKH:
4107 case X86ISD::VBROADCAST:
4108 case X86ISD::VPERMILPI:
4109 case X86ISD::VPERMILPV:
4110 case X86ISD::VPERM2X128:
4111 case X86ISD::VPERMIL2:
4112 case X86ISD::VPERMI:
4113 case X86ISD::VPPERM:
4114 case X86ISD::VPERMV:
4115 case X86ISD::VPERMV3:
4116 case X86ISD::VPERMIV3:
4117 case X86ISD::VZEXT_MOVL:
/// Classify \p Opcode against the shuffle opcodes listed below; going by the
/// function name these are the shuffles whose mask is a variable operand
/// rather than an immediate. Opcodes without a case label yield false.
/// NOTE(review): the listed cases presumably yield true, and a further group
/// of 'faux' shuffle opcodes follows the last comment - confirm both against
/// the full switch.
4122 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4124 default: return false;
4126 case X86ISD::PSHUFB:
4127 case X86ISD::VPERMILPV:
4128 case X86ISD::VPERMIL2:
4129 case X86ISD::VPPERM:
4130 case X86ISD::VPERMV:
4131 case X86ISD::VPERMV3:
4132 case X86ISD::VPERMIV3:
4134 // 'Faux' Target Shuffles.
/// Return a frame-index node for the return-address slot, typed with the
/// pointer type of the DAG's data layout. The index is cached in
/// X86MachineFunctionInfo: when the cached value is 0 a fixed frame object of
/// slot size is created and stored back with setRAIndex.
4140 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4141 MachineFunction &MF = DAG.getMachineFunction();
4142 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4143 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4144 int ReturnAddrIndex = FuncInfo->getRAIndex();
// An index of 0 is treated as "not created yet".
4146 if (ReturnAddrIndex == 0) {
4147 // Set up a frame object for the return address.
4148 unsigned SlotSize = RegInfo->getSlotSize();
4149 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4152 FuncInfo->setRAIndex(ReturnAddrIndex);
4155 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
/// Decide whether \p Offset may be used as an address displacement under code
/// model \p M. The checks run in order: the offset must fit a signed 32-bit
/// immediate; offsets without a symbolic displacement get no further checks;
/// only the Small and Kernel models are examined beyond that.
/// NOTE(review): the outcome of each check is described by the comment that
/// precedes it - confirm against the return statements.
4158 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4159 bool hasSymbolicDisplacement) {
4160 // Offset should fit into 32 bit immediate field.
4161 if (!isInt<32>(Offset))
4164 // If we don't have a symbolic displacement - we don't have any extra
4166 if (!hasSymbolicDisplacement)
4169 // FIXME: Some tweaks might be needed for medium code model.
4170 if (M != CodeModel::Small && M != CodeModel::Kernel)
4173 // For small code model we assume that the last object ends at least 16MB
4174 // before the 31-bit boundary. We may also accept pretty large negative
4175 // constants, knowing that all objects are in the positive half of the address space.
4176 if (M == CodeModel::Small && Offset < 16*1024*1024)
4179 // For kernel code model we know that all objects reside in the negative half
4180 // of the 32-bit address space. We may not accept negative offsets, since they
4181 // may be just off, and we may accept pretty large positive ones.
4182 if (M == CodeModel::Kernel && Offset >= 0)
4188 /// Determines whether the callee is required to pop its own arguments.
4189 /// Callee pop is necessary to support tail calls.
/// Non-vararg calls for which shouldGuaranteeTCO(CallingConv, GuaranteeTCO)
/// holds are handled first; otherwise the decision is made by a switch over
/// the calling convention.
/// NOTE(review): \p is64Bit is presumably consulted for the stdcall /
/// fastcall / thiscall / vectorcall group - confirm against the full switch.
4190 bool X86::isCalleePop(CallingConv::ID CallingConv,
4191 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4192 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4193 // can guarantee TCO.
4194 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4197 switch (CallingConv) {
4200 case CallingConv::X86_StdCall:
4201 case CallingConv::X86_FastCall:
4202 case CallingConv::X86_ThisCall:
4203 case CallingConv::X86_VectorCall:
4208 /// \brief Return true if the condition is an unsigned comparison operation.
/// \p X86CC is expected to be an integer condition code.
/// NOTE(review): the llvm_unreachable below is presumably the fallback for
/// codes that match none of the handled cases - confirm.
4209 static bool isX86CCUnsigned(unsigned X86CC) {
4212 llvm_unreachable("Invalid integer condition!");
4228 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4229 switch (SetCCOpcode) {
4230 default: llvm_unreachable("Invalid integer condition!");
4231 case ISD::SETEQ: return X86::COND_E;
4232 case ISD::SETGT: return X86::COND_G;
4233 case ISD::SETGE: return X86::COND_GE;
4234 case ISD::SETLT: return X86::COND_L;
4235 case ISD::SETLE: return X86::COND_LE;
4236 case ISD::SETNE: return X86::COND_NE;
4237 case ISD::SETULT: return X86::COND_B;
4238 case ISD::SETUGT: return X86::COND_A;
4239 case ISD::SETULE: return X86::COND_BE;
4240 case ISD::SETUGE: return X86::COND_AE;
4244 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4245 /// condition code, returning the condition code and the LHS/RHS of the
4246 /// comparison to make.
/// \p LHS and \p RHS are in/out: the constant special cases replace RHS with
/// zero, and the floating-point path may swap the two operands.
/// X86::COND_INVALID is returned for conditions that have no single X86
/// condition code (visible below for ISD::SETUNE).
4247 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4248 bool isFP, SDValue &LHS, SDValue &RHS,
4249 SelectionDAG &DAG) {
4251 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4252 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4253 // X > -1 -> compare X against 0, jump on !sign.
4254 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4255 return X86::COND_NS;
4257 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4258 // X < 0 -> compare X against 0, jump on sign.
// X < 1 is rewritten as X <= 0 so the comparison is against zero.
4261 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4263 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4264 return X86::COND_LE;
// No special constant form applied: use the plain integer mapping.
4268 return TranslateIntegerX86CC(SetCCOpcode);
4271 // First determine if it is required or is profitable to flip the operands.
4273 // If LHS is a foldable load, but RHS is not, flip the condition.
4274 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4275 !ISD::isNON_EXTLoad(RHS.getNode())) {
4276 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4277 std::swap(LHS, RHS);
4280 switch (SetCCOpcode) {
4286 std::swap(LHS, RHS);
4290 // On a floating point condition, the flags are set as follows:
// NOTE(review): the three flag columns are presumably ZF | PF | CF - confirm.
4292 // 0 | 0 | 0 | X > Y
4293 // 0 | 0 | 1 | X < Y
4294 // 1 | 0 | 0 | X == Y
4295 // 1 | 1 | 1 | unordered
4296 switch (SetCCOpcode) {
4297 default: llvm_unreachable("Condcode should be pre-legalized away");
4299 case ISD::SETEQ: return X86::COND_E;
4300 case ISD::SETOLT: // flipped
4302 case ISD::SETGT: return X86::COND_A;
4303 case ISD::SETOLE: // flipped
4305 case ISD::SETGE: return X86::COND_AE;
4306 case ISD::SETUGT: // flipped
4308 case ISD::SETLT: return X86::COND_B;
4309 case ISD::SETUGE: // flipped
4311 case ISD::SETLE: return X86::COND_BE;
4313 case ISD::SETNE: return X86::COND_NE;
4314 case ISD::SETUO: return X86::COND_P;
4315 case ISD::SETO: return X86::COND_NP;
4317 case ISD::SETUNE: return X86::COND_INVALID;
4321 /// Is there a floating point cmov for the specific X86 condition code?
4322 /// Current x86 isa includes the following FP cmov instructions:
4323 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4324 static bool hasFPCMov(unsigned X86CC) {
/// Fill \p Info with the memory behaviour of an X86 intrinsic, looked up via
/// getIntrinsicWithChain(Intrinsic). The opcode is set to
/// ISD::INTRINSIC_W_CHAIN and readMem/writeMem start out false; each handled
/// intrinsic type then records the pointer operand (argument 0), the memory
/// value type, and whether memory is read or written.
4341 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4343 unsigned Intrinsic) const {
4345 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4349 Info.opc = ISD::INTRINSIC_W_CHAIN;
4350 Info.readMem = false;
4351 Info.writeMem = false;
4355 switch (IntrData->Type) {
// Expanding load: reads memory, and the memory type is the call's result type.
4356 case EXPAND_FROM_MEM: {
4357 Info.ptrVal = I.getArgOperand(0);
4358 Info.memVT = MVT::getVT(I.getType());
4360 Info.readMem = true;
// Compressing store: writes memory, and the memory type is that of argument 1.
4363 case COMPRESS_TO_MEM: {
4364 Info.ptrVal = I.getArgOperand(0);
4365 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4367 Info.writeMem = true;
// Truncating stores: the memory type keeps the element count of argument 1
// but uses the narrower scalar type selected by the intrinsic type.
4370 case TRUNCATE_TO_MEM_VI8:
4371 case TRUNCATE_TO_MEM_VI16:
4372 case TRUNCATE_TO_MEM_VI32: {
4373 Info.ptrVal = I.getArgOperand(0);
4374 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4375 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4376 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4378 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4379 ScalarVT = MVT::i16;
4380 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4381 ScalarVT = MVT::i32;
4383 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4385 Info.writeMem = true;
4395 /// Returns true if the target can instruction select the
4396 /// specified FP immediate natively. If false, the legalizer will
4397 /// materialize the FP immediate as a load from a constant pool.
4398 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4399 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4400 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
/// Decide whether the load \p Load may be narrowed. A load whose base pointer
/// is an X86ISD::WrapperRIP around a global address is rejected exactly when
/// that address carries the X86II::MO_GOTTPOFF target flag.
4406 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4407 ISD::LoadExtType ExtTy,
4409 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4410 // relocation target a movq or addq instruction: don't let the load shrink.
4411 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4412 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4413 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4414 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4418 /// \brief Returns true if it is beneficial to convert a load of a constant
4419 /// to just the constant itself.
/// The type must be an integer type (asserted). Primitive bit sizes of 0 or
/// of more than 64 are singled out by the check below.
4420 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4422 assert(Ty->isIntegerTy());
4424 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4425 if (BitSize == 0 || BitSize > 64)
4430 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4431 unsigned Index) const {
4432 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4435 return (Index == 0 || Index == ResVT.getVectorNumElements());
4438 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4439 // Speculate cttz only if we can directly use TZCNT.
4440 return Subtarget.hasBMI();
4443 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4444 // Speculate ctlz only if we can directly use LZCNT.
4445 return Subtarget.hasLZCNT();
4448 bool X86TargetLowering::isCtlzFast() const {
4449 return Subtarget.hasFastLZCNT();
4452 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4453 if (!Subtarget.hasBMI())
4456 // There are only 32-bit and 64-bit forms for 'andn'.
4457 EVT VT = Y.getValueType();
4458 if (VT != MVT::i32 && VT != MVT::i64)
4464 /// Val is the undef sentinel value or equal to the specified value.
4465 static bool isUndefOrEqual(int Val, int CmpVal) {
4466 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4469 /// Val is either the undef or zero sentinel value.
4470 static bool isUndefOrZero(int Val) {
4471 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4474 /// Return true if every element in Mask, beginning
4475 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4476 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4477 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4478 if (Mask[i] != SM_SentinelUndef)
4483 /// Return true if Val is undef or if its value falls within the
4484 /// specified range (L, H].
4485 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4486 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4489 /// Return true if every element in Mask is undef or if its value
4490 /// falls within the specified range (L, H].
4491 static bool isUndefOrInRange(ArrayRef<int> Mask,
4494 if (!isUndefOrInRange(M, Low, Hi))
4499 /// Return true if Val is undef, zero or if its value falls within the
4500 /// specified range (L, H].
4501 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4502 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4505 /// Return true if every element in Mask is undef, zero or if its value
4506 /// falls within the specified range (L, H].
4507 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4509 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4514 /// Return true if every element in Mask, beginning
4515 /// from position Pos and ending in Pos+Size, falls within the specified
4516 /// sequential range (Low, Low+Size]. or is undef.
4517 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4518 unsigned Pos, unsigned Size, int Low) {
4519 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4520 if (!isUndefOrEqual(Mask[i], Low))
4525 /// Return true if every element in Mask, beginning
4526 /// from position Pos and ending in Pos+Size, falls within the specified
4527 /// sequential range (Low, Low+Size], or is undef or is zero.
4528 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4529 unsigned Size, int Low) {
4530 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4531 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4536 /// Return true if every element in Mask, beginning
4537 /// from position Pos and ending in Pos+Size is undef or is zero.
4538 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4540 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4541 if (!isUndefOrZero(Mask[i]))
4546 /// \brief Helper function to test whether a shuffle mask could be
4547 /// simplified by widening the elements being shuffled.
4549 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4550 /// leaves it in an unspecified state.
4552 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4553 /// shuffle masks. The latter have the special property of a '-2' representing
4554 /// a zero-ed lane of a vector.
/// The mask is consumed in adjacent pairs (i, i + 1); each pair yields one
/// entry of WidenedMask, which is pre-sized to Mask.size() / 2.
/// NOTE(review): Mask[i + 1] is read unconditionally, so the mask size is
/// presumably always even - confirm against callers.
4555 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4556 SmallVectorImpl<int> &WidenedMask) {
4557 WidenedMask.assign(Mask.size() / 2, 0);
4558 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4559 // If both elements are undef, it's trivial.
4560 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
4561 WidenedMask[i / 2] = SM_SentinelUndef;
4565 // Check for an undef mask and a mask value properly aligned to fit with
4566 // a pair of values. If we find such a case, use the non-undef mask's value.
// Undef low half: the high half must be an odd (upper-of-pair) index.
4567 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
4568 Mask[i + 1] % 2 == 1) {
4569 WidenedMask[i / 2] = Mask[i + 1] / 2;
// Undef high half: the low half must be an even (lower-of-pair) index.
4572 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
4573 WidenedMask[i / 2] = Mask[i] / 2;
4577 // When zeroing, we need to spread the zeroing across both lanes to widen.
4578 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
4579 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
4580 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
4581 WidenedMask[i / 2] = SM_SentinelZero;
4587 // Finally check if the two mask values are adjacent and aligned with
4589 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
4590 Mask[i] + 1 == Mask[i + 1]) {
4591 WidenedMask[i / 2] = Mask[i] / 2;
4595 // Otherwise we can't safely widen the elements used in this shuffle.
4598 assert(WidenedMask.size() == Mask.size() / 2 &&
4599 "Incorrect size of mask after widening the elements!");
4604 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4605 /// mask index with the scaled sequential indices for an equivalent narrowed
4606 /// mask. This is the reverse process to canWidenShuffleElements, but can always
/// \p Scale must be positive (asserted). \p ScaledMask is resized to
/// Mask.size() * Scale entries: a sentinel input element is copied into all
/// Scale output slots, and a regular index M expands to Scale * M + s for
/// s in [0, Scale).
4608 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4609 SmallVectorImpl<int> &ScaledMask) {
4610 assert(0 < Scale && "Unexpected scaling factor");
4611 int NumElts = Mask.size();
4612 ScaledMask.assign(NumElts * Scale, -1);
4614 for (int i = 0; i != NumElts; ++i) {
4617 // Repeat sentinel values in every mask element.
4619 for (int s = 0; s != Scale; ++s)
4620 ScaledMask[(Scale * i) + s] = M;
4624 // Scale mask element and increment across each mask element.
4625 for (int s = 0; s != Scale; ++s)
4626 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4630 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4631 /// extract that is suitable for instruction that extract 128 or 256 bit vectors
/// \p vecWidth must be 128 or 256 (asserted). The index (operand 1) must be a
/// constant, and its bit offset - index times the scalar size of the result
/// type - must be a multiple of \p vecWidth.
4632 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4633 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4634 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4637 // The index should be aligned on a vecWidth-bit boundary.
4639 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4641 MVT VT = N->getSimpleValueType(0);
4642 unsigned ElSize = VT.getScalarSizeInBits();
4643 bool Result = (Index * ElSize) % vecWidth == 0;
4648 /// Return true if the specified INSERT_SUBVECTOR
4649 /// operand specifies a subvector insert that is suitable for input to
4650 /// insertion of 128 or 256-bit subvectors
/// \p vecWidth must be 128 or 256 (asserted). The index (operand 2) must be a
/// constant, and its bit offset - index times the scalar size of the result
/// type - must be a multiple of \p vecWidth.
4651 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4652 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4653 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4655 // The index should be aligned on a vecWidth-bit boundary.
4657 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4659 MVT VT = N->getSimpleValueType(0);
4660 unsigned ElSize = VT.getScalarSizeInBits();
4661 bool Result = (Index * ElSize) % vecWidth == 0;
4666 bool X86::isVINSERT128Index(SDNode *N) {
4667 return isVINSERTIndex(N, 128);
4670 bool X86::isVINSERT256Index(SDNode *N) {
4671 return isVINSERTIndex(N, 256);
4674 bool X86::isVEXTRACT128Index(SDNode *N) {
4675 return isVEXTRACTIndex(N, 128);
4678 bool X86::isVEXTRACT256Index(SDNode *N) {
4679 return isVEXTRACTIndex(N, 256);
/// Compute the chunk number addressed by the EXTRACT_SUBVECTOR node \p N:
/// the constant index (operand 1) divided by the number of source-vector
/// elements that fit into \p vecWidth bits. \p vecWidth must be 128 or 256
/// and the index must be a constant (both asserted).
4682 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4683 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4684 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4685 "Illegal extract subvector for VEXTRACT");
4688 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
// The element type is taken from the source vector (operand 0).
4690 MVT VecVT = N->getOperand(0).getSimpleValueType();
4691 MVT ElVT = VecVT.getVectorElementType();
4693 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4694 return Index / NumElemsPerChunk;
/// Compute the chunk number addressed by the INSERT_SUBVECTOR node \p N:
/// the constant index (operand 2) divided by the number of result-vector
/// elements that fit into \p vecWidth bits. \p vecWidth must be 128 or 256
/// and the index must be a constant (both asserted).
4697 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4698 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4699 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4700 "Illegal insert subvector for VINSERT");
4703 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
// The element type is taken from the result type of the node.
4705 MVT VecVT = N->getSimpleValueType(0);
4706 MVT ElVT = VecVT.getVectorElementType();
4708 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4709 return Index / NumElemsPerChunk;
4712 /// Return the appropriate immediate to extract the specified
4713 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
4714 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4715 return getExtractVEXTRACTImmediate(N, 128);
4718 /// Return the appropriate immediate to extract the specified
4719 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
4720 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4721 return getExtractVEXTRACTImmediate(N, 256);
4724 /// Return the appropriate immediate to insert at the specified
4725 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4726 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4727 return getInsertVINSERTImmediate(N, 128);
4730 /// Return the appropriate immediate to insert at the specified
4731 /// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
4732 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4733 return getInsertVINSERTImmediate(N, 256);
4736 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4737 bool X86::isZeroNode(SDValue Elt) {
4738 return isNullConstant(Elt) || isNullFPConstant(Elt);
4741 // Build a vector of constants
4742 // Use an UNDEF node if MaskElt == -1.
4743 // Split 64-bit constants in the 32-bit mode.
// A negative value becomes UNDEF only when IsMask is set; otherwise it is
// emitted as a constant. When i64 is not a legal type and VT has i64
// elements, the vector is built with twice as many i32 elements and bitcast
// back to VT, each value being followed by a second element (constant 0, or
// UNDEF for an undef entry).
4744 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4745 const SDLoc &dl, bool IsMask = false) {
4747 SmallVector<SDValue, 32> Ops;
4750 MVT ConstVecVT = VT;
4751 unsigned NumElts = VT.getVectorNumElements();
4752 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4753 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4754 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4758 MVT EltVT = ConstVecVT.getVectorElementType();
4759 for (unsigned i = 0; i < NumElts; ++i) {
4760 bool IsUndef = Values[i] < 0 && IsMask;
4761 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4762 DAG.getConstant(Values[i], dl, EltVT);
4763 Ops.push_back(OpNode);
4765 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4766 DAG.getConstant(0, dl, EltVT));
4768 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4770 ConstsNode = DAG.getBitcast(VT, ConstsNode);
// Build a constant vector of type VT from per-element bit patterns.
// Bits and Undefs must have the same length (asserted). Undef elements
// become UNDEF nodes; other elements become integer constants, or f32/f64
// constants built from the raw bits when the element type is floating point.
// When i64 is not a legal type and VT has i64 elements, each element is
// emitted as two i32 halves (low half first) and the result is bitcast to VT.
4774 static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
4775 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4776 assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
4777 SmallVector<SDValue, 32> Ops;
4780 MVT ConstVecVT = VT;
4781 unsigned NumElts = VT.getVectorNumElements();
4782 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4783 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4784 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4788 MVT EltVT = ConstVecVT.getVectorElementType();
4789 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
// An undef element takes two slots when split, one otherwise.
4791 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4794 const APInt &V = Bits[i];
4795 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
// Split form: low 32 bits first, then the high 32 bits.
4797 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4798 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4799 } else if (EltVT == MVT::f32) {
4800 APFloat FV(APFloat::IEEEsingle(), V);
4801 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4802 } else if (EltVT == MVT::f64) {
4803 APFloat FV(APFloat::IEEEdouble(), V);
4804 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4806 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4810 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4811 return DAG.getBitcast(VT, ConstsNode);
4814 /// Returns a vector of specified type with all zero elements.
/// \p VT must be a 128-, 256- or 512-bit vector, or a vector of i1
/// (asserted). The zero is built in one canonical type per case and then
/// bitcast to \p VT.
4815 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4816 SelectionDAG &DAG, const SDLoc &dl) {
4817 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4818 VT.getVectorElementType() == MVT::i1) &&
4819 "Unexpected vector type");
4821 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4822 // type. This ensures they get CSE'd. But if the integer type is not
4823 // available, use a floating-point +0.0 instead.
4825 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4826 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4827 } else if (VT.getVectorElementType() == MVT::i1) {
// Mask vectors: more than 16 elements requires BWI and fewer than 8 requires
// VLX (both asserted); the zero is built directly in VT.
4828 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4829 "Unexpected vector type");
4830 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4831 "Unexpected vector type");
4832 Vec = DAG.getConstant(0, dl, VT);
4834 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4835 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4837 return DAG.getBitcast(VT, Vec);
/// Extract the \p vectorWidth-bit chunk of \p Vec that contains element
/// \p IdxVal. The result keeps the element type of \p Vec and has
/// NumElements / (SizeInBits / vectorWidth) elements. \p IdxVal is rounded
/// down to the first element of its chunk, so it need not be chunk-aligned.
4840 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4841 const SDLoc &dl, unsigned vectorWidth) {
4842 EVT VT = Vec.getValueType();
4843 EVT ElVT = VT.getVectorElementType();
4844 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4845 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4846 VT.getVectorNumElements()/Factor);
4848 // Extract from UNDEF is UNDEF.
4850 return DAG.getUNDEF(ResultVT);
4852 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
4853 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4854 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4856 // This is the index of the first element of the vectorWidth-bit chunk
4857 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4858 IdxVal &= ~(ElemsPerChunk - 1);
4860 // If the input is a buildvector just emit a smaller one.
4861 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4862 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4863 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4865 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4866 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4869 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4870 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4871 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4872 /// instructions or a simple subregister reference. Idx is an index in the
4873 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4874 /// lowering EXTRACT_VECTOR_ELT operations easier.
4875 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4876 SelectionDAG &DAG, const SDLoc &dl) {
4877 assert((Vec.getValueType().is256BitVector() ||
4878 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4879 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4882 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4883 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4884 SelectionDAG &DAG, const SDLoc &dl) {
4885 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4886 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
/// Insert the subvector \p Vec into \p Result at the \p vectorWidth-bit chunk
/// that contains element \p IdxVal, by emitting an INSERT_SUBVECTOR node of
/// the type of \p Result. \p vectorWidth must be 128 or 256 (asserted).
/// \p IdxVal is rounded down to the first element of its chunk.
4889 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4890 SelectionDAG &DAG, const SDLoc &dl,
4891 unsigned vectorWidth) {
4892 assert((vectorWidth == 128 || vectorWidth == 256) &&
4893 "Unsupported vector width");
4894 // Inserting UNDEF is Result
4897 EVT VT = Vec.getValueType();
4898 EVT ElVT = VT.getVectorElementType();
4899 EVT ResultVT = Result.getValueType();
4901 // Insert the relevant vectorWidth bits.
4902 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4903 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4905 // This is the index of the first element of the vectorWidth-bit chunk
4906 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4907 IdxVal &= ~(ElemsPerChunk - 1);
4909 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4910 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4913 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4914 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4915 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4916 /// simple superregister reference. Idx is an index in the 128 bits
4917 /// we want. It need not be aligned to a 128-bit boundary. That makes
4918 /// lowering INSERT_VECTOR_ELT operations easier.
/// \p Vec must be a 128-bit vector (asserted). Inserting at index 0 of a
/// non-undef 256-bit \p Result is emitted as an X86ISD::BLENDI instead of an
/// insert; every other case falls through to insertSubVector.
4919 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4920 SelectionDAG &DAG, const SDLoc &dl) {
4921 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4923 // For insertion into the zero index (low half) of a 256-bit vector, it is
4924 // more efficient to generate a blend with immediate instead of an insert*128.
4925 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4926 // extend the subvector to the size of the result vector. Make sure that
4927 // we are not recursing on that node by checking for undef here.
4928 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4929 !Result.isUndef()) {
4930 EVT ResultVT = Result.getValueType();
4931 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4932 SDValue Undef = DAG.getUNDEF(ResultVT);
4933 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4936 // The blend instruction, and therefore its mask, depend on the data type.
4937 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4938 if (ScalarType.isFloatingPoint()) {
4939 // Choose either vblendps (float) or vblendpd (double).
4940 unsigned ScalarSize = ScalarType.getSizeInBits();
4941 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
// One mask bit per element of the low 128 bits: 2 bits for 64-bit elements,
// 4 bits for 32-bit elements.
4942 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4943 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4944 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4947 const X86Subtarget &Subtarget =
4948 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4950 // AVX2 is needed for 256-bit integer blend support.
4951 // Integers must be cast to 32-bit because there is only vpblendd;
4952 // vpblendw can't be used for this because it has a handicapped mask.
4954 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4955 // is still more efficient than using the wrong domain vinsertf128 that
4956 // will be created by InsertSubVector().
4957 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
// Integer path: blend as eight 32-bit lanes (mask 0x0f selects the low four),
// then cast the result back to the original type.
4959 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4960 Result = DAG.getBitcast(CastVT, Result);
4961 Vec256 = DAG.getBitcast(CastVT, Vec256);
4962 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4963 return DAG.getBitcast(ResultVT, Vec256);
4966 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4969 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4970 SelectionDAG &DAG, const SDLoc &dl) {
4971 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4972 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4975 /// Insert i1-subvector to i1-vector.
/// Op supplies three operands: the destination vector (0), the sub-vector to
/// insert (1) and the insertion index (2). The code widens to a mask type
/// chosen from the subtarget (v8i1 with DQI, v16i1 otherwise), positions the
/// sub-vector with VSHLI/VSRLI shifts and merges it into the destination with
/// an OR, or with a vector shuffle for an insertion in the middle.
4976 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4977 const X86Subtarget &Subtarget) {
4980 SDValue Vec = Op.getOperand(0);
4981 SDValue SubVec = Op.getOperand(1);
4982 SDValue Idx = Op.getOperand(2);
// The index is read below via cast<ConstantSDNode>, so only constant indices
// reach the code that follows.
4984 if (!isa<ConstantSDNode>(Idx))
4987 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4988 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4991 MVT OpVT = Op.getSimpleValueType();
4992 MVT SubVecVT = SubVec.getSimpleValueType();
4993 unsigned NumElems = OpVT.getVectorNumElements();
4994 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
// The sub-vector must fit inside the destination and the index must be a
// multiple of the sub-vector's size in bits.
4996 assert(IdxVal + SubVecNumElems <= NumElems &&
4997 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4998 "Unexpected index value in INSERT_SUBVECTOR");
5000 // There are 3 possible cases:
5001 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5002 // 2. Subvector should be inserted in the upper part
5003 // (IdxVal + SubVecNumElems == NumElems)
5004 // 3. Subvector should be inserted in the middle (for example v2i1
5005 // to v16i1, index 2)
5007 // extend to natively supported kshift
5008 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5009 MVT WideOpVT = OpVT;
5010 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
// WideSubVec is SubVec placed at index 0 of an undef WideOpVT vector, so its
// elements above SubVecNumElems are undefined at this point.
5013 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5014 SDValue Undef = DAG.getUNDEF(WideOpVT);
5015 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5016 Undef, SubVec, ZeroIdx);
5018 // Extract sub-vector if required.
// Returns V unchanged when no widening took place (WideOpVT == OpVT).
5019 auto ExtractSubVec = [&](SDValue V) {
5020 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
// Destination is undef: shifting the widened sub-vector left by IdxVal is
// enough, since the remaining elements carry no required value.
5024 if (Vec.isUndef()) {
5026 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5027 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
5029 return ExtractSubVec(WideSubVec);
// Destination is all zeros: shift the sub-vector to the top of the wide
// vector and then back down so it ends at element IdxVal
// (NumElems - SubVecNumElems - ShiftRight == IdxVal). Assuming VSHLI/VSRLI
// shift in zeros, every other element of the result is zero.
5032 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5033 NumElems = WideOpVT.getVectorNumElements();
5034 unsigned ShiftLeft = NumElems - SubVecNumElems;
5035 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5036 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5037 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5038 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
5039 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5040 return ExtractSubVec(Vec);
5044 // Zero lower bits of the Vec
// Widen Vec, then shift right and back left by SubVecNumElems to clear its
// low SubVecNumElems elements (assuming zero-filling shifts).
5045 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5046 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5047 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5048 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5049 // Merge them together, SubVec should be zero extended.
5050 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5051 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5053 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5054 return ExtractSubVec(Vec);
5057 // Simple case when we put subvector in the upper part
5058 if (IdxVal + SubVecNumElems == NumElems) {
5059 // Zero upper bits of the Vec
// The sub-vector is first moved up to element IdxVal; Vec is then shifted
// left and back right by SubVecNumElems before the two are ORed together.
5060 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5061 DAG.getConstant(IdxVal, dl, MVT::i8));
5062 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5063 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5064 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5065 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5066 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5067 return ExtractSubVec(Vec);
5069 // Subvector should be inserted in the middle - use shuffle
// NOTE(review): Undef was created with type WideOpVT, but this node is typed
// OpVT; when WideOpVT != OpVT the operand type would not match the result
// type -- confirm whether that combination can reach this path.
5070 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
// Build a per-element mask; elements in [IdxVal, IdxVal + SubVecNumElems) are
// selected differently from the rest.
5072 SmallVector<int, 64> Mask;
5073 for (unsigned i = 0; i < NumElems; ++i)
5074 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5076 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5079 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5080 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5081 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5082 /// large BUILD_VECTORS.
/// V1 is inserted at element 0 of an undef vector of type VT and V2 at
/// element NumElems / 2, so NumElems is the element count of the result.
5083 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5084 unsigned NumElems, SelectionDAG &DAG,
5086 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5087 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
/// Concat two 256-bit vectors into a vector of type VT: V1 is inserted at
/// element 0 of an undef VT vector and V2 at element NumElems / 2, both via
/// insert256BitVector.
5090 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5091 unsigned NumElems, SelectionDAG &DAG,
5093 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5094 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5097 /// Returns a vector of specified type with all bits set.
5098 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5099 /// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
5100 /// Then bitcast to their original type, ensuring they get CSE'd.
5101 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
5102 SelectionDAG &DAG, const SDLoc &dl) {
5103 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5104 "Expected a 128/256/512-bit vector type");
// The constant is always materialised with 32-bit all-ones elements; NumElts
// is the number of such elements needed to cover VT.
5106 APInt Ones = APInt::getAllOnesValue(32);
5107 unsigned NumElts = VT.getSizeInBits() / 32;
// 256-bit vector without hasInt256(): build one v4i32 of ones and concatenate
// it with itself into a v8i32.
5109 if (!Subtarget.hasInt256() && NumElts == 8) {
5110 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
5111 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
// Otherwise build the constant directly as an i32 vector with NumElts elements.
5113 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5115 return DAG.getBitcast(VT, Vec);
5118 /// Generate unpacklo/unpackhi shuffle mask.
/// Appends one index per element of VT to Mask, which must be empty on entry.
/// Indices are computed per 128-bit lane: consecutive pairs of result
/// elements read the same in-lane position. When Unary is false the odd
/// element of each pair is offset by NumElts, i.e. it refers to the second
/// shuffle input. When Lo is false the position is offset by half a lane.
5119 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5121 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5122 int NumElts = VT.getVectorNumElements();
5123 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5125 for (int i = 0; i < NumElts; ++i) {
// First element index of the 128-bit lane that contains element i.
5126 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
// In-lane source position: result elements 2k and 2k+1 both read position k.
5127 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
// Binary unpack: odd result elements come from the second input.
5128 Pos += (Unary ? 0 : NumElts * (i % 2));
// High unpack: read from the upper half of each lane.
5129 Pos += (Lo ? 0 : NumEltsInLane / 2);
5130 Mask.push_back(Pos);
5134 /// Returns a vector_shuffle node for an unpackl operation.
/// The mask is produced by createUnpackShuffleMask with Lo = true and
/// Unary = false, and the shuffle takes V1 and V2 as its two inputs.
5135 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5136 SDValue V1, SDValue V2) {
5137 SmallVector<int, 8> Mask;
5138 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5139 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5142 /// Returns a vector_shuffle node for an unpackh operation.
/// The mask is produced by createUnpackShuffleMask with Lo = false and
/// Unary = false, and the shuffle takes V1 and V2 as its two inputs.
5143 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5144 SDValue V1, SDValue V2) {
5145 SmallVector<int, 8> Mask;
5146 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5147 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5150 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5151 /// This produces a shuffle where the low element of V2 is swizzled into the
5152 /// zero/undef vector, landing at element Idx.
5153 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5154 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5156 const X86Subtarget &Subtarget,
5157 SelectionDAG &DAG) {
// The result type is taken from V2; the first shuffle input V1 is either a
// zero vector or an undef vector of that same type.
5158 MVT VT = V2.getSimpleValueType();
5160 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5161 int NumElems = VT.getVectorNumElements();
// Identity mask over V1, except that position Idx selects index NumElems,
// which is element 0 of the second shuffle input (V2).
5162 SmallVector<int, 16> MaskVec(NumElems);
5163 for (int i = 0; i != NumElems; ++i)
5164 // If this is the insertion idx, put the low elt of V2 here.
5165 MaskVec[i] = (i == Idx) ? NumElems : i;
5166 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
/// Step V through any chain of ISD::BITCAST nodes by repeatedly replacing it
/// with operand 0. The loop stops at a null node or the first non-bitcast.
5169 static SDValue peekThroughBitcasts(SDValue V) {
5170 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5171 V = V.getOperand(0);
/// Like peekThroughBitcasts, but only steps through a BITCAST whose operand 0
/// has exactly one use; the walk stops at the first bitcast whose source is
/// used elsewhere.
5175 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5176 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5177 V.getOperand(0).hasOneUse())
5178 V = V.getOperand(0);
/// Try to find the IR Constant behind Op when Op is (a bitcast of) a load
/// from the constant pool. The load's base pointer is unwrapped from
/// X86ISD::Wrapper / X86ISD::WrapperRIP and must be a ConstantPoolSDNode that
/// is not a machine constant pool entry.
5182 static const Constant *getTargetConstantFromNode(SDValue Op) {
5183 Op = peekThroughBitcasts(Op);
// Only load nodes can refer to a constant pool entry.
5185 auto *Load = dyn_cast<LoadSDNode>(Op);
// Look through the address wrapper node, if present, to reach the pool node.
5189 SDValue Ptr = Load->getBasePtr();
5190 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5191 Ptr->getOpcode() == X86ISD::WrapperRIP)
5192 Ptr = Ptr->getOperand(0);
5194 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5195 if (!CNode || CNode->isMachineConstantPoolEntry())
// dyn_cast yields null if the pool value is not a Constant.
5198 return dyn_cast<Constant>(CNode->getConstVal());
5201 // Extract raw constant bits from constant pools.
// Op is looked through bitcasts and its total width is split into elements
// of EltSizeInBits bits each. On the paths that end in SplitBitData(),
// UndefElts gets one flag per element and EltBits one APInt per element.
// Both output containers must be empty on entry. Two sources are handled:
// a constant pool vector and an X86ISD::VBROADCAST of a constant pool scalar.
5202 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5203 SmallBitVector &UndefElts,
5204 SmallVectorImpl<APInt> &EltBits) {
5205 assert(UndefElts.empty() && "Expected an empty UndefElts vector");
5206 assert(EltBits.empty() && "Expected an empty EltBits vector");
5208 Op = peekThroughBitcasts(Op);
5210 EVT VT = Op.getValueType();
5211 unsigned SizeInBits = VT.getSizeInBits();
5212 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5213 unsigned NumElts = SizeInBits / EltSizeInBits;
5215 // Extract all the undef/constant element data and pack into single bitsets.
// Both bitsets span the whole value (SizeInBits wide).
5216 APInt UndefBits(SizeInBits, 0);
5217 APInt MaskBits(SizeInBits, 0);
5219 // Split the undef/constant single bitset data into the target elements.
5220 auto SplitBitData = [&]() {
5221 UndefElts = SmallBitVector(NumElts, false);
5222 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5224 for (unsigned i = 0; i != NumElts; ++i) {
// Slice element i out of the undef bitset.
5225 APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
5226 UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
5228 // Only treat an element as UNDEF if all bits are UNDEF, otherwise
5229 // treat it as zero.
5230 if (UndefEltBits.isAllOnesValue()) {
5231 UndefElts[i] = true;
// Slice element i out of the constant bitset.
5235 APInt Bits = MaskBits.lshr(i * EltSizeInBits);
5236 Bits = Bits.zextOrTrunc(EltSizeInBits);
// NOTE(review): this goes through a uint64_t; for EltSizeInBits > 64 a value
// with more than 64 active bits cannot be represented that way. Assigning
// Bits directly would avoid the round trip -- confirm against the APInt API.
5237 EltBits[i] = Bits.getZExtValue();
// Converts a scalar IR constant into a value bitset (Mask) and an undef
// bitset (Undefs), each SizeInBits wide with the data in the low bits.
// The visible cases are UndefValue, ConstantInt and ConstantFP.
5242 auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
5246 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
// Undef constant: no value bits, the low CstSizeInBits bits marked undef.
5247 if (isa<UndefValue>(Cst)) {
5248 Mask = APInt::getNullValue(SizeInBits);
5249 Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
// Integer constant: its value widened or truncated to SizeInBits.
5252 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5253 Mask = CInt->getValue().zextOrTrunc(SizeInBits);
5254 Undefs = APInt::getNullValue(SizeInBits);
// Floating point constant: its raw bit pattern.
5257 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5258 Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
5259 Undefs = APInt::getNullValue(SizeInBits);
5265 // Extract constant bits from constant pool vector.
5266 if (auto *Cst = getTargetConstantFromNode(Op)) {
// The pool constant must be a vector whose total size equals that of Op.
5267 Type *CstTy = Cst->getType();
5268 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
// Place each element's bits at its offset within the full-width bitsets.
5271 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5272 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
5274 if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
5276 MaskBits |= Bits.shl(i * CstEltSizeInBits);
5277 UndefBits |= Undefs.shl(i * CstEltSizeInBits);
5280 return SplitBitData();
5283 // Extract constant bits from a broadcasted constant pool scalar.
// Only handled when the requested element size does not exceed the size of
// the broadcast scalar.
5284 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5285 EltSizeInBits <= Op.getScalarValueSizeInBits()) {
5286 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5288 if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
// Replicate the scalar's bits across every broadcast element position.
5289 unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
5290 unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
5291 for (unsigned i = 0; i != NumBroadcastElts; ++i) {
5292 MaskBits |= Bits.shl(i * NumBroadcastBits);
5293 UndefBits |= Undefs.shl(i * NumBroadcastBits);
5295 return SplitBitData();
5303 // TODO: Merge more of this with getTargetConstantBitsFromNode.
// Decode a shuffle mask operand into raw integer indices of
// MaskEltSizeInBits bits each, appended to RawMask. MaskNode is looked
// through bitcasts. The visible forms handled are X86ISD::VBROADCAST of a
// constant, X86ISD::VZEXT_MOVL of a SCALAR_TO_VECTOR constant, and
// ISD::BUILD_VECTOR.
5304 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5305 unsigned MaskEltSizeInBits,
5306 SmallVectorImpl<uint64_t> &RawMask) {
5307 MaskNode = peekThroughBitcasts(MaskNode);
5309 MVT VT = MaskNode.getSimpleValueType();
5310 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
// Number of mask entries that cover the whole vector at the requested size.
5311 unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
5313 // Split an APInt element into MaskEltSizeInBits sized pieces and
5314 // insert into the shuffle mask.
5315 auto SplitElementToMask = [&](APInt Element) {
5316 // Note that this is x86 and so always little endian: the low byte is
5317 // the first byte of the mask.
5318 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5319 for (int i = 0; i < Split; ++i) {
// Emit the low piece, then shift it out so the next piece becomes the low one.
5320 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
5321 Element = Element.lshr(MaskEltSizeInBits);
5322 RawMask.push_back(RawElt.getZExtValue());
5326 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
5327 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5328 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
5329 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
// Constant broadcast: the same raw value is pushed once per vector element.
5331 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
5332 const APInt &MaskElement = CN->getAPIntValue();
5333 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
5334 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
5335 RawMask.push_back(RawElt.getZExtValue());
5341 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
5342 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
5343 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
5344 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
// Mask entries at least as wide as the vector's scalars: the constant becomes
// the first entry and every remaining entry is zero.
5345 if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
5346 RawMask.push_back(CN->getZExtValue());
5347 RawMask.append(NumMaskElts - 1, 0);
// Mask entries narrower than the vector's scalars: split the constant into
// pieces, then pad zero entries for all remaining vector elements.
5351 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
5352 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5353 SplitElementToMask(CN->getAPIntValue());
5354 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
// Everything below this point only applies to BUILD_VECTOR mask nodes.
5361 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
5364 // We can always decode if the buildvector is all zero constants,
5365 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
5366 if (all_of(MaskNode->ops(), X86::isZeroNode)) {
5367 RawMask.append(NumMaskElts, 0);
5371 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5372 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
// Integer and floating point constant operands are both split by bit pattern.
5375 for (SDValue Op : MaskNode->ops()) {
5376 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
5377 SplitElementToMask(CN->getAPIntValue());
5378 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
5379 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
5387 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5388 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5389 /// operands in \p Ops, and returns true.
5390 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5391 /// IsUnary for shuffles which use a single input multiple times, and in those
5392 /// cases it will adjust the mask to only have indices within that single input.
5393 /// It is an error to call this with non-empty Mask/Ops vectors.
/// When \p AllowSentinelZero is false the decoded mask is additionally
/// scanned for SM_SentinelZero entries (presumably such masks are rejected --
/// confirm against the full source).
5394 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5395 SmallVectorImpl<SDValue> &Ops,
5396 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5397 unsigned NumElems = VT.getVectorNumElements();
5400 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5401 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
// IsFakeUnary records a two-operand shuffle whose two inputs are the same node.
5404 bool IsFakeUnary = false;
// Opcodes that carry an immediate control read it from the last operand.
5405 switch(N->getOpcode()) {
5406 case X86ISD::BLENDI:
5407 ImmN = N->getOperand(N->getNumOperands()-1);
5408 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5411 ImmN = N->getOperand(N->getNumOperands()-1);
5412 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5413 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5415 case X86ISD::INSERTPS:
5416 ImmN = N->getOperand(N->getNumOperands()-1);
5417 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5418 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5420 case X86ISD::UNPCKH:
5421 DecodeUNPCKHMask(VT, Mask);
5422 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5424 case X86ISD::UNPCKL:
5425 DecodeUNPCKLMask(VT, Mask);
5426 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5428 case X86ISD::MOVHLPS:
5429 DecodeMOVHLPSMask(NumElems, Mask);
5430 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5432 case X86ISD::MOVLHPS:
5433 DecodeMOVLHPSMask(NumElems, Mask);
5434 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5436 case X86ISD::PALIGNR:
5437 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5438 ImmN = N->getOperand(N->getNumOperands()-1);
5439 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5440 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
// PALIGNR records its inputs in swapped order: operand 1 first, then operand 0.
5441 Ops.push_back(N->getOperand(1));
5442 Ops.push_back(N->getOperand(0));
5444 case X86ISD::VSHLDQ:
5445 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5446 ImmN = N->getOperand(N->getNumOperands() - 1);
5447 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5450 case X86ISD::VSRLDQ:
5451 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5452 ImmN = N->getOperand(N->getNumOperands() - 1);
5453 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5456 case X86ISD::PSHUFD:
5457 case X86ISD::VPERMILPI:
5458 ImmN = N->getOperand(N->getNumOperands()-1);
5459 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5462 case X86ISD::PSHUFHW:
5463 ImmN = N->getOperand(N->getNumOperands()-1);
5464 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5467 case X86ISD::PSHUFLW:
5468 ImmN = N->getOperand(N->getNumOperands()-1);
5469 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5472 case X86ISD::VZEXT_MOVL:
5473 DecodeZeroMoveLowMask(VT, Mask);
5476 case X86ISD::VBROADCAST: {
5477 // We only decode broadcasts of same-sized vectors at the moment.
5478 if (N->getOperand(0).getValueType() == VT) {
5479 DecodeVectorBroadcast(VT, Mask);
// Variable-mask shuffles: first try to read the mask operand as raw indices,
// then fall back to decoding a constant pool value.
5485 case X86ISD::VPERMILPV: {
5487 SDValue MaskNode = N->getOperand(1);
5488 unsigned MaskEltSize = VT.getScalarSizeInBits();
5489 SmallVector<uint64_t, 32> RawMask;
5490 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5491 DecodeVPERMILPMask(VT, RawMask, Mask);
5494 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5495 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5500 case X86ISD::PSHUFB: {
// PSHUFB masks are always decoded at byte (8-bit) granularity.
5502 SDValue MaskNode = N->getOperand(1);
5503 SmallVector<uint64_t, 32> RawMask;
5504 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5505 DecodePSHUFBMask(RawMask, Mask);
5508 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5509 DecodePSHUFBMask(C, Mask);
5514 case X86ISD::VPERMI:
5515 ImmN = N->getOperand(N->getNumOperands()-1);
5516 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5521 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5523 case X86ISD::VPERM2X128:
5524 ImmN = N->getOperand(N->getNumOperands()-1);
5525 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5526 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5528 case X86ISD::MOVSLDUP:
5529 DecodeMOVSLDUPMask(VT, Mask);
5532 case X86ISD::MOVSHDUP:
5533 DecodeMOVSHDUPMask(VT, Mask);
5536 case X86ISD::MOVDDUP:
5537 DecodeMOVDDUPMask(VT, Mask);
5540 case X86ISD::MOVLHPD:
5541 case X86ISD::MOVLPD:
5542 case X86ISD::MOVLPS:
5543 // Not yet implemented
5545 case X86ISD::VPERMIL2: {
5546 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5547 unsigned MaskEltSize = VT.getScalarSizeInBits();
// VPERMIL2 takes its variable mask in operand 2 and a control value in
// operand 3; the mask is only decoded when the control is a constant.
5548 SDValue MaskNode = N->getOperand(2);
5549 SDValue CtrlNode = N->getOperand(3);
5550 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5551 unsigned CtrlImm = CtrlOp->getZExtValue();
5552 SmallVector<uint64_t, 32> RawMask;
5553 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5554 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5557 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5558 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5564 case X86ISD::VPPERM: {
5565 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5566 SDValue MaskNode = N->getOperand(2);
5567 SmallVector<uint64_t, 32> RawMask;
5568 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5569 DecodeVPPERMMask(RawMask, Mask);
5572 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5573 DecodeVPPERMMask(C, Mask);
5578 case X86ISD::VPERMV: {
5580 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5581 Ops.push_back(N->getOperand(1));
5582 SDValue MaskNode = N->getOperand(0);
5583 SmallVector<uint64_t, 32> RawMask;
5584 unsigned MaskEltSize = VT.getScalarSizeInBits();
5585 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5586 DecodeVPERMVMask(RawMask, Mask);
5589 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5590 DecodeVPERMVMask(C, MaskEltSize, Mask);
5595 case X86ISD::VPERMV3: {
5596 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5597 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5598 Ops.push_back(N->getOperand(0));
5599 Ops.push_back(N->getOperand(2));
5600 SDValue MaskNode = N->getOperand(1);
5601 unsigned MaskEltSize = VT.getScalarSizeInBits();
5602 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5603 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5608 case X86ISD::VPERMIV3: {
5609 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5610 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5611 Ops.push_back(N->getOperand(1));
5612 Ops.push_back(N->getOperand(2));
5613 SDValue MaskNode = N->getOperand(0);
5614 unsigned MaskEltSize = VT.getScalarSizeInBits();
5615 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5616 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5621 default: llvm_unreachable("unknown target shuffle node");
5624 // Empty mask indicates the decode failed.
5628 // Check if we're getting a shuffle mask with zero'd elements.
5629 if (!AllowSentinelZero)
5630 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5633 // If we have a fake unary shuffle, the shuffle mask is spread across two
5634 // inputs that are actually the same node. Re-map the mask to always point
5635 // into the first input.
// Indices at or beyond Mask.size() refer to the second input.
5638 if (M >= (int)Mask.size())
5641 // If we didn't already add operands in the opcode-specific code, default to
5642 // adding 1 or 2 operands starting at 0.
5644 Ops.push_back(N->getOperand(0));
// A fake unary shuffle still records both (identical) operands.
5645 if (!IsUnary || IsFakeUnary)
5646 Ops.push_back(N->getOperand(1));
5652 /// Check a target shuffle mask's inputs to see if we can set any values to
5653 /// SM_SentinelZero - this is for elements that are known to be zero
5654 /// (not just zeroable) from their inputs.
5655 /// Returns true if the target shuffle mask was decoded.
/// \p Mask and \p Ops are filled by getTargetShuffleMask (called with
/// AllowSentinelZero = true); mask entries are then rewritten in place to
/// SM_SentinelUndef / SM_SentinelZero where the referenced input element is
/// found to be undef or zero.
5656 static bool setTargetShuffleZeroElements(SDValue N,
5657 SmallVectorImpl<int> &Mask,
5658 SmallVectorImpl<SDValue> &Ops) {
5660 if (!isTargetShuffle(N.getOpcode()))
5663 MVT VT = N.getSimpleValueType();
5664 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
// For a unary shuffle both logical inputs are the single recorded operand.
5667 SDValue V1 = Ops[0];
5668 SDValue V2 = IsUnary ? V1 : Ops[1];
// Inspect the inputs underneath any bitcasts.
5670 V1 = peekThroughBitcasts(V1);
5671 V2 = peekThroughBitcasts(V2);
5673 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5676 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5680 // Determine shuffle input and normalize the mask.
// Indices below Size select V1, the rest select V2.
5681 SDValue V = M < Size ? V1 : V2;
5684 // We are referencing an UNDEF input.
5686 Mask[i] = SM_SentinelUndef;
5690 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5691 if (V.getOpcode() != ISD::BUILD_VECTOR)
5694 // If the BUILD_VECTOR has fewer elements then the (larger) source
5695 // element must be UNDEF/ZERO.
5696 // TODO: Is it worth testing the individual bits of a constant?
5697 if ((Size % V.getNumOperands()) == 0) {
// Scale mask elements map onto one BUILD_VECTOR operand.
5698 int Scale = Size / V->getNumOperands();
5699 SDValue Op = V.getOperand(M / Scale);
5701 Mask[i] = SM_SentinelUndef;
5702 else if (X86::isZeroNode(Op))
5703 Mask[i] = SM_SentinelZero;
5707 // If the BUILD_VECTOR has more elements then all the (smaller) source
5708 // elements must be all UNDEF or all ZERO.
5709 if ((V.getNumOperands() % Size) == 0) {
// One mask element covers Scale consecutive BUILD_VECTOR operands.
5710 int Scale = V->getNumOperands() / Size;
5711 bool AllUndef = true;
5712 bool AllZero = true;
5713 for (int j = 0; j < Scale; ++j) {
5714 SDValue Op = V.getOperand((M * Scale) + j);
5715 AllUndef &= Op.isUndef();
5716 AllZero &= X86::isZeroNode(Op);
5719 Mask[i] = SM_SentinelUndef;
5721 Mask[i] = SM_SentinelZero;
5726 assert(VT.getVectorNumElements() == Mask.size() &&
5727 "Different mask size from vector size!");
5731 // Attempt to decode ops that could be represented as a shuffle mask.
5732 // The decoded shuffle mask may contain a different number of elements to the
5733 // destination value type.
// The visible cases cover: a node whose operand 1 is a per-byte constant of
// 0x00/0xFF values, X86ISD::VSHLI / X86ISD::VSRLI shifts by a constant, and
// X86ISD::VZEXT from a source of the same total size.
5734 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5735 SmallVectorImpl<SDValue> &Ops) {
5739 MVT VT = N.getSimpleValueType();
5740 unsigned NumElts = VT.getVectorNumElements();
5741 unsigned NumSizeInBits = VT.getSizeInBits();
5742 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5743 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5744 "Expected byte aligned value types");
5746 unsigned Opcode = N.getOpcode();
5749 // Attempt to decode as a per-byte mask.
5750 SmallBitVector UndefElts;
5751 SmallVector<APInt, 32> EltBits;
5752 if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
5754 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5756 Mask.push_back(SM_SentinelUndef);
// Each constant byte must be all-zeros or all-ones: 0x00 produces a zero byte,
// 0xFF keeps byte i of operand 0.
5759 uint64_t ByteBits = EltBits[i].getZExtValue();
5760 if (ByteBits != 0 && ByteBits != 255)
5762 Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
5764 Ops.push_back(N.getOperand(0));
5768 case X86ISD::VSRLI: {
5769 uint64_t ShiftVal = N.getConstantOperandVal(1);
5770 // Out of range bit shifts are guaranteed to be zero.
5771 if (NumBitsPerElt <= ShiftVal) {
5772 Mask.append(NumElts, SM_SentinelZero);
5776 // We can only decode 'whole byte' bit shifts as shuffles.
5777 if ((ShiftVal % 8) != 0)
// The mask is built at byte granularity, so it has NumBytes entries rather
// than NumElts.
5780 uint64_t ByteShift = ShiftVal / 8;
5781 unsigned NumBytes = NumSizeInBits / 8;
5782 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5783 Ops.push_back(N.getOperand(0));
5785 // Clear mask to all zeros and insert the shifted byte indices.
5786 Mask.append(NumBytes, SM_SentinelZero);
// Left shift: within each element, byte j reads from byte j - ByteShift.
5788 if (X86ISD::VSHLI == Opcode) {
5789 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5790 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5791 Mask[i + j] = i + j - ByteShift;
// Right shift: within each element, byte j - ByteShift reads from byte j.
5793 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5794 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5795 Mask[i + j - ByteShift] = i + j;
5799 case X86ISD::VZEXT: {
5800 // TODO - add support for VPMOVZX with smaller input vector types.
5801 SDValue Src = N.getOperand(0);
5802 MVT SrcVT = Src.getSimpleValueType();
// Only sources with the same total bit width as the result are decoded.
5803 if (NumSizeInBits != SrcVT.getSizeInBits())
5805 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5814 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5815 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5816 /// remaining input indices in case we now have a unary shuffle and adjust the
5817 /// Op0/Op1 inputs accordingly.
5818 /// Returns true if the target shuffle mask was decoded.
5819 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5820 SmallVectorImpl<int> &Mask) {
5821 SmallVector<SDValue, 2> Ops;
// getFauxShuffleMask is only tried when the real target shuffle decode fails.
5822 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5823 if (!getFauxShuffleMask(Op, Mask, Ops))
// An input counts as used when some mask index falls in its range:
// [0, NumElts) for the first input, NumElts and above for the second.
// Negative (sentinel) entries use neither.
5826 int NumElts = Mask.size();
5827 bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
5828 return 0 <= Idx && Idx < NumElts;
5830 bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
// Unused inputs are reported as empty SDValues.
5832 Op0 = Op0InUse ? Ops[0] : SDValue();
5833 Op1 = Op1InUse ? Ops[1] : SDValue();
5835 // We're only using Op1 - commute the mask and inputs.
5836 if (!Op0InUse && Op1InUse) {
5847 /// Returns the scalar element that will make up the ith
5848 /// element of the result of the vector shuffle.
/// Recurses through ISD::VECTOR_SHUFFLE nodes, target shuffles and
/// same-element-count bitcasts until it reaches a SCALAR_TO_VECTOR or
/// BUILD_VECTOR that supplies the element, passing Depth + 1 on each
/// recursive call.
5849 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5852 return SDValue(); // Limit search depth.
5854 SDValue V = SDValue(N, 0);
5855 EVT VT = V.getValueType();
5856 unsigned Opcode = V.getOpcode();
5858 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5859 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5860 int Elt = SV->getMaskElt(Index);
5863 return DAG.getUNDEF(VT.getVectorElementType());
// Mask indices below NumElems select operand 0, the rest operand 1; the
// index is reduced modulo NumElems for the recursive lookup.
5865 unsigned NumElems = VT.getVectorNumElements();
5866 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5867 : SV->getOperand(1);
5868 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5871 // Recurse into target specific vector shuffles to find scalars.
5872 if (isTargetShuffle(Opcode)) {
5873 MVT ShufVT = V.getSimpleValueType();
5874 MVT ShufSVT = ShufVT.getVectorElementType();
5875 int NumElems = (int)ShufVT.getVectorNumElements();
5876 SmallVector<int, 16> ShuffleMask;
5877 SmallVector<SDValue, 16> ShuffleOps;
5880 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
// Sentinel entries resolve directly to a zero constant (integer or floating
// point, matching the element type) or to undef.
5883 int Elt = ShuffleMask[Index];
5884 if (Elt == SM_SentinelZero)
5885 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5886 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5887 if (Elt == SM_SentinelUndef)
5888 return DAG.getUNDEF(ShufSVT);
5890 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5891 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5892 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5896 // Actual nodes that may contain scalar elements
5897 if (Opcode == ISD::BITCAST) {
5898 V = V.getOperand(0);
5899 EVT SrcVT = V.getValueType();
5900 unsigned NumElems = VT.getVectorNumElements();
// A bitcast is only looked through when the source is a vector with the same
// number of elements, so that Index still names the same element.
5902 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
// SCALAR_TO_VECTOR defines only element 0; every other element is undef.
5906 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5907 return (Index == 0) ? V.getOperand(0)
5908 : DAG.getUNDEF(VT.getVectorElementType());
5910 if (V.getOpcode() == ISD::BUILD_VECTOR)
5911 return V.getOperand(Index);
5916 /// Custom lower build_vector of v16i8.
/// NonZeros is a bitmask: bit i set means operand i of Op is treated as
/// non-zero. With SSE4.1 each such byte is inserted individually; otherwise
/// bytes are merged in pairs into i16 values and inserted into a v8i16 that
/// is bitcast back to v16i8 at the end.
5917 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5918 unsigned NumNonZero, unsigned NumZero,
5920 const X86Subtarget &Subtarget,
5921 const TargetLowering &TLI) {
5929 // SSE4.1 - use PINSRB to insert each byte directly.
5930 if (Subtarget.hasSSE41()) {
5931 for (unsigned i = 0; i < 16; ++i) {
5932 bool isNonZero = (NonZeros & (1 << i)) != 0;
// The starting vector is either all zeros or undef.
5936 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5938 V = DAG.getUNDEF(MVT::v16i8);
5941 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5942 MVT::v16i8, V, Op.getOperand(i),
5943 DAG.getIntPtrConstant(i, dl));
5950 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5951 for (unsigned i = 0; i < 16; ++i) {
5952 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5953 if (ThisIsNonZero && First) {
5955 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5957 V = DAG.getUNDEF(MVT::v8i16);
// Combine byte i-1 (low half) and byte i (high half) into one i16 value.
5962 SDValue ThisElt, LastElt;
5963 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5964 if (LastIsNonZero) {
5965 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5966 MVT::i16, Op.getOperand(i-1));
5968 if (ThisIsNonZero) {
// Byte i is zero-extended and shifted into bits 8..15.
5969 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5970 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5971 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5973 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
// The merged pair lands in i16 element i / 2.
5977 if (ThisElt.getNode())
5978 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5979 DAG.getIntPtrConstant(i/2, dl));
5983 return DAG.getBitcast(MVT::v16i8, V);
5986 /// Custom lower build_vector of v8i16.
/// NonZeros is a bitmask: bit i set means operand i of Op is treated as
/// non-zero. Such operands are inserted one at a time with
/// INSERT_VECTOR_ELT into a v8i16 that starts as a zero or undef vector.
5987 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5988 unsigned NumNonZero, unsigned NumZero,
5990 const X86Subtarget &Subtarget,
5991 const TargetLowering &TLI) {
5998 for (unsigned i = 0; i < 8; ++i) {
5999 bool isNonZero = (NonZeros & (1 << i)) != 0;
// The starting vector is either all zeros or undef.
6003 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6005 V = DAG.getUNDEF(MVT::v8i16);
6008 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
6009 MVT::v8i16, V, Op.getOperand(i),
6010 DAG.getIntPtrConstant(i, dl));
6017 /// Custom lower build_vector of v4i32 or v4f32.
///
/// Two lowerings are attempted, in this order:
///   1. a vector shuffle of a single source vector V1 against a zero vector
///      (a "blend with zero"), and
///   2. an X86ISD::INSERTPS node, which is only tried when SSE4.1 is available.
///
/// NOTE(review): the embedded numbering skips in this listing, so several
/// short lines (early exits, `continue`/`break`, closing braces, and the
/// declarations of Mask, Elt and DL) are not visible. Comments about those
/// exits are hedged accordingly.
6018 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6019 const X86Subtarget &Subtarget,
6020 const TargetLowering &TLI) {
6021 // Find all zeroable elements.
// An element counts as zeroable when it is undef or a zero node.
6022 std::bitset<4> Zeroable;
6023 for (int i=0; i < 4; ++i) {
6024 SDValue Elt = Op->getOperand(i);
6025 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6027 assert(Zeroable.size() - Zeroable.count() > 1 &&
6028 "We expect at least two non-zero elements!");
6030 // We only know how to deal with build_vector nodes where elements are either
6031 // zeroable or extract_vector_elt with constant index.
6032 SDValue FirstNonZero;
6033 unsigned FirstNonZeroIdx;
6034 for (unsigned i=0; i < 4; ++i) {
// NOTE(review): zeroable elements are presumably skipped before this point,
// and each failed check below presumably bails out of the function -- the
// corresponding lines are not visible here; confirm against the full file.
6037 SDValue Elt = Op->getOperand(i);
6038 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6039 !isa<ConstantSDNode>(Elt.getOperand(1)))
6041 // Make sure that this node is extracting from a 128-bit vector.
6042 MVT VT = Elt.getOperand(0).getSimpleValueType();
6043 if (!VT.is128BitVector())
// Remember the position of the first element that passed the checks above.
6045 if (!FirstNonZero.getNode()) {
6047 FirstNonZeroIdx = i;
6051 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
// V1 is the vector the first non-zero element was extracted from; VT is the
// type of that source vector, not necessarily the type of Op.
6052 SDValue V1 = FirstNonZero.getOperand(0);
6053 MVT VT = V1.getSimpleValueType();
6055 // See if this build_vector can be lowered as a blend with zero.
6057 unsigned EltMaskIdx, EltIdx;
6059 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6060 if (Zeroable[EltIdx]) {
6061 // The zero vector will be on the right hand side.
// Mask indices 4..7 select from the second shuffle operand.
6062 Mask[EltIdx] = EltIdx+4;
6066 Elt = Op->getOperand(EltIdx);
6067 // By construction, Elt is an EXTRACT_VECTOR_ELT with constant index.
6068 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
// A blend needs every non-zero lane to come from the same lane of V1.
// NOTE(review): the statement taken when this test fails is not visible;
// presumably the loop is left early so that EltIdx marks the offending lane.
6069 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6071 Mask[EltIdx] = EltIdx;
6075 // Let the shuffle legalizer deal with blend operations.
// NOTE(review): the guard around this blend path is not visible; presumably
// it is only taken when the loop above visited all four lanes.
6076 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6077 if (V1.getSimpleValueType() != VT)
6078 V1 = DAG.getBitcast(VT, V1);
6079 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6082 // See if we can lower this build_vector to an INSERTPS.
6083 if (!Subtarget.hasSSE41())
// V2 is the source vector of the element at which the blend matching stopped;
// it becomes the second INSERTPS operand.
6086 SDValue V2 = Elt.getOperand(0);
6087 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
// Every remaining lane must be lane i of V1 for the fold to be possible.
// NOTE(review): handling of zeroable lanes inside this loop is not visible.
6090 bool CanFold = true;
6091 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6095 SDValue Current = Op->getOperand(i);
6096 SDValue SrcVector = Current->getOperand(0);
6099 CanFold = SrcVector == V1 &&
6100 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6106 assert(V1.getNode() && "Expected at least two non-zero elements!");
// The INSERTPS node is built on v4f32, so bitcast both inputs if needed.
6107 if (V1.getSimpleValueType() != MVT::v4f32)
6108 V1 = DAG.getBitcast(MVT::v4f32, V1);
6109 if (V2.getSimpleValueType() != MVT::v4f32)
6110 V2 = DAG.getBitcast(MVT::v4f32, V2);
6112 // Ok, we can emit an INSERTPS instruction.
6113 unsigned ZMask = Zeroable.to_ulong();
// Immediate layout as assembled here: EltMaskIdx (lane read from V2) shifted
// to bit 6, EltIdx (lane written in the result) shifted to bit 4, and the
// zeroable set in the low bits. The assert checks the value fits in 8 bits.
6115 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6116 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6118 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6119 DAG.getIntPtrConstant(InsertPSMask, DL));
// Convert the v4f32 result back to the source vector type.
6120 return DAG.getBitcast(VT, Result);
6123 /// Return a vector logical shift node.
///
/// The shift is performed on the whole 128-bit value viewed as v16i8, using
/// X86ISD::VSHLDQ when \p isLeft is true and X86ISD::VSRLDQ otherwise, and the
/// result is bitcast back to \p VT.
///
/// \param isLeft   selects the left-shift opcode over the right-shift one.
/// \param VT       result type; asserted to be a 128-bit vector.
/// \param SrcOp    value to shift; bitcast to v16i8 first.
/// \param NumBits  shift amount in bits; asserted to be a multiple of 8 and
///                 passed to the node as NumBits/8.
///
/// NOTE(review): the last line of the parameter list (which declares `dl`) is
/// not visible in this listing.
6124 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6125 SelectionDAG &DAG, const TargetLowering &TLI,
6127 assert(VT.is128BitVector() && "Unknown type for VShift");
6128 MVT ShVT = MVT::v16i8;
6129 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6130 SrcOp = DAG.getBitcast(ShVT, SrcOp);
// The shift amount constant uses the target's scalar shift-amount type.
6131 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6132 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6133 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6134 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
/// Try to turn a splat of a scalar i32/f32 load from a stack slot into a wider
/// vector load followed by a splat shuffle.
///
/// When \p SrcOp is a normal, non-volatile load whose address is a frame index
/// (optionally plus a constant offset), the load is widened to a vector with
/// the element count of \p VT, and a shuffle mask filled with one element
/// number picks the originally loaded scalar out of it.
///
/// NOTE(review): the embedded numbering skips in this listing, so the early
/// exits, the declarations of FI, Offset and DL, and the closing braces are
/// not visible. Failed checks presumably return an empty SDValue -- confirm
/// against the full file.
6137 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6138 SelectionDAG &DAG) {
6140 // Check if the scalar load can be widened into a vector load. And if
6141 // the address is "base + cst" see if the cst can be "absorbed" into
6142 // the shuffle mask.
6143 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6144 SDValue Ptr = LD->getBasePtr();
6145 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
// Only 32-bit scalar loads (i32 or f32) are handled.
6147 EVT PVT = LD->getValueType(0);
6148 if (PVT != MVT::i32 && PVT != MVT::f32)
// Accept an address that is a bare frame index, or frame index + constant.
// In the second form the constant is kept in Offset and Ptr is reduced to the
// frame index itself.
6153 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6154 FI = FINode->getIndex();
6156 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6157 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6158 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6159 Offset = Ptr.getConstantOperandVal(1);
6160 Ptr = Ptr.getOperand(0);
6165 // FIXME: 256-bit vector instructions don't require a strict alignment,
6166 // improve this code to support it better.
// The required alignment is the full vector size in bytes.
6167 unsigned RequiredAlign = VT.getSizeInBits()/8;
6168 SDValue Chain = LD->getChain();
6169 // Make sure the stack object alignment is at least 16 or 32.
6170 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6171 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6172 if (MFI.isFixedObjectIndex(FI)) {
6173 // Can't change the alignment. FIXME: It's possible to compute
6174 // the exact stack offset and reference FI + adjust offset instead.
6175 // If someone *really* cares about this. That's the way to implement it.
// For non-fixed objects, raise the stack object's alignment instead.
6178 MFI.setObjectAlignment(FI, RequiredAlign);
6182 // (Offset % 16 or 32) must be a multiple of 4. The load address is then
6183 // Ptr + (Offset & ~15).
6186 if ((Offset % RequiredAlign) & 3)
// Round the offset down to the start of the enclosing aligned vector.
6188 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6191 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6192 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
// Index of the wanted 4-byte element inside the widened load.
6195 int EltNo = (Offset - StartOffset) >> 2;
6196 unsigned NumElems = VT.getVectorNumElements();
6198 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6199 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6200 LD->getPointerInfo().getWithOffset(StartOffset));
// Splat mask: every result lane reads element EltNo of the loaded vector.
6202 SmallVector<int, 8> Mask(NumElems, EltNo);
6204 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6210 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6211 /// elements can be replaced by a single large load which has the same value as
6212 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6214 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
///
/// Three outcomes are produced by the visible code: a single full-width load,
/// a full-width load shuffled against a zero vector, or an X86ISD::VZEXT_LOAD
/// covering a 32/64-bit prefix of the vector. All-undef and all-zero/undef
/// inputs are answered directly with UNDEF or a zero constant.
///
/// NOTE(review): the embedded numbering skips in this listing, so several
/// short lines (early exits, `continue`, some assignments such as the update
/// of LastLoadedElt, declarations of NewLd/NewChain/ResNode/LoadSize, closing
/// braces) are not visible. Comments about them are hedged.
6215 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6216 SDLoc &DL, SelectionDAG &DAG,
6217 bool isAfterLegalize) {
6218 unsigned NumElems = Elts.size();
// One bit per element for each of the three classifications.
6220 int LastLoadedElt = -1;
6221 SmallBitVector LoadMask(NumElems, false);
6222 SmallBitVector ZeroMask(NumElems, false);
6223 SmallBitVector UndefMask(NumElems, false);
6225 // For each element in the initializer, see if we've found a load, zero or an
// undef.
6227 for (unsigned i = 0; i < NumElems; ++i) {
// Classification looks through bitcasts to the underlying node.
6228 SDValue Elt = peekThroughBitcasts(Elts[i]);
6233 UndefMask[i] = true;
6234 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6236 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6239 // Each loaded element must be the correct fractional portion of the
6240 // requested vector load.
6241 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
// Every element must have landed in exactly one of the three masks.
6246 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6247 "Incomplete element masks");
6249 // Handle Special Cases - all undef or undef/zero.
6250 if (UndefMask.count() == NumElems)
6251 return DAG.getUNDEF(VT);
6253 // FIXME: Should we return this as a BUILD_VECTOR instead?
6254 if ((ZeroMask | UndefMask).count() == NumElems)
6255 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6256 : DAG.getConstantFP(0.0, DL, VT);
// The first loaded element provides the base load that all others are
// compared against.
6258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6259 int FirstLoadedElt = LoadMask.find_first();
6260 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6261 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6262 EVT LDBaseVT = EltBase.getValueType();
6264 // Consecutive loads can contain UNDEFS but not ZERO elements.
6265 // Consecutive loads with UNDEFs and ZEROs elements require
6266 // an additional shuffle stage to clear the ZERO elements.
6267 bool IsConsecutiveLoad = true;
6268 bool IsConsecutiveLoadWithZeros = true;
6269 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
// NOTE(review): the guard selecting loaded elements for this check is not
// visible; presumably it tests LoadMask[i].
6271 SDValue Elt = peekThroughBitcasts(Elts[i]);
6272 LoadSDNode *LD = cast<LoadSDNode>(Elt);
// The load must sit exactly (i - FirstLoadedElt) elements after the base,
// with the element store size in bytes as the stride.
6273 if (!DAG.areNonVolatileConsecutiveLoads(
6274 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6275 i - FirstLoadedElt)) {
6276 IsConsecutiveLoad = false;
6277 IsConsecutiveLoadWithZeros = false;
// A zero inside the loaded range rules out the plain-load form only.
6280 } else if (ZeroMask[i]) {
6281 IsConsecutiveLoad = false;
// Helper: emit a load of type VT reusing LDBase's chain, base pointer,
// pointer info, alignment and memory-operand flags.
6285 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6286 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6287 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6288 "Cannot merge volatile loads.");
6290 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6291 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
// If anything uses LDBase's output chain, join the old and new chains with a
// TokenFactor and redirect those users to it. The UpdateNodeOperands call then
// resets the TokenFactor's operands to (old chain, new chain) -- presumably
// because the replace-all-uses step also rewrote the TokenFactor's own first
// operand; confirm against the SelectionDAG API.
6293 if (LDBase->hasAnyUseOfValue(1)) {
6295 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6296 SDValue(NewLd.getNode(), 1));
6297 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6298 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6299 SDValue(NewLd.getNode(), 1));
6305 // LOAD - all consecutive load/undefs (must start/end with a load).
6306 // If we have found an entire vector of loads and undefs, then return a large
6307 // load of the entire vector width starting at the base pointer.
6308 // If the vector contains zeros, then attempt to shuffle those elements.
6309 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6310 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6311 assert(LDBase && "Did not find base load for merging consecutive loads");
6312 EVT EltVT = LDBase->getValueType(0);
6313 // Ensure that the input vector size for the merged loads matches the
6314 // cumulative size of the input elements.
6315 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
// After legalization, only emit the wide load if it is legal for the target.
6318 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6321 if (IsConsecutiveLoad)
6322 return CreateLoad(VT, LDBase);
6324 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6325 // vector and a zero vector to clear out the zero elements.
6326 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
// ClearMask starts as all-undef (-1). Indices >= NumElems select from the
// zero vector, which is the second shuffle operand below.
6327 SmallVector<int, 4> ClearMask(NumElems, -1);
6328 for (unsigned i = 0; i < NumElems; ++i) {
6330 ClearMask[i] = i + NumElems;
6331 else if (LoadMask[i])
6334 SDValue V = CreateLoad(VT, LDBase);
6335 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6336 : DAG.getConstantFP(0.0, DL, VT);
6337 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
// Size in bits of the span from the first to the last loaded element.
6342 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6344 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6345 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6346 (LoadSize == 32 || LoadSize == 64) &&
6347 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
// Build a vector type whose scalar is exactly LoadSize bits wide (FP or
// integer to match VT) and which spans the same total width as VT.
6348 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6349 : MVT::getIntegerVT(LoadSize);
6350 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6351 if (TLI.isTypeLegal(VecVT)) {
6352 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6353 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6355 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6356 LDBase->getPointerInfo(),
6357 LDBase->getAlignment(),
6358 false/*isVolatile*/, true/*ReadMem*/,
6361 // Make sure the newly-created LOAD is in the same position as LDBase in
6362 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6363 // and update uses of LDBase's output chain to use the TokenFactor.
6364 if (LDBase->hasAnyUseOfValue(1)) {
6366 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6367 SDValue(ResNode.getNode(), 1));
6368 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6369 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6370 SDValue(ResNode.getNode(), 1));
6373 return DAG.getBitcast(VT, ResNode);
/// Build an IR constant vector holding one repetition of a splat pattern.
///
/// \p SplatValue is cut into SplatBitSize / ScalarSize pieces, each
/// VT.getScalarSizeInBits() wide, taken from the least-significant bits
/// upwards. Each piece becomes a float/double constant when \p VT is a
/// floating-point type, or an integer constant of that width otherwise.
///
/// \returns a ConstantVector of the pieces in that order.
///
/// NOTE(review): the embedded numbering skips in this listing, so the
/// declaration of `Const`, the `else` keywords and the closing braces are not
/// visible.
6380 static Constant *getConstantVector(MVT VT, APInt SplatValue,
6381 unsigned SplatBitSize, LLVMContext &C) {
6382 unsigned ScalarSize = VT.getScalarSizeInBits();
6383 unsigned NumElm = SplatBitSize / ScalarSize;
6385 SmallVector<Constant *, 32> ConstantVec;
6386 for (unsigned i = 0; i < NumElm; i++) {
// Piece i occupies bits [ScalarSize*i, ScalarSize*(i+1)) of SplatValue.
6387 APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
6389 if (VT.isFloatingPoint()) {
6390 assert((ScalarSize == 32 || ScalarSize == 64) &&
6391 "Unsupported floating point scalar size");
// Reinterpret the raw bits as float or double; no numeric conversion.
6392 if (ScalarSize == 32)
6393 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6395 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6397 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6398 ConstantVec.push_back(Const);
6400 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
/// Check whether node \p N is used by a target shuffle, looking through
/// BITCAST users recursively.
///
/// NOTE(review): the return statements for the "found" and "not found" cases
/// are not visible in this listing (the embedded numbering skips them).
/// NOTE(review): the BITCAST branch returns the recursive result for the first
/// bitcast user it meets, so users that come after it in the use list are not
/// examined -- confirm this is intended.
6403 static bool isUseOfShuffle(SDNode *N) {
6404 for (auto *U : N->uses()) {
6405 if (isTargetShuffle(U->getOpcode()))
6407 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6408 return isUseOfShuffle(U);
6413 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6414 /// following cases:
6415 /// 1. A splat BUILD_VECTOR which uses:
6416 /// a. A single scalar load, or a constant.
6417 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6418 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6419 /// a scalar load, or a constant.
6421 /// The VBROADCAST node is returned when a pattern is found,
6422 /// or SDValue() otherwise.
///
/// NOTE(review): the embedded numbering skips in this listing, so many short
/// lines (the `return SDValue();` exits, declarations of dl/HasUndef/CP, the
/// `Ld = DAG.getLoad(` call heads and their trailing alignment argument,
/// closing braces) are not visible. Comments about them are hedged.
6423 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6424 SelectionDAG &DAG) {
6425 // VBROADCAST requires AVX.
6426 // TODO: Splats could be generated for non-AVX CPUs using SSE
6427 // instructions, but there's less potential gain for only 128-bit vectors.
6428 if (!Subtarget.hasAVX())
6431 MVT VT = BVOp->getSimpleValueType(0);
6434 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6435 "Unsupported vector type for broadcast.");
// Ld is the splatted scalar, if the build_vector is a splat; UndefElements
// records which lanes were undef.
6437 BitVector UndefElements;
6438 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6440 // We need a splat of a single value to use broadcast, and it doesn't
6441 // make any sense if the value is only in one element of the vector.
// When there is no usable single-value splat, fall back to looking for a
// repeated multi-element constant pattern.
6442 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6443 APInt SplatValue, Undef;
6444 unsigned SplatBitSize;
6446 // Check if this is a repeated constant pattern suitable for broadcasting.
// The repeating unit must be wider than one element and narrower than the
// whole vector.
6447 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6448 SplatBitSize > VT.getScalarSizeInBits() &&
6449 SplatBitSize < VT.getSizeInBits()) {
6450 // Avoid replacing with broadcast when it's a use of a shuffle
6451 // instruction to preserve the present custom lowering of shuffles.
6452 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6454 // replace BUILD_VECTOR with broadcast of the repeated constants.
6455 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6456 LLVMContext *Ctx = DAG.getContext();
6457 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6458 if (Subtarget.hasAVX()) {
// Case 1: pattern of at most 64 bits with AVX2, excluding a 64-bit pattern on
// a 32-bit subtarget. Broadcast it as one integer scalar.
6459 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6460 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6461 // Splatted value can fit in one INTEGER constant in constant pool.
6462 // Load the constant and broadcast it.
6463 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6464 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6465 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6466 SDValue CP = DAG.getConstantPool(C, PVT);
// Number of times the pattern repeats across the full vector.
6467 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6469 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6471 CVT, dl, DAG.getEntryNode(), CP,
6472 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6474 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6475 MVT::getVectorVT(CVT, Repeat), Ld);
6476 return DAG.getBitcast(VT, Brdcst);
// Case 2: 32- or 64-bit pattern not handled above. Broadcast it as a
// float/double scalar built from the same bits.
6477 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6478 // Splatted value can fit in one FLOAT constant in constant pool.
6479 // Load the constant and broadcast it.
6480 // AVX have support for 32 and 64 bit broadcast for floats only.
6481 // No 64bit integer in 32bit subtarget.
6482 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6483 Constant *C = SplatBitSize == 32
6484 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6485 SplatValue.bitsToFloat())
6486 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6487 SplatValue.bitsToDouble());
6488 SDValue CP = DAG.getConstantPool(C, PVT);
6489 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6491 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6493 CVT, dl, DAG.getEntryNode(), CP,
6494 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6496 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6497 MVT::getVectorVT(CVT, Repeat), Ld);
6498 return DAG.getBitcast(VT, Brdcst);
// Case 3: pattern wider than 64 bits. Put one repetition in the constant pool
// as a vector and broadcast it with SUBV_BROADCAST.
6499 } else if (SplatBitSize > 64) {
6500 // Load the vector of constants and broadcast it.
6501 MVT CVT = VT.getScalarType();
6502 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6504 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6505 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6506 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6508 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6509 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6511 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6512 return DAG.getBitcast(VT, Brdcst);
// From here on Ld is used as the splatted scalar.
6519 bool ConstSplatVal =
6520 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6522 // Make sure that all of the users of a non-constant load are from the
6523 // BUILD_VECTOR node.
6524 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6527 unsigned ScalarSize = Ld.getValueSizeInBits();
6528 bool IsGE256 = (VT.getSizeInBits() >= 256);
6530 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6531 // instruction to save 8 or more bytes of constant pool data.
6532 // TODO: If multiple splats are generated to load the same constant,
6533 // it may be detrimental to overall size. There needs to be a way to detect
6534 // that condition to know if this is truly a size win.
6535 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6537 // Handle broadcasting a single constant scalar from the constant pool
6539 // On Sandybridge (no AVX2), it is still better to load a constant vector
6540 // from the constant pool and not to broadcast it from a scalar.
6541 // But override that restriction when optimizing for size.
6542 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6543 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6544 EVT CVT = Ld.getValueType();
6545 assert(!CVT.isVector() && "Must not broadcast a vector type");
6547 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6548 // For size optimization, also splat v2f64 and v2i64, and for size opt
6549 // with AVX2, also splat i8 and i16.
6550 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6551 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6552 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
// Recover the IR constant behind the integer or FP constant node.
6553 const Constant *C = nullptr;
6554 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6555 C = CI->getConstantIntValue();
6556 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6557 C = CF->getConstantFPValue();
6559 assert(C && "Invalid constant type");
// Materialize the scalar in the constant pool and broadcast the loaded value.
6561 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6563 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6564 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6566 CVT, dl, DAG.getEntryNode(), CP,
6567 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6570 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6574 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6576 // Handle AVX2 in-register broadcasts.
6577 if (!IsLoad && Subtarget.hasInt256() &&
6578 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6579 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6581 // The scalar source must be a normal load.
// NOTE(review): the check that enforces the comment above is not visible in
// this listing.
6585 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6586 (Subtarget.hasVLX() && ScalarSize == 64))
6587 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6589 // The integer check is needed for the 64-bit into 128-bit so it doesn't match
6590 // double since there is no vbroadcastsd xmm
6591 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6592 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6593 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6596 // Unsupported broadcast.
6600 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6601 /// underlying vector and index.
6603 /// Modifies \p ExtractedFromVec to the real vector and returns the real
/// index.
///
/// \p ExtIdx is cast to a ConstantSDNode unconditionally, so it must be a
/// constant index node.
///
/// NOTE(review): the second line of the parameter list, the return statements
/// and the closing braces are not visible in this listing (the embedded
/// numbering skips them). Which index is returned on each path cannot be
/// confirmed from here.
6605 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6607 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
// Nothing to look through unless the source vector is itself a shuffle.
6608 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6611 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// rewritten an extract such as
6613 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
// into an extract from a shuffle of a subvector:
6615 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6616 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6619 // In this case the vector is the extract_subvector expression and the index
6620 // is 2, as specified by the shuffle.
6621 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6622 SDValue ShuffleVec = SVOp->getOperand(0);
6623 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6624 assert(ShuffleVecVT.getVectorElementType() ==
6625 ExtractedFromVec.getSimpleValueType().getVectorElementType());
// Translate the extract index through the shuffle mask. Only a mask entry
// that is undef or refers to the shuffle's first operand lets us replace the
// source vector with that operand.
6627 int ShuffleIdx = SVOp->getMaskElt(Idx);
6628 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6629 ExtractedFromVec = ShuffleVec;
/// Try to lower a build_vector whose operands are mostly EXTRACT_VECTOR_ELT
/// nodes as a shuffle of up to two source vectors, followed by
/// INSERT_VECTOR_ELT nodes for the operands that are not extracts.
///
/// NOTE(review): the embedded numbering skips in this listing, so the early
/// `return SDValue();` exits, `continue` statements, declarations of DL,
/// VecIn1 and VecIn2, the final return and the closing braces are not
/// visible. Comments about the exits are hedged.
6635 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6636 MVT VT = Op.getSimpleValueType();
6638 // Skip if insert_vec_elt is not supported.
6639 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6640 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6644 unsigned NumElems = Op.getNumOperands();
// InsertIndices collects operand positions that must be inserted afterwards;
// Mask is the shuffle mask, initially all-undef (-1).
6648 SmallVector<unsigned, 4> InsertIndices;
6649 SmallVector<int, 8> Mask(NumElems, -1);
6651 for (unsigned i = 0; i != NumElems; ++i) {
6652 unsigned Opc = Op.getOperand(i).getOpcode();
// Undef operands presumably keep their -1 mask entry -- the statement under
// this test is not visible here.
6654 if (Opc == ISD::UNDEF)
6657 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6658 // Quit if more than 1 elements need inserting.
6659 if (InsertIndices.size() > 1)
6662 InsertIndices.push_back(i);
6666 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6667 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6668 // Quit if non-constant index.
6669 if (!isa<ConstantSDNode>(ExtIdx))
// May replace ExtractedFromVec with the vector underneath a shuffle.
6671 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6673 // Quit if extracted from vector of different type.
6674 if (ExtractedFromVec.getValueType() != VT)
// Track at most two distinct source vectors.
6677 if (!VecIn1.getNode())
6678 VecIn1 = ExtractedFromVec;
6679 else if (VecIn1 != ExtractedFromVec) {
6680 if (!VecIn2.getNode())
6681 VecIn2 = ExtractedFromVec;
6682 else if (VecIn2 != ExtractedFromVec)
6683 // Quit if more than 2 vectors to shuffle
// Mask entries for the second source are offset by NumElems.
6687 if (ExtractedFromVec == VecIn1)
6689 else if (ExtractedFromVec == VecIn2)
6690 Mask[i] = Idx + NumElems;
6693 if (!VecIn1.getNode())
// Use undef as the second shuffle input when only one source was found.
6696 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6697 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
// Patch in the operands that were not extracts, each at its own position.
6698 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6699 unsigned Idx = InsertIndices[i];
6700 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6701 DAG.getIntPtrConstant(Idx, DL));
/// Pack a constant build_vector of i1 elements into one integer constant.
///
/// Operand idx contributes its zero-extended value shifted left by idx, so
/// element 0 lands in the least-significant bit. The resulting constant has
/// an integer type as wide as \p Op, but never narrower than 8 bits.
///
/// \p Op is asserted to be a build_vector of constant nodes with a 1-bit
/// scalar type.
///
/// NOTE(review): a guard on `In` before the accumulation, the declaration of
/// `dl` and the closing braces are not visible in this listing (the embedded
/// numbering skips them).
6707 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6708 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6709 Op.getScalarValueSizeInBits() == 1 &&
6710 "Can not convert non-constant vector");
6711 uint64_t Immediate = 0;
6712 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6713 SDValue In = Op.getOperand(idx);
6715 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
// Round the integer width up to at least 8 bits.
6718 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6719 return DAG.getConstant(Immediate, dl, VT);
6721 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
//
// Strategies visible below, in order:
//   - all-zeros / all-ones vectors become a target constant 0 / 1 of type VT;
//   - fully constant vectors are packed into an integer and bitcast to VT;
//   - a splat of one value becomes a SELECT between constant 1 and constant 0
//     of type VT;
//   - otherwise the constant lanes are materialized as an immediate and the
//     non-constant lanes are inserted one by one with INSERT_VECTOR_ELT.
//
// NOTE(review): the embedded numbering skips in this listing, so the return
// type line, declarations of dl/SplatIdx/DstVec/Imm, several guards and
// `else` keywords, the final return and the closing braces are not visible.
6723 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6725 MVT VT = Op.getSimpleValueType();
6726 assert((VT.getVectorElementType() == MVT::i1) &&
6727 "Unexpected type in LowerBUILD_VECTORvXi1!");
6730 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6731 return DAG.getTargetConstant(0, dl, VT);
6733 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6734 return DAG.getTargetConstant(1, dl, VT);
6736 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6737 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
// When the packed integer is exactly as wide as VT a bitcast is enough.
// Otherwise go through v8i1 and take the subvector starting at element 0.
6738 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6739 return DAG.getBitcast(VT, Imm);
6740 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6741 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6742 DAG.getIntPtrConstant(0, dl));
6745 // Vector has one or more non-const elements
6746 uint64_t Immediate = 0;
6747 SmallVector<unsigned, 16> NonConstIdx;
6748 bool IsSplat = true;
6749 bool HasConstElts = false;
6751 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6752 SDValue In = Op.getOperand(idx);
// Record non-constant lanes for later insertion. Constant lanes are folded
// into Immediate at bit position idx.
6755 if (!isa<ConstantSDNode>(In))
6756 NonConstIdx.push_back(idx);
6758 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6759 HasConstElts = true;
// Splat tracking: compare against the operand at SplatIdx. The statement run
// on a mismatch is not visible; presumably it clears IsSplat.
6763 else if (In != Op.getOperand(SplatIdx))
6767 // for splat use " (select i1 splat_elt, all-ones, all-zeroes)"
6769 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6770 DAG.getConstant(1, dl, VT),
6771 DAG.getConstant(0, dl, VT));
6773 // insert elements one by one
// Choose the starting vector: the packed immediate, a zero constant when
// there are constant elements, or undef. The first of the three conditions is
// not visible here.
6777 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6778 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6780 else if (HasConstElts)
6781 Imm = DAG.getConstant(0, dl, VT);
6783 Imm = DAG.getUNDEF(VT);
6784 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6785 DstVec = DAG.getBitcast(VT, Imm);
6787 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6788 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6789 DAG.getIntPtrConstant(0, dl));
// Insert each non-constant operand at its original lane.
6792 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6793 unsigned InsertIdx = NonConstIdx[i];
6794 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6795 Op.getOperand(InsertIdx),
6796 DAG.getIntPtrConstant(InsertIdx, dl));
6801 /// \brief Return true if \p N implements a horizontal binop and return the
6802 /// operands for the horizontal binop into V0 and V1.
6804 /// This is a helper function of LowerToHorizontalOp().
6805 /// This function checks that the build_vector \p N in input implements a
6806 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6807 /// operation to match.
6808 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6809 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6810 /// is equal to ISD::SUB, then this function checks if this is a horizontal
/// arithmetic sub.
6813 /// This function only analyzes elements of \p N whose indices are
6814 /// in range [BaseIdx, LastIdx).
///
/// Elements in the first half of that range are matched against V0 and those
/// in the second half against V1. Both start out as UNDEF of the result type.
///
/// NOTE(review): the embedded numbering skips in this listing, so one line of
/// the parameter list (declaring DAG), the `continue`/`break`/`return`
/// statements, some `else` keywords, the final return and the closing braces
/// are not visible.
6815 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6817 unsigned BaseIdx, unsigned LastIdx,
6818 SDValue &V0, SDValue &V1) {
6819 EVT VT = N->getValueType(0);
6821 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6822 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6823 "Invalid Vector in input!");
// Only ADD and FADD may have their extract operands in either order.
6825 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6826 bool CanFold = true;
// Index of the source lane the next matched element is expected to start at.
6827 unsigned ExpectedVExtractIdx = BaseIdx;
6828 unsigned NumElts = LastIdx - BaseIdx;
6829 V0 = DAG.getUNDEF(VT);
6830 V1 = DAG.getUNDEF(VT);
6832 // Check if N implements a horizontal binop.
6833 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6834 SDValue Op = N->getOperand(i + BaseIdx);
// Undef elements match anything, but the expected index still advances by
// two, and restarts at BaseIdx at the midpoint of the range.
6837 if (Op->isUndef()) {
6838 // Update the expected vector extract index.
6839 if (i * 2 == NumElts)
6840 ExpectedVExtractIdx = BaseIdx;
6841 ExpectedVExtractIdx += 2;
// The element must be the requested binop and have a single use.
6845 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6850 SDValue Op0 = Op.getOperand(0);
6851 SDValue Op1 = Op.getOperand(1);
6853 // Try to match the following pattern:
6854 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6855 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6856 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6857 Op0.getOperand(0) == Op1.getOperand(0) &&
6858 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6859 isa<ConstantSDNode>(Op1.getOperand(1)));
6863 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6864 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
// Bind V0 (first half) or V1 (second half) to the extract source vector,
// which must have the same type as the result. The guards around these
// assignments are not visible here.
6866 if (i * 2 < NumElts) {
6868 V0 = Op0.getOperand(0);
6869 if (V0.getValueType() != VT)
6874 V1 = Op0.getOperand(0);
6875 if (V1.getValueType() != VT)
6878 if (i * 2 == NumElts)
6879 ExpectedVExtractIdx = BaseIdx;
// The two extracts must read adjacent lanes starting at the expected index,
// from the vector bound for this half.
6882 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6883 if (I0 == ExpectedVExtractIdx)
6884 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6885 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6886 // Try to match the following dag sequence:
6887 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6888 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6892 ExpectedVExtractIdx += 2;
6898 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6899 /// a concat_vector.
6901 /// This is a helper function of LowerToHorizontalOp().
6902 /// This function expects two 256-bit vectors called V0 and V1.
6903 /// At first, each vector is split into two separate 128-bit vectors.
6904 /// Then, the resulting 128-bit vectors are used to implement two
6905 /// horizontal binary operations.
6907 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6909 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6910 /// the two new horizontal binop.
6911 /// When Mode is set, the first horizontal binop dag node would take as input
6912 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6913 /// horizontal binop dag node would take as input the lower 128-bit of V1
6914 /// and the upper 128-bit of V1.
6916 /// HADD V0_LO, V0_HI
6917 /// HADD V1_LO, V1_HI
6919 /// Otherwise, the first horizontal binop dag node takes as input the lower
6920 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6921 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6923 /// HADD V0_LO, V1_LO
6924 /// HADD V0_HI, V1_HI
6926 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6927 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6928 /// the upper 128-bits of the result.
///
/// \returns a CONCAT_VECTORS of the two 128-bit halves, of the same type as
/// \p V0.
///
/// NOTE(review): the `if (Mode) {` / `} else {` lines that separate the two
/// pairs of binops below, and the closing braces, are not visible in this
/// listing (the embedded numbering skips them).
6929 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6930 const SDLoc &DL, SelectionDAG &DAG,
6931 unsigned X86Opcode, bool Mode,
6932 bool isUndefLO, bool isUndefHI) {
6933 MVT VT = V0.getSimpleValueType();
6934 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6935 "Invalid nodes in input!");
// Split both inputs into their lower and upper 128-bit halves.
6937 unsigned NumElts = VT.getVectorNumElements();
6938 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6939 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6940 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6941 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6942 MVT NewVT = V0_LO.getSimpleValueType();
// Both result halves default to UNDEF and are only overwritten when needed.
6944 SDValue LO = DAG.getUNDEF(NewVT);
6945 SDValue HI = DAG.getUNDEF(NewVT);
// First pairing (the "Mode set" form in the header): each binop combines the
// two halves of one input.
6948 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6949 if (!isUndefLO && !V0->isUndef())
6950 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6951 if (!isUndefHI && !V1->isUndef())
6952 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
// Second pairing (the "otherwise" form in the header): each binop combines
// matching halves of V0 and V1.
6954 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6955 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6956 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6958 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6959 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6962 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6965 /// Try to fold a build_vector that performs an 'addsub' to an X86ISD::ADDSUB
/// node.
///
/// The matcher walks the operands of \p BV expecting an alternating
/// FSUB, FADD, FSUB, ... sequence (undef operands are tolerated), where each
/// operation combines two EXTRACT_VECTOR_ELT nodes that use the same constant
/// index. On success the node built is (X86ISD::ADDSUB InVec0, InVec1).
6967 static SDValue LowerToAddSub(const BuildVectorSDNode *BV,
6968 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
6969 MVT VT = BV->getSimpleValueType(0);
// Guard: true when neither (SSE3 with v4f32/v2f64) nor (AVX with v8f32/v4f64)
// applies, i.e. when the type/subtarget combination is not supported.
6970 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6971 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
// The two source vectors are discovered while scanning; UNDEF means "not yet
// seen".
6975 unsigned NumElts = VT.getVectorNumElements();
6976 SDValue InVec0 = DAG.getUNDEF(VT);
6977 SDValue InVec1 = DAG.getUNDEF(VT);
6979 assert((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
6980 VT == MVT::v2f64) && "build_vector with an invalid type found!");
6982 // Odd-numbered elements in the input build vector are obtained from
6983 // adding two floating-point elements.
6984 // Even-numbered elements in the input build vector are obtained from
6985 // subtracting two floating-point elements.
// Element 0 is therefore expected to be an FSUB; the two opcodes are swapped
// after every element so the expectation alternates.
6986 unsigned ExpectedOpcode = ISD::FSUB;
6987 unsigned NextExpectedOpcode = ISD::FADD;
6988 bool AddFound = false;
6989 bool SubFound = false;
6991 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6992 SDValue Op = BV->getOperand(i);
6994 // Skip 'undef' values.
// The expected opcode still advances so the alternation stays in step with
// the element index.
6995 unsigned Opcode = Op.getOpcode();
6996 if (Opcode == ISD::UNDEF) {
6997 std::swap(ExpectedOpcode, NextExpectedOpcode);
7001 // Early exit if we found an unexpected opcode.
7002 if (Opcode != ExpectedOpcode)
7005 SDValue Op0 = Op.getOperand(0);
7006 SDValue Op1 = Op.getOperand(1);
7008 // Try to match the following pattern:
7009 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7010 // Early exit if we cannot match that sequence.
// Both operands must be extracts with constant indices, and the two index
// operands must be identical.
7011 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7012 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7013 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7014 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7015 Op0.getOperand(1) != Op1.getOperand(1))
// I0 is the constant extract index of the first operand.
7018 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7022 // We found a valid add/sub node. Update the information accordingly.
7028 // Update InVec0 and InVec1.
// The first matched element fixes the source vectors; each must have the same
// type as the build_vector.
7029 if (InVec0.isUndef()) {
7030 InVec0 = Op0.getOperand(0);
7031 if (InVec0.getSimpleValueType() != VT)
7034 if (InVec1.isUndef()) {
7035 InVec1 = Op1.getOperand(0);
7036 if (InVec1.getSimpleValueType() != VT)
7040 // Make sure that operands in input to each add/sub node always
7041 // come from a same pair of vectors.
7042 if (InVec0 != Op0.getOperand(0)) {
// FSUB is not commutable, so a mismatch on an FSUB cannot be repaired by
// swapping its operands.
7043 if (ExpectedOpcode == ISD::FSUB)
7046 // FADD is commutable. Try to commute the operands
7047 // and then test again.
7048 std::swap(Op0, Op1);
7049 if (InVec0 != Op0.getOperand(0))
7053 if (InVec1 != Op1.getOperand(0))
7056 // Update the pair of expected opcodes.
7057 std::swap(ExpectedOpcode, NextExpectedOpcode);
7060 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
// At least one add and one sub must also have been matched.
7061 if (AddFound && SubFound && !InVec0.isUndef() && !InVec1.isUndef())
7062 return DAG.getNode(X86ISD::ADDSUB, DL, VT, InVec0, InVec1);
7067 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
///
/// 128-bit types are matched against a single FHADD/FHSUB (SSE3) or HADD/HSUB
/// (SSSE3) node. 256-bit types are matched per half and emitted either as a
/// single node or, through ExpandHorizontalBinOp, as two 128-bit horizontal
/// operations that are concatenated.
7068 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7069 const X86Subtarget &Subtarget,
7070 SelectionDAG &DAG) {
7071 MVT VT = BV->getSimpleValueType(0);
7072 unsigned NumElts = VT.getVectorNumElements();
// Undef operands are counted separately for the low half [0, Half) and the
// high half [Half, NumElts) of the build_vector.
7073 unsigned NumUndefsLO = 0;
7074 unsigned NumUndefsHI = 0;
7075 unsigned Half = NumElts/2;
7077 // Count the number of UNDEF operands in the build_vector in input.
7078 for (unsigned i = 0, e = Half; i != e; ++i)
7079 if (BV->getOperand(i)->isUndef())
7082 for (unsigned i = Half, e = NumElts; i != e; ++i)
7083 if (BV->getOperand(i)->isUndef())
7086 // Early exit if this is either a build_vector of all UNDEFs or all the
7087 // operands but one are UNDEF.
// Equivalently: at most one operand is defined.
7088 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
// InVec0/InVec1 receive the two source vectors of a matched horizontal op.
7092 SDValue InVec0, InVec1;
7093 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7094 // Try to match an SSE3 float HADD/HSUB.
7095 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7096 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7098 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7099 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7100 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7101 // Try to match an SSSE3 integer HADD/HSUB.
7102 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7103 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7105 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7106 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
// Everything below handles the 256-bit types and is gated on AVX.
7109 if (!Subtarget.hasAVX())
7112 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7113 // Try to match an AVX horizontal add/sub of packed single/double
7114 // precision floating point values from 256-bit vectors.
// Each half of the build_vector is matched on its own (InVec0/1 for the low
// half, InVec2/3 for the high half). Where both halves name a source vector
// the two must agree; an undef on either side is accepted.
7115 SDValue InVec2, InVec3;
7116 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7117 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7118 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7119 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7120 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7122 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7123 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7124 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7125 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7126 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7127 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7128 // Try to match an AVX2 horizontal add/sub of signed integers.
// Same per-half matching as the floating-point case above, but only the
// opcode is recorded here; the node is built further down.
7129 SDValue InVec2, InVec3;
7131 bool CanFold = true;
7133 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7134 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7135 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7136 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7137 X86Opcode = X86ISD::HADD;
7138 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7139 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7140 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7141 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7142 X86Opcode = X86ISD::HSUB;
7147 // Fold this build_vector into a single horizontal add/sub.
7148 // Do this only if the target has AVX2.
7149 if (Subtarget.hasAVX2())
7150 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7152 // Do not try to expand this build_vector into a pair of horizontal
7153 // add/sub if we can emit a pair of scalar add/sub.
// The test is true when a half has exactly one defined operand.
7154 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7157 // Convert this build_vector into a pair of horizontal binop followed by
// a concatenation. A half whose operands are all undef is reported as undef
// so that no binop is emitted for it. Mode is passed as false here.
7159 bool isUndefLO = NumUndefsLO == Half;
7160 bool isUndefHI = NumUndefsHI == Half;
7161 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7162 isUndefLO, isUndefHI);
// Second attempt for the 256-bit types: match the build_vector as a whole
// (range [0, NumElts)) against each of the four horizontal opcodes.
7166 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7167 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7169 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7170 X86Opcode = X86ISD::HADD;
7171 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7172 X86Opcode = X86ISD::HSUB;
7173 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7174 X86Opcode = X86ISD::FHADD;
7175 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7176 X86Opcode = X86ISD::FHSUB;
7180 // Don't try to expand this build_vector into a pair of horizontal add/sub
7181 // if we can simply emit a pair of scalar add/sub.
7182 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7185 // Convert this build_vector into two horizontal add/sub followed by
// a concatenation. Mode is passed as true here.
7187 bool isUndefLO = NumUndefsLO == Half;
7188 bool isUndefHI = NumUndefsHI == Half;
7189 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7190 isUndefLO, isUndefHI);
7196 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7197 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7198 /// just apply the bit operation to the vectors.
7199 /// NOTE: It's not in our interest to start making a general-purpose vectorizer
7200 /// from this, but enough scalar bit operations are created from the later
7201 /// legalization + scalarization stages to need basic support.
7202 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7203 SelectionDAG &DAG) {
7205 MVT VT = Op->getSimpleValueType(0);
7206 unsigned NumElems = VT.getVectorNumElements();
7207 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7209 // Check that all elements have the same opcode.
7210 // TODO: Should we allow UNDEFS and if so how many?
// Element 0 supplies the reference opcode; every other element is compared
// against it.
7211 unsigned Opcode = Op->getOperand(0).getOpcode();
7212 for (unsigned i = 1; i < NumElems; ++i)
7213 if (Opcode != Op->getOperand(i).getOpcode())
7216 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
// Guard on the vector form of Opcode being legal (or promotable) for VT.
7223 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
// Collect the left and right operands of every element into two parallel
// lists.
7228 SmallVector<SDValue, 4> LHSElts, RHSElts;
7229 for (SDValue Elt : Op->ops()) {
7230 SDValue LHS = Elt.getOperand(0);
7231 SDValue RHS = Elt.getOperand(1);
7233 // We expect the canonicalized RHS operand to be the constant.
7234 if (!isa<ConstantSDNode>(RHS))
7236 LHSElts.push_back(LHS);
7237 RHSElts.push_back(RHS);
// Rebuild as one vector operation: Opcode(build_vector(LHS...),
// build_vector(RHS...)).
7240 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7241 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7242 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7245 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7246 /// functionality to do this, so it's all zeros, all ones, or some derivation
7247 /// that is cheap to calculate.
///
/// Only build_vectors that are entirely zeros or entirely ones are recognised
/// here.
7248 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7249 const X86Subtarget &Subtarget) {
7251 MVT VT = Op.getSimpleValueType();
7253 // Vectors containing all zeros can be matched by pxor and xorps.
7254 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7255 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7256 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
// NOTE(review): the statement guarded by this type check is not visible here;
// presumably these i32 vector types are already canonical and are kept as
// they are -- confirm.
7257 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
// Any other all-zeros type is rebuilt through getZeroVector.
7260 return getZeroVector(VT, Subtarget, DAG, DL);
7263 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7264 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7265 // vpcmpeqd on 256-bit vectors.
// This path requires SSE2. v8i32 is only part of the type check below when
// the subtarget reports 256-bit integer support (hasInt256).
7266 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7267 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7268 (VT == MVT::v8i32 && Subtarget.hasInt256()))
// Any other all-ones type is rebuilt through getOnesVector.
7271 return getOnesVector(VT, Subtarget, DAG, DL);
// Lowers a BUILD_VECTOR node (Op). Strategies are tried in order:
//   1. vXi1 predicate vectors on AVX-512;
//   2. all-zeros / all-ones constants (materializeVectorConstant);
//   3. pattern folds: ADDSUB, horizontal add/sub, broadcast, bit-op;
//   4. a single non-zero element;
//   5. splats, consecutive loads, and splitting 256/512-bit vectors in two;
//   6. per-element fallbacks (v16i8/v8i16/v4x32 helpers, shuffles, inserts,
//      unpacks).
7278 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7281 MVT VT = Op.getSimpleValueType();
7282 MVT ExtVT = VT.getVectorElementType();
7283 unsigned NumElems = Op.getNumOperands();
7285 // Generate vectors for predicate vectors.
7286 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7287 return LowerBUILD_VECTORvXi1(Op, DAG);
7289 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7290 return VectorConstant;
// Pattern-based folds: each helper yields a non-null SDValue on a match.
7292 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7293 if (SDValue AddSub = LowerToAddSub(BV, Subtarget, DAG))
7295 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7296 return HorizontalOp;
7297 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7299 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
// EVTBits is the width in bits of one element.
7302 unsigned EVTBits = ExtVT.getSizeInBits();
// Operand statistics used by the cases below. NonZeros is a bitmask with bit
// i set for operand i where the loop below records it; IsAllConstants is
// cleared as soon as an operand is neither Constant nor ConstantFP.
7304 unsigned NumZero = 0;
7305 unsigned NumNonZero = 0;
7306 uint64_t NonZeros = 0;
7307 bool IsAllConstants = true;
7308 SmallSet<SDValue, 8> Values;
7309 for (unsigned i = 0; i < NumElems; ++i) {
7310 SDValue Elt = Op.getOperand(i);
7314 if (Elt.getOpcode() != ISD::Constant &&
7315 Elt.getOpcode() != ISD::ConstantFP)
7316 IsAllConstants = false;
7317 if (X86::isZeroNode(Elt))
7320 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7321 NonZeros |= ((uint64_t)1 << i);
7326 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7327 if (NumNonZero == 0)
7328 return DAG.getUNDEF(VT);
7330 // Special case for single non-zero, non-undef, element.
7331 if (NumNonZero == 1) {
// Idx is the position of the lowest set bit of NonZeros, i.e. of that element.
7332 unsigned Idx = countTrailingZeros(NonZeros);
7333 SDValue Item = Op.getOperand(Idx);
7335 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7336 // the value are obviously zero, truncate the value to i32 and do the
7337 // insertion that way. Only do this if the value is non-constant or if the
7338 // value is a constant being inserted into element 0. It is cheaper to do
7339 // a constant pool load than it is to do a movd + shuffle.
7340 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7341 (!IsAllConstants || Idx == 0)) {
// Bits [32, 64) of the item are known to be zero, so an i32 carries the whole
// value.
7342 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
7344 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7345 MVT VecVT = MVT::v4i32;
7347 // Truncate the value (which may itself be a constant) to i32, and
7348 // convert it to a vector with movd (S2V+shuffle to zero extend).
// Idx * 2 re-expresses the v2i64 element index in v4i32 elements.
7349 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7350 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7351 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7352 Item, Idx * 2, true, Subtarget, DAG));
7356 // If we have a constant or non-constant insertion into the low element of
7357 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7358 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7359 // depending on what the source datatype is.
7362 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7364 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7365 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7366 assert((VT.is128BitVector() || VT.is256BitVector() ||
7367 VT.is512BitVector()) &&
7368 "Expected an SSE value type!");
7369 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7370 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7371 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7374 // We can't directly insert an i8 or i16 into a vector, so zero extend
// the item to i32, build an i32 vector from it, and bitcast the result back
// to VT at the end.
7376 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7377 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7378 if (VT.getSizeInBits() >= 256) {
// ShufVT is the i32 vector type with the same total width as VT.
7379 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7380 if (Subtarget.hasAVX()) {
7381 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7382 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7384 // Without AVX, we need to extend to a 128-bit vector and then
7385 // insert into the 256-bit vector.
7386 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7387 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7388 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7391 assert(VT.is128BitVector() && "Expected an SSE value type!");
7392 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7393 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7395 return DAG.getBitcast(VT, Item);
7399 // Is it a vector logical left shift?
// Two-element vector with a zero in element 0 and the value in element 1:
// place the scalar in a vector and shift it by half the vector width.
7400 if (NumElems == 2 && Idx == 1 &&
7401 X86::isZeroNode(Op.getOperand(0)) &&
7402 !X86::isZeroNode(Op.getOperand(1))) {
7403 unsigned NumBits = VT.getSizeInBits();
7404 return getVShift(true, VT,
7405 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7406 VT, Op.getOperand(1)),
7407 NumBits/2, DAG, *this, dl);
7410 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7413 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7414 // is a non-constant being inserted into an element other than the low one,
7415 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7416 // movd/movss) to move this into the low element, then shuffle it into
// position Idx. NOTE(review): the third argument (NumZero > 0) presumably
// selects zero rather than undef for the remaining elements -- confirm.
7418 if (EVTBits == 32) {
7419 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7420 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7424 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7425 if (Values.size() == 1) {
7426 if (EVTBits == 32) {
7427 // Instead of a shuffle like this:
7428 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7429 // Check if it's possible to issue this instead.
7430 // shuffle (vload ptr), undef, <1, 1, 1, 1>
// Only attempted when this build_vector is the sole user of the splatted
// item.
7431 unsigned Idx = countTrailingZeros(NonZeros);
7432 SDValue Item = Op.getOperand(Idx);
7433 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7434 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7439 // A vector full of immediates; various special cases are already
7440 // handled, so this is best done with a single constant-pool load.
7444 // See if we can use a vector load to get all of the elements.
7445 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7446 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7447 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7451 // For AVX-length vectors, build the individual 128-bit pieces and use
7452 // shuffles to put them in place.
7453 if (VT.is256BitVector() || VT.is512BitVector()) {
7454 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
// HVT has the same element type as VT and half as many elements.
7456 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7458 // Build both the lower and upper subvector.
7460 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7461 SDValue Upper = DAG.getBuildVector(
7462 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7464 // Recreate the wider vector with the lower and upper part.
7465 if (VT.is256BitVector())
7466 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7467 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7470 // Let legalizer expand 2-wide build_vectors.
7471 if (EVTBits == 64) {
7472 if (NumNonZero == 1) {
7473 // One half is zero or undef.
7474 unsigned Idx = countTrailingZeros(NonZeros);
7475 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7476 Op.getOperand(Idx));
7477 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7482 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7483 if (EVTBits == 8 && NumElems == 16)
7484 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7485 DAG, Subtarget, *this))
7488 if (EVTBits == 16 && NumElems == 8)
7489 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7490 DAG, Subtarget, *this))
7493 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7494 if (EVTBits == 32 && NumElems == 4)
7495 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
7498 // If element VT is == 32 bits, turn it into a number of shuffles.
7499 if (NumElems == 4 && NumZero > 0) {
// Step 1: give every element its own vector -- a zero vector when its bit in
// NonZeros is clear, otherwise a SCALAR_TO_VECTOR of the operand.
7500 SmallVector<SDValue, 8> Ops(NumElems);
7501 for (unsigned i = 0; i < 4; ++i) {
7502 bool isZero = !(NonZeros & (1ULL << i));
7504 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7506 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
// Step 2: merge each pair (2i, 2i+1) into Ops[i], dispatching on the pair's
// two NonZeros bits.
7509 for (unsigned i = 0; i < 2; ++i) {
7510 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7513 Ops[i] = Ops[i*2]; // Must be a zero vector.
7516 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7519 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7522 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
// Step 3: combine the two pair vectors with one shuffle. A pair is flagged
// "reversed" when only its second element has its NonZeros bit set (2-bit
// field == 2); the mask entries below swap order for such a pair.
7527 bool Reverse1 = (NonZeros & 0x3) == 2;
7528 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7532 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7533 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7535 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7538 if (Values.size() > 1 && VT.is128BitVector()) {
7539 // Check for a build vector from mostly shuffle plus few inserting.
7540 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7543 // For SSE 4.1, use insertps to put the high elements into the low element.
7544 if (Subtarget.hasSSE41()) {
// Seed the result from element 0 (or UNDEF when element 0 is undef), then
// insert every remaining defined element at its own index.
7546 if (!Op.getOperand(0).isUndef())
7547 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7549 Result = DAG.getUNDEF(VT);
7551 for (unsigned i = 1; i < NumElems; ++i) {
7552 if (Op.getOperand(i).isUndef()) continue;
7553 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7554 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7559 // Otherwise, expand into a number of unpckl*, start by extending each of
7560 // our (non-undef) elements to the full vector width with the element in the
7561 // bottom slot of the vector (which generates no code for SSE).
7562 SmallVector<SDValue, 8> Ops(NumElems);
7563 for (unsigned i = 0; i < NumElems; ++i) {
7564 if (!Op.getOperand(i).isUndef())
7565 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7567 Ops[i] = DAG.getUNDEF(VT);
7570 // Next, we iteratively mix elements, e.g. for v4f32:
7571 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7572 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7573 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
// The first round pairs element i with element i + NumElems/2.
7574 unsigned EltStride = NumElems >> 1;
7575 while (EltStride != 0) {
7576 for (unsigned i = 0; i < EltStride; ++i) {
7577 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7578 // then it is safe to just drop this shuffle: V[i] is already in the
7579 // right place, the one element (since it's the first round) being
7580 // inserted as undef can be dropped. This isn't safe for successive
7581 // rounds because they will permute elements within both vectors.
7582 if (Ops[i+EltStride].isUndef() &&
7583 EltStride == NumElems/2)
7586 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7595 // 256-bit AVX can use the vinsertf128 instruction
7596 // to create 256-bit vectors from two other 128-bit ones.
// A 512-bit result is built either from four operands (concatenated pairwise
// into two half-width vectors first) or directly from two operands.
7597 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7599 MVT ResVT = Op.getSimpleValueType();
7601 assert((ResVT.is256BitVector() ||
7602 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7604 SDValue V1 = Op.getOperand(0);
7605 SDValue V2 = Op.getOperand(1);
7606 unsigned NumElems = ResVT.getVectorNumElements();
// 256-bit result: concatenate the two operands directly.
7607 if (ResVT.is256BitVector())
7608 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
// 512-bit result from four operands: concatenate (V1, V2) and (V3, V4) into
// half-width vectors, then concatenate those two.
7610 if (Op.getNumOperands() == 4) {
7611 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7612 ResVT.getVectorNumElements()/2);
7613 SDValue V3 = Op.getOperand(2);
7614 SDValue V4 = Op.getOperand(3);
7615 return concat256BitVectors(
7616 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7617 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
// 512-bit result from two operands.
7620 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
/// Lower a CONCAT_VECTORS node for the vXi1 case (LowerCONCAT_VECTORS
/// dispatches here when the element type is i1).
///
/// With more than two operands the node is either reduced to a single
/// INSERT_SUBVECTOR (when exactly one operand is defined) or split into two
/// half-width CONCAT_VECTORS nodes. With two operands it is either returned
/// unchanged (result of 16 bits or more) or rewritten with INSERT_SUBVECTOR.
7623 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7624 const X86Subtarget &Subtarget,
7625 SelectionDAG & DAG) {
7627 MVT ResVT = Op.getSimpleValueType();
7628 unsigned NumOfOperands = Op.getNumOperands();
7630 assert(isPowerOf2_32(NumOfOperands) &&
7631 "Unexpected number of operands in CONCAT_VECTORS");
7633 SDValue Undef = DAG.getUNDEF(ResVT);
7634 if (NumOfOperands > 2) {
7635 // Specialize the cases when all, or all but one, of the operands are undef.
7636 unsigned NumOfDefinedOps = 0;
7638 for (unsigned i = 0; i < NumOfOperands; i++)
7639 if (!Op.getOperand(i).isUndef()) {
7643 if (NumOfDefinedOps == 0)
7645 if (NumOfDefinedOps == 1) {
// Exactly one defined operand: insert it into an undef result at the element
// offset of its operand position (SubVecNumElts * OpIdx).
7646 unsigned SubVecNumElts =
7647 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7648 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7649 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7650 Op.getOperand(OpIdx), IdxVal);
// General case: concatenate the first and second half of the operand list
// separately, then concatenate the two half-width results.
7653 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7654 ResVT.getVectorNumElements()/2);
7655 SmallVector<SDValue, 2> Ops;
7656 for (unsigned i = 0; i < NumOfOperands/2; i++)
7657 Ops.push_back(Op.getOperand(i));
7658 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7660 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7661 Ops.push_back(Op.getOperand(i));
7662 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7663 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
// Two-operand case: each operand must supply exactly half of the result's
// elements.
7667 SDValue V1 = Op.getOperand(0);
7668 SDValue V2 = Op.getOperand(1);
7669 unsigned NumElems = ResVT.getVectorNumElements();
7670 assert(V1.getValueType() == V2.getValueType() &&
7671 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7672 "Unexpected operands in CONCAT_VECTORS");
7674 if (ResVT.getSizeInBits() >= 16)
7675 return Op; // The operation is legal with KUNPCK
// Smaller results are assembled with INSERT_SUBVECTOR: V1 goes at element 0
// and V2 at element NumElems/2, into an undef, zero, or partially built
// vector. NOTE(review): the conditions choosing between these forms
// (presumably based on IsZeroV1/IsZeroV2 and undef operands) are not visible
// here -- confirm.
7677 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7678 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7679 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7680 if (IsZeroV1 && IsZeroV2)
7683 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7685 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7687 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7689 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7691 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7694 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
// Both halves present: widen V1 into the low half of an undef result, then
// insert V2 into the upper half.
7696 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7697 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
/// Lower a CONCAT_VECTORS node: i1-element vectors go to
/// LowerCONCAT_VECTORSvXi1, everything else to LowerAVXCONCAT_VECTORS.
7700 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7701 const X86Subtarget &Subtarget,
7702 SelectionDAG &DAG) {
7703 MVT VT = Op.getSimpleValueType();
7704 if (VT.getVectorElementType() == MVT::i1)
7705 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
// Only two shapes are expected here: a 256-bit result from two operands, or a
// 512-bit result from two or four operands.
7707 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7708 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7709 Op.getNumOperands() == 4)));
7711 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7712 // from two other 128-bit ones.
7714 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7715 return LowerAVXCONCAT_VECTORS(Op, DAG);
7718 //===----------------------------------------------------------------------===//
7719 // Vector shuffle lowering
7721 // This is an experimental code path for lowering vector shuffles on x86. It is
7722 // designed to handle arbitrary vector shuffles and blends, gracefully
7723 // degrading performance as necessary. It works hard to recognize idiomatic
7724 // shuffles and lower them to optimal instruction patterns while keeping
7725 // a framework that allows reasonably efficient handling of all vector shuffle patterns.
7727 //===----------------------------------------------------------------------===//
7729 /// \brief Tiny helper function to identify a no-op mask.
7731 /// This is a somewhat boring predicate function. It checks whether the mask
7732 /// array input, which is assumed to be a single-input shuffle mask of the kind
7733 /// used by the X86 shuffle instructions (not a fully general
7734 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7735 /// in-place shuffle are 'no-op's.
///
/// \p Mask entries must be -1 (undef) or a non-negative element index.
7736 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7737 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7738 assert(Mask[i] >= -1 && "Out of bound mask element!");
// A defined entry that does not select its own position means a real shuffle
// is required.
7739 if (Mask[i] >= 0 && Mask[i] != i)
7745 /// \brief Test whether there are elements crossing 128-bit lanes in this
7748 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7749 /// and we routinely test for these.
///
/// Negative mask entries are ignored by the check.
7750 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
// LaneSize is the number of elements of VT that fit in one 128-bit lane.
7751 int LaneSize = 128 / VT.getScalarSizeInBits();
7752 int Size = Mask.size();
7753 for (int i = 0; i < Size; ++i)
// Mask[i] % Size folds second-input indices onto [0, Size) before the source
// lane is compared with the destination lane i / LaneSize.
7754 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7759 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7761 /// This checks a shuffle mask to see if it is performing the same
7762 /// lane-relative shuffle in each sub-lane. This trivially implies
7763 /// that it is also not lane-crossing. It may however involve a blend from the
7764 /// same lane of a second vector.
7766 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7767 /// non-trivial to compute in the face of undef lanes. The representation is
7768 /// suitable for use with existing 128-bit shuffles as entries from the second
7769 /// vector have been remapped to [LaneSize, 2*LaneSize).
///
/// \p LaneSizeInBits is the width of one sub-lane; the number of elements per
/// lane is derived from it and the scalar size of \p VT.
7770 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7772 SmallVectorImpl<int> &RepeatedMask) {
7773 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
// Every slot of the per-lane mask starts out as -1 (nothing recorded yet).
7774 RepeatedMask.assign(LaneSize, -1);
7775 int Size = Mask.size();
7776 for (int i = 0; i < Size; ++i) {
7777 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
7780 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7781 // This entry crosses lanes, so there is no way to model this shuffle.
7784 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7785 // Adjust second vector indices to start at LaneSize instead of Size.
7786 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7787 : Mask[i] % LaneSize + LaneSize;
7788 if (RepeatedMask[i % LaneSize] < 0)
7789 // This is the first non-undef entry in this slot of a lane.
7790 RepeatedMask[i % LaneSize] = LocalM;
7791 else if (RepeatedMask[i % LaneSize] != LocalM)
7792 // Found a mismatch with the repeated mask.
7798 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
/// Thin wrapper around isRepeatedShuffleMask with a lane width of 128 bits;
/// \p RepeatedMask receives the per-lane mask.
7800 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7801 SmallVectorImpl<int> &RepeatedMask) {
7802 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7805 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
/// Thin wrapper around isRepeatedShuffleMask with a lane width of 256 bits;
/// \p RepeatedMask receives the per-lane mask.
7807 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7808 SmallVectorImpl<int> &RepeatedMask) {
7809 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
7812 /// Test whether a target shuffle mask is equivalent within each sub-lane.
7813 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
///
/// \p RepeatedMask receives the per-lane mask; a slot may hold
/// SM_SentinelUndef, SM_SentinelZero, or a lane-relative index in which
/// second-vector entries are offset by LaneSize.
7814 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
7816 SmallVectorImpl<int> &RepeatedMask) {
7817 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7818 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
7819 int Size = Mask.size();
7820 for (int i = 0; i < Size; ++i) {
7821 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
7822 if (Mask[i] == SM_SentinelUndef)
7824 if (Mask[i] == SM_SentinelZero) {
// A zero entry is only compatible with a slot that is still undef or already
// zero; in that case the slot is recorded as zero.
7825 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
7827 RepeatedMask[i % LaneSize] = SM_SentinelZero;
7830 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7831 // This entry crosses lanes, so there is no way to model this shuffle.
7834 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7835 // Adjust second vector indices to start at LaneSize instead of Size.
7837 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
7838 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
7839 // This is the first non-undef entry in this slot of a lane.
7840 RepeatedMask[i % LaneSize] = LocalM;
7841 else if (RepeatedMask[i % LaneSize] != LocalM)
7842 // Found a mismatch with the repeated mask.
7848 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7851 /// This is a fast way to test a shuffle mask against a fixed pattern:
7853 /// if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
7855 /// It returns true if the mask is exactly as wide as the argument list, and
7856 /// each element of the mask is either -1 (signifying undef) or the value given
7857 /// in the argument.
///
/// A mask element that differs from the expected one is still tolerated when
/// the inputs involved are build vectors and both indices refer to the same
/// build_vector operand.
7858 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7859 ArrayRef<int> ExpectedMask) {
7860 if (Mask.size() != ExpectedMask.size())
7863 int Size = Mask.size();
7865 // If the values are build vectors, we can look through them to find
7866 // equivalent inputs that make the shuffles equivalent.
7867 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7868 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7870 for (int i = 0; i < Size; ++i) {
7871 assert(Mask[i] >= -1 && "Out of bound mask element!");
7872 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
// Indices below Size refer to V1, the rest to V2. The mismatch is a failure
// unless both sides resolve to build vectors whose selected operands are
// identical.
7873 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7874 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7875 if (!MaskBV || !ExpectedBV ||
7876 MaskBV->getOperand(Mask[i] % Size) !=
7877 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7885 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7887 /// The masks must be exactly the same width.
7889 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7890 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7892 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
     // NOTE(review): this listing omits some short source lines; the statements
     // controlled by each condition below, the final return and the closing
     // brace are not visible here.
7893 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7894 ArrayRef<int> ExpectedMask) {
7895 int Size = Mask.size();
7896 if (Size != (int)ExpectedMask.size())
     // Per element, in order: undef entry; a negative entry that is not
     // SM_SentinelZero; an entry that differs from the expected value.
7899 for (int i = 0; i < Size; ++i)
7900 if (Mask[i] == SM_SentinelUndef)
7902 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7904 else if (Mask[i] != ExpectedMask[i])
7910 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7912 /// This helper function produces an 8-bit shuffle immediate corresponding to
7913 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7914 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
7917 /// NB: We rely heavily on "undef" masks preserving the input lane.
     ///
     /// \p Mask must have exactly four entries, each in the range [-1, 3]. Entry
     /// k contributes a 2-bit field at bit position 2*k of the immediate; a
     /// negative (undef) entry k is encoded as k itself, i.e. the identity lane.
     // NOTE(review): this listing omits some short source lines; the declaration
     // of Imm, the return statement and the closing brace are not visible here.
7918 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7919 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7920 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
7921 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
7922 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
7923 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
7926 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
7927 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
7928 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
7929 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
7933 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
7934 SelectionDAG &DAG) {
     // Encode Mask with getV4X86ShuffleImm and wrap the result in an MVT::i8
     // constant node created at DL.
     // NOTE(review): the closing brace is not visible in this listing.
7935 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
7938 /// \brief Compute whether each element of a shuffle is zeroable.
7940 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
7941 /// Either it is an undef element in the shuffle mask, the element of the input
7942 /// referenced is undef, or the element of the input referenced is known to be
7943 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
7944 /// as many lanes with this technique as possible to simplify the remaining
     ///
     /// The result has one bit per entry of \p Mask. Both inputs are inspected
     /// after looking through bitcasts, so the BUILD_VECTOR that is examined may
     /// have a different element count than the mask; the two scaling branches
     /// below handle the fewer-operands and more-operands cases.
     // NOTE(review): this listing omits some short source lines (the end of the
     // comment above, the declaration of M, the statements of the "easy cases"
     // branch, several `continue`s, the final return and closing braces).
7946 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
7947 SDValue V1, SDValue V2) {
7948 SmallBitVector Zeroable(Mask.size(), false);
7949 V1 = peekThroughBitcasts(V1);
7950 V2 = peekThroughBitcasts(V2);
7952 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
7953 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
     // Width in bits of one shuffle element, derived from the total width of V1
     // and the number of mask entries.
7955 int VectorSizeInBits = V1.getValueSizeInBits();
7956 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
7957 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
7959 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7961 // Handle the easy cases.
     // NOTE(review): the `M >= 0 &&` term is redundant here because the
     // preceding `M < 0 ||` alternative already covers negative M.
7962 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
7967 // Determine shuffle input and normalize the mask.
7968 SDValue V = M < Size ? V1 : V2;
7971 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
7972 if (V.getOpcode() != ISD::BUILD_VECTOR)
7975 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
7976 // the (larger) source element must be UNDEF/ZERO.
7977 if ((Size % V.getNumOperands()) == 0) {
     // Scale shuffle elements map onto each BUILD_VECTOR operand.
7978 int Scale = Size / V->getNumOperands();
7979 SDValue Op = V.getOperand(M / Scale);
7980 if (Op.isUndef() || X86::isZeroNode(Op))
7982 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
     // Shift the sub-element selected by (M % Scale) down to bit 0, keep only
     // ScalarSizeInBits bits of it, and test that slice for zero.
7983 APInt Val = Cst->getAPIntValue();
7984 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7985 Val = Val.getLoBits(ScalarSizeInBits);
7986 Zeroable[i] = (Val == 0);
7987 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
     // Same slice test as above, applied to the bit pattern of the FP constant.
7988 APInt Val = Cst->getValueAPF().bitcastToAPInt();
7989 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
7990 Val = Val.getLoBits(ScalarSizeInBits);
7991 Zeroable[i] = (Val == 0);
7996 // If the BUILD_VECTOR has more elements then all the (smaller) source
7997 // elements must be UNDEF or ZERO.
7998 if ((V.getNumOperands() % Size) == 0) {
     // Scale BUILD_VECTOR operands make up each shuffle element.
7999 int Scale = V->getNumOperands() / Size;
8000 bool AllZeroable = true;
8001 for (int j = 0; j < Scale; ++j) {
8002 SDValue Op = V.getOperand((M * Scale) + j);
8003 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8005 Zeroable[i] = AllZeroable;
8013 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
     ///
     /// Builds a per-byte control vector (one i8 entry for each byte of \p VT)
     /// and emits X86ISD::PSHUFB on the chosen input bitcast to an i8 vector; the
     /// result is bitcast back to \p VT. Zeroable elements are encoded with the
     /// 0x80 control byte and undef elements with an undef control byte.
     // NOTE(review): this listing omits some short source lines (one parameter
     // line, the declaration of V, the conditions guarding the undef case and the
     // single-input check, early returns, `continue`s and closing braces).
8014 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8015 ArrayRef<int> Mask, SDValue V1,
8017 const SmallBitVector &Zeroable,
8018 const X86Subtarget &Subtarget,
8019 SelectionDAG &DAG) {
8020 int Size = Mask.size();
     // Elements per 128-bit lane, total bytes in VT, and bytes per element.
8021 int LaneSize = 128 / VT.getScalarSizeInBits();
8022 const int NumBytes = VT.getSizeInBits() / 8;
8023 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
     // Required subtarget feature for each supported vector width.
8025 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8026 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8027 (Subtarget.hasBWI() && VT.is512BitVector()));
8029 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8030 // Sign bit set in i8 mask means zero element.
8031 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
     // Walk the result byte by byte; i / NumEltBytes is the element that byte
     // belongs to.
8034 for (int i = 0; i < NumBytes; ++i) {
8035 int M = Mask[i / NumEltBytes];
8037 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8040 if (Zeroable[i / NumEltBytes]) {
8041 PSHUFBMask[i] = ZeroMask;
8045 // We can only use a single input of V1 or V2.
8046 SDValue SrcV = (M >= Size ? V2 : V1);
8052 // PSHUFB can't cross lanes, ensure this doesn't happen.
8053 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
     // Convert the element index into a byte index: first byte of element M
     // plus this byte's offset within its element.
8057 M = M * NumEltBytes + (i % NumEltBytes);
8058 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8060 assert(V && "Failed to find a source input");
8062 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8063 return DAG.getBitcast(
8064 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8065 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8068 // X86 has dedicated unpack instructions that can handle specific blend
8069 // operations: UNPCKH and UNPCKL.
     //
     // Generates the binary (two-input) low and high unpack masks for VT with
     // createUnpackShuffleMask and compares each against Mask via
     // isShuffleEquivalent. A match emits X86ISD::UNPCKL / X86ISD::UNPCKH on
     // (V1, V2). If neither matches, the reference masks are commuted and the
     // comparison is repeated; a match then emits the node on (V2, V1).
     // NOTE(review): this listing omits some short source lines; the fall-through
     // return and the closing brace are not visible here.
8070 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8071 ArrayRef<int> Mask, SDValue V1,
8072 SDValue V2, SelectionDAG &DAG) {
8073 SmallVector<int, 8> Unpckl;
8074 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8075 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8076 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8078 SmallVector<int, 8> Unpckh;
8079 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8080 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8081 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8083 // Commute and try again.
8084 ShuffleVectorSDNode::commuteMask(Unpckl);
8085 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8086 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8088 ShuffleVectorSDNode::commuteMask(Unpckh);
8089 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8090 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8095 /// \brief Try to emit a bitmask instruction for a shuffle.
8097 /// This handles cases where we can model a blend exactly as a bitmask due to
8098 /// one of the inputs being zeroable.
     ///
     /// On success the result is an ISD::AND of the single pass-through input
     /// with a constant vector whose elements are all-ones where the input is
     /// kept and zero elsewhere. An empty SDValue is returned when the mask is
     /// not an in-place blend, when both inputs would have to pass through, or
     /// when no element passes through at all. Only non-floating-point \p VT is
     /// accepted (asserted).
     // NOTE(review): this listing omits some short source lines (the declarations
     // of AllOnes and V, the Zeroable check inside the loop, the conditions that
     // guard the assignments to V and the last early return, closing braces).
8099 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8100 SDValue V2, ArrayRef<int> Mask,
8101 const SmallBitVector &Zeroable,
8102 SelectionDAG &DAG) {
8103 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8104 MVT EltVT = VT.getVectorElementType();
8105 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8107 DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
     // Start with an all-zero AND mask; pass-through lanes are set below.
8108 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8110 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     // A blend keeps every element in its own position (index i of either input).
8113 if (Mask[i] % Size != i)
8114 return SDValue(); // Not a blend.
8116 V = Mask[i] < Size ? V1 : V2;
8117 else if (V != (Mask[i] < Size ? V1 : V2))
8118 return SDValue(); // Can only let one input through the mask.
8120 VMaskOps[i] = AllOnes;
8123 return SDValue(); // No non-zeroable elements!
8125 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8126 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8129 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8131 /// This is used as a fallback approach when first class blend instructions are
8132 /// unavailable. Currently it is only suitable for integer vectors, but could
8133 /// be generalized for floating point vectors if desirable.
     ///
     /// Builds a constant per-element mask (all-ones where the element is taken
     /// from \p V1, zero where it is taken from \p V2), ANDs \p V1 with it,
     /// combines the mask with \p V2 through X86ISD::ANDNP on an i64 vector type,
     /// and ORs the two halves. Returns an empty SDValue if any element is moved
     /// out of its own position.
     // NOTE(review): this listing omits some short source lines (the tail of the
     // AllOnes initializer and closing braces).
8134 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8135 SDValue V2, ArrayRef<int> Mask,
8136 SelectionDAG &DAG) {
8137 assert(VT.isInteger() && "Only supports integer vector types!");
8138 MVT EltVT = VT.getVectorElementType();
8139 int NumEltBits = EltVT.getSizeInBits();
8140 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8141 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
8143 SmallVector<SDValue, 16> MaskOps;
8144 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     // Only undef, V1[i] (index i) or V2[i] (index i + Size) are acceptable.
8145 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8146 return SDValue(); // Shuffled input!
     // Undef entries (negative) also satisfy `< Size`, so they select V1.
8147 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8150 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8151 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8152 // We have to cast V2 around.
     // NOTE(review): presumably ANDNP yields (~V1Mask & V2), which would keep
     // exactly the V2 lanes -- confirm against the X86ISD::ANDNP definition.
8153 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8154 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8155 DAG.getBitcast(MaskVT, V1Mask),
8156 DAG.getBitcast(MaskVT, V2)));
8157 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8160 /// \brief Try to emit a blend instruction for a shuffle.
8162 /// This doesn't do any checks for the availability of instructions for blending
8163 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8164 /// be matched in the backend with the type given. What it does check for is
8165 /// that the shuffle mask is a blend, or convertible into a blend with zero.
     ///
     /// The function first derives a per-element bit mask (bit i set when element
     /// i comes from the second input), then dispatches on \p VT to pick the
     /// concrete form: a direct BLENDI, a BLENDI on a bitcast dword or word
     /// vector with a scaled mask, or a byte-wise VSELECT.
     // NOTE(review): this listing omits many short source lines in this function
     // (the declaration of M, several loop branches, the conditions guarding the
     // zero-vector replacement, every `case` label of the switch, fall-through
     // markers, a few returns and closing braces). Comments below describe only
     // what is visible.
8166 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8167 SDValue V2, ArrayRef<int> Original,
8168 const SmallBitVector &Zeroable,
8169 const X86Subtarget &Subtarget,
8170 SelectionDAG &DAG) {
8171 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8172 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
     // Work on a private copy of the caller's mask.
8173 SmallVector<int, 8> Mask(Original.begin(), Original.end());
8174 bool ForceV1Zero = false, ForceV2Zero = false;
8176 // Attempt to generate the binary blend mask. If an input is zero then
8177 // we can use any lane.
8178 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8179 unsigned BlendMask = 0;
8180 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     // Element i taken from the second input sets bit i of the blend mask.
8186 if (M == i + Size) {
8187 BlendMask |= 1u << i;
8198 BlendMask |= 1u << i;
8203 return SDValue(); // Shuffled input!
8206 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8208 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8210 V2 = getZeroVector(VT, Subtarget, DAG, DL);
     // Widens a blend mask: every set bit i of the first Size bits becomes a run
     // of Scale set bits starting at bit i * Scale.
8212 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
8213 unsigned ScaledMask = 0;
8214 for (int i = 0; i != Size; ++i)
8215 if (BlendMask & (1u << i))
8216 for (int j = 0; j != Scale; ++j)
8217 ScaledMask |= 1u << (i * Scale + j);
8221 switch (VT.SimpleTy) {
     // Direct blend on VT with the unscaled mask as an i8 immediate.
8226 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8227 DAG.getConstant(BlendMask, DL, MVT::i8));
8231 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8235 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8236 // that instruction.
8237 if (Subtarget.hasAVX2()) {
8238 // Scale the blend by the number of 32-bit dwords per element.
8239 int Scale = VT.getScalarSizeInBits() / 32;
8240 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
     // Blend as dwords: v8i32 for vectors wider than 128 bits, v4i32 otherwise.
8241 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8242 V1 = DAG.getBitcast(BlendVT, V1);
8243 V2 = DAG.getBitcast(BlendVT, V2);
8244 return DAG.getBitcast(
8245 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8246 DAG.getConstant(BlendMask, DL, MVT::i8)));
8250 // For integer shuffles we need to expand the mask and cast the inputs to
8251 // v8i16s prior to blending.
     // Scale is the number of 16-bit words per element of VT.
8252 int Scale = 8 / VT.getVectorNumElements();
8253 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8254 V1 = DAG.getBitcast(MVT::v8i16, V1);
8255 V2 = DAG.getBitcast(MVT::v8i16, V2);
8256 return DAG.getBitcast(VT,
8257 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8258 DAG.getConstant(BlendMask, DL, MVT::i8)));
8262 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8263 SmallVector<int, 8> RepeatedMask;
8264 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8265 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8266 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
     // In the repeated mask, indices >= 8 denote the second input.
8268 for (int i = 0; i < 8; ++i)
8269 if (RepeatedMask[i] >= 8)
8270 BlendMask |= 1u << i;
8271 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8272 DAG.getConstant(BlendMask, DL, MVT::i8));
8278 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8279 "256-bit byte-blends require AVX2 support!");
8281 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8282 if (SDValue Masked =
8283 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8286 // Scale the blend by the number of bytes per element.
8287 int Scale = VT.getScalarSizeInBits() / 8;
8289 // This form of blend is always done on bytes. Compute the byte vector
8291 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8293 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8294 // mix of LLVM's code generator and the x86 backend. We tell the code
8295 // generator that boolean values in the elements of an x86 vector register
8296 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8297 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8298 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8299 // of the element (the remaining are ignored) and 0 in that high bit would
8300 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8301 // the LLVM model for boolean values in vector elements gets the relevant
8302 // bit set, it is set backwards and over constrained relative to x86's
8304 SmallVector<SDValue, 32> VSELECTMask;
     // Emit Scale identical byte selectors per element: undef for undef mask
     // entries, -1 when the element comes from V1, 0 when it comes from V2.
8305 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8306 for (int j = 0; j < Scale; ++j)
8307 VSELECTMask.push_back(
8308 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8309 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8312 V1 = DAG.getBitcast(BlendVT, V1);
8313 V2 = DAG.getBitcast(BlendVT, V2);
8314 return DAG.getBitcast(
8315 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8316 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8320 llvm_unreachable("Not a supported integer vector type!");
8324 /// \brief Try to lower as a blend of elements from two inputs followed by
8325 /// a single-input permutation.
8327 /// This matches the pattern where we can blend elements from two inputs and
8328 /// then reduce the shuffle to a single-input permutation.
     ///
     /// Two masks are built in parallel: BlendMask records, per source position
     /// (index modulo the vector size), which input supplies it; PermuteMask
     /// records where each result element is read from in the blended vector.
     /// The lowering fails when one source position is needed from both inputs.
     // NOTE(review): this listing omits some short source lines (one parameter
     // line, the undef-entry check at the top of the loop and closing braces).
8329 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8330 SDValue V1, SDValue V2,
8332 SelectionDAG &DAG) {
8333 // We build up the blend mask while checking whether a blend is a viable way
8334 // to reduce the shuffle.
8335 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8336 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8338 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8342 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
     // Claim source position (Mask[i] % Size) for the input Mask[i] refers to,
     // or verify that an earlier claim used the very same index.
8344 if (BlendMask[Mask[i] % Size] < 0)
8345 BlendMask[Mask[i] % Size] = Mask[i];
8346 else if (BlendMask[Mask[i] % Size] != Mask[i])
8347 return SDValue(); // Can't blend in the needed input!
8349 PermuteMask[i] = Mask[i] % Size;
     // Blend first, then permute the blended value as a single-input shuffle.
8352 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8353 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8356 /// \brief Generic routine to decompose a shuffle and blend into independent
8357 /// blends and permutes.
8359 /// This matches the extremely common pattern for handling combined
8360 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8361 /// operations. It will try to pick the best arrangement of shuffles and
     ///
     /// The mask is split into a single-input shuffle of each operand (V1Mask,
     /// V2Mask) plus a position-preserving blend (BlendMask) of the two shuffled
     /// values. When neither single-input shuffle is a no-op, the blend-then-
     /// permute form from lowerVectorShuffleAsBlendAndPermute is tried first.
     // NOTE(review): this listing omits some short source lines (the end of the
     // comment above, the parameter lines, the BlendMask assignment in the V1
     // branch, the return of BlendPerm and closing braces).
8363 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8367 SelectionDAG &DAG) {
8368 // Shuffle the input elements into the desired positions in V1 and V2 and
8369 // blend them together.
8370 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8371 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8372 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8373 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8374 if (Mask[i] >= 0 && Mask[i] < Size) {
8375 V1Mask[i] = Mask[i];
8377 } else if (Mask[i] >= Size) {
     // Rebase the index so it addresses V2 as a single-input shuffle, and take
     // position i from the second blend operand.
8378 V2Mask[i] = Mask[i] - Size;
8379 BlendMask[i] = i + Size;
8382 // Try to lower with the simpler initial blend strategy unless one of the
8383 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8384 // shuffle may be able to fold with a load or other benefit. However, when
8385 // we'll have to do 2x as many shuffles in order to achieve this, blending
8386 // first is a better strategy.
8387 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8388 if (SDValue BlendPerm =
8389 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8392 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8393 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8394 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8397 /// \brief Try to lower a vector shuffle as a rotation.
8399 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
     ///
     /// Scans \p Mask for a single consistent rotation amount and for which
     /// input supplies the low part and which the high part of the rotated
     /// result. \p V1 and \p V2 are taken by reference.
     // NOTE(review): this listing omits some short source lines (the declarations
     // of Rotation, Lo, Hi and M, the undef/identity early exits, the failure
     // returns, the assignment through TargetV, and everything after the final
     // asserts including the normalisation of V1/V2 and the return). The exact
     // failure value and the final writes to V1/V2 cannot be confirmed here.
8400 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8401 ArrayRef<int> Mask) {
8402 int NumElts = Mask.size();
8404 // We need to detect various ways of spelling a rotation:
8405 // [11, 12, 13, 14, 15, 0, 1, 2]
8406 // [-1, 12, 13, 14, -1, -1, 1, -1]
8407 // [-1, -1, -1, -1, -1, -1, 1, 2]
8408 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8409 // [-1, 4, 5, 6, -1, -1, 9, -1]
8410 // [-1, 4, 5, 6, -1, -1, -1, -1]
8413 for (int i = 0; i < NumElts; ++i) {
8415 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8416 "Unexpected mask index.");
8420 // Determine where a rotated vector would have started.
8421 int StartIdx = i - (M % NumElts);
8423 // The identity rotation isn't interesting, stop.
8426 // If we found the tail of a vector the rotation must be the missing
8427 // front. If we found the head of a vector, it must be how much of the
8429 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
     // The first candidate fixes the rotation; later ones must agree with it.
8432 Rotation = CandidateRotation;
8433 else if (Rotation != CandidateRotation)
8434 // The rotations don't match, so we can't match this mask.
8437 // Compute which value this mask is pointing at.
8438 SDValue MaskV = M < NumElts ? V1 : V2;
8440 // Compute which of the two target values this index should be assigned
8441 // to. This reflects whether the high elements are remaining or the low
8442 // elements are remaining.
8443 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8445 // Either set up this value if we've not encountered it before, or check
8446 // that it remains consistent.
8449 else if (TargetV != MaskV)
8450 // This may be a rotation, but it pulls from the inputs in some
8451 // unsupported interleaving.
8455 // Check that we successfully analyzed the mask, and normalize the results.
8456 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8457 assert((Lo || Hi) && "Failed to find a rotated input vector!");
8469 /// \brief Try to lower a vector shuffle as a byte rotation.
8471 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8472 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8473 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8474 /// try to generically lower a vector shuffle through such a pattern. It
8475 /// does not check for the profitability of lowering either as PALIGNR or
8476 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8477 /// This matches shuffle vectors that look like:
8479 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8481 /// Essentially it concatenates V1 and V2, shifts right by some number of
8482 /// elements, and takes the low elements as the result. Note that while this is
8483 /// specified as a *right shift* because x86 is little-endian, it is a *left
8484 /// rotate* of the vector lanes.
     ///
     /// On a successful match the returned value is the element rotation found
     /// by matchVectorShuffleAsRotate on the 128-bit-lane repeated mask,
     /// multiplied by the number of bytes per element.
     // NOTE(review): this listing omits some short source lines (the early
     // returns for the zero-sentinel, non-repeated and failed-rotation cases and
     // the closing brace); the exact failure value is not visible here.
8485 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8486 ArrayRef<int> Mask) {
8487 // Don't accept any shuffles with zero elements.
8488 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8491 // PALIGNR works on 128-bit lanes.
8492 SmallVector<int, 16> RepeatedMask;
8493 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8496 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8500 // PALIGNR rotates bytes, so we need to scale the
8501 // rotation based on how many bytes are in the vector lane.
     // A 128-bit lane holds 16 bytes, so Scale is the byte width of one element.
8502 int NumElts = RepeatedMask.size();
8503 int Scale = 16 / NumElts;
8504 return Rotation * Scale;
     // Lower a shuffle that matchVectorShuffleAsByteRotate recognises as a byte
     // rotation. With SSSE3 a single X86ISD::PALIGNR on the byte-vector type is
     // emitted; otherwise (128-bit vectors only, asserted) the rotation is built
     // from a VSHLDQ of the low input, a VSRLDQ of the high input and an OR.
     // NOTE(review): this listing omits some short source lines (one parameter
     // line, the early return for a non-positive rotation, part of a comment and
     // closing braces).
8507 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8508 SDValue V1, SDValue V2,
8510 const X86Subtarget &Subtarget,
8511 SelectionDAG &DAG) {
8512 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
     // Lo/Hi are passed by reference to the matcher, which may update them.
8514 SDValue Lo = V1, Hi = V2;
8515 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8516 if (ByteRotation <= 0)
8519 // Cast the inputs to i8 vector of correct length to match PALIGNR or
8521 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8522 Lo = DAG.getBitcast(ByteVT, Lo);
8523 Hi = DAG.getBitcast(ByteVT, Hi);
8525 // SSSE3 targets can use the palignr instruction.
8526 if (Subtarget.hasSSSE3()) {
8527 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8528 "512-bit PALIGNR requires BWI instructions");
8529 return DAG.getBitcast(
8530 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
8531 DAG.getConstant(ByteRotation, DL, MVT::i8)));
8534 assert(VT.is128BitVector() &&
8535 "Rotate-based lowering only supports 128-bit lowering!");
8536 assert(Mask.size() <= 16 &&
8537 "Can shuffle at most 16 bytes in a 128-bit vector!");
8538 assert(ByteVT == MVT::v16i8 &&
8539 "SSE2 rotate lowering only needed for v16i8!");
8541 // Default SSE2 implementation
     // The two shift amounts add up to the 16 bytes of the vector.
8542 int LoByteShift = 16 - ByteRotation;
8543 int HiByteShift = ByteRotation;
8545 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
8546 DAG.getConstant(LoByteShift, DL, MVT::i8));
8547 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
8548 DAG.getConstant(HiByteShift, DL, MVT::i8));
8549 return DAG.getBitcast(VT,
8550 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
8553 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
8555 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
8556 /// rotation of the concatenation of two vectors; this routine will
8557 /// try to generically lower a vector shuffle through such a pattern.
8559 /// Essentially it concatenates V1 and V2, shifts right by some number of
8560 /// elements, and takes the low elements as the result. Note that while this is
8561 /// specified as a *right shift* because x86 is little-endian, it is a *left
8562 /// rotate* of the vector lanes.
     ///
     /// Only i32 and i64 element types are accepted (asserted), and 128/256-bit
     /// vectors additionally require VLX (asserted). On a match the result is an
     /// X86ISD::VALIGN node whose immediate is the element rotation.
     // NOTE(review): this listing omits some short source lines (one parameter
     // line, the early return taken when no rotation is matched and the closing
     // brace).
8563 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
8564 SDValue V1, SDValue V2,
8566 const X86Subtarget &Subtarget,
8567 SelectionDAG &DAG) {
8568 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
8569 "Only 32-bit and 64-bit elements are supported!");
8571 // 128/256-bit vectors are only supported with VLX.
8572 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
8573 && "VLX required for 128/256-bit vectors");
     // Lo/Hi are passed by reference to the matcher, which may update them.
8575 SDValue Lo = V1, Hi = V2;
8576 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
8580 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
8581 DAG.getConstant(Rotation, DL, MVT::i8));
8584 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
8586 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
8587 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
8588 /// matches elements from one of the input vectors shuffled to the left or
8589 /// right with zeroable elements 'shifted in'. It handles both the strictly
8590 /// bit-wise element shifts and the byte shift across an entire 128-bit double
8593 /// PSLL : (little-endian) left bit shift.
8594 /// [ zz, 0, zz, 2 ]
8595 /// [ -1, 4, zz, -1 ]
8596 /// PSRL : (little-endian) right bit shift.
8598 /// [ -1, -1, 7, zz]
8599 /// PSLLDQ : (little-endian) left byte shift
8600 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
8601 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
8602 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
8603 /// PSRLDQ : (little-endian) right byte shift
8604 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
8605 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
8606 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
     ///
     /// \p ShiftVT and \p Opcode are written through their references when a
     /// shift is matched; the function's return value is the shift amount (bits
     /// for element shifts, bytes for whole-lane byte shifts). \p MaskOffset is
     /// added to the expected source indices, which lets the caller match
     /// against either shuffle input.
     // NOTE(review): this listing omits some short source lines (parts of the
     // comment above, the return statements of both lambdas, the handling of a
     // successful match in the search loop, the failure return and closing
     // braces); the exact failure value is not visible here.
8607 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
8608 unsigned ScalarSizeInBits,
8609 ArrayRef<int> Mask, int MaskOffset,
8610 const SmallBitVector &Zeroable,
8611 const X86Subtarget &Subtarget) {
8612 int Size = Mask.size();
8613 unsigned SizeInBits = Size * ScalarSizeInBits;
     // For each group of Scale elements, inspects the Shift positions that the
     // shift would fill: the first Shift positions of the group for a left
     // shift, the last Shift positions for a right shift.
8615 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
8616 for (int i = 0; i < Size; i += Scale)
8617 for (int j = 0; j < Shift; ++j)
8618 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
     // For each group, checks that the (Scale - Shift) surviving positions hold
     // sequential (or undef) source indices, then fills in Opcode and ShiftVT.
8624 auto MatchShift = [&](int Shift, int Scale, bool Left) {
8625 for (int i = 0; i != Size; i += Scale) {
8626 unsigned Pos = Left ? i + Shift : i;
8627 unsigned Low = Left ? i : i + Shift;
8628 unsigned Len = Scale - Shift;
8629 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
     // Groups wider than 64 bits are handled as whole-lane byte shifts, whose
     // amount is expressed in bytes rather than bits.
8633 int ShiftEltBits = ScalarSizeInBits * Scale;
8634 bool ByteShift = ShiftEltBits > 64;
8635 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
8636 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
8637 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
8639 // Normalize the scale for byte shifts to still produce an i64 element
8641 Scale = ByteShift ? Scale / 2 : Scale;
8643 // We need to round trip through the appropriate type for the shift.
8644 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
8645 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
8646 : MVT::getVectorVT(ShiftSVT, Size / Scale);
8647 return (int)ShiftAmt;
8650 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
8651 // keep doubling the size of the integer elements up to that. We can
8652 // then shift the elements of the integer vector by whole multiples of
8653 // their width within the elements of the larger integer vector. Test each
8654 // multiple to see if we can find a match with the moved element indices
8655 // and that the shifted in elements are all zeroable.
     // 512-bit vectors without BWI are limited to 64-bit groups, which excludes
     // the byte-shift forms.
8656 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
8657 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
8658 for (int Shift = 1; Shift != Scale; ++Shift)
8659 for (bool Left : {true, false})
8660 if (CheckZeros(Shift, Scale, Left)) {
8661 int ShiftAmt = MatchShift(Shift, Scale, Left);
     // Lower a shuffle as a logical shift of one input. The mask is matched
     // against V1 first (MaskOffset 0) and, failing that, against V2 (MaskOffset
     // Size). On a match the chosen input is bitcast to the shift type, shifted
     // with the matched opcode and an i8 shift amount, and bitcast back to VT.
     // NOTE(review): this listing omits some short source lines (the declarations
     // of ShiftVT, Opcode and V, the condition and assignment around the V2
     // retry, the failure return and the closing brace).
8670 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
8671 SDValue V2, ArrayRef<int> Mask,
8672 const SmallBitVector &Zeroable,
8673 const X86Subtarget &Subtarget,
8674 SelectionDAG &DAG) {
8675 int Size = Mask.size();
8676 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8682 // Try to match shuffle against V1 shift.
8683 int ShiftAmt = matchVectorShuffleAsShift(
8684 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
8686 // If V1 failed, try to match shuffle against V2 shift.
8689 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
8690 Mask, Size, Zeroable, Subtarget);
8697 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
8698 "Illegal integer vector type");
8699 V = DAG.getBitcast(ShiftVT, V);
8700 V = DAG.getNode(Opcode, DL, ShiftVT, V,
8701 DAG.getConstant(ShiftAmt, DL, MVT::i8));
8702 return DAG.getBitcast(VT, V);
8705 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
     ///
     /// Both forms only define the lower half of the result, so the upper half
     /// of \p Mask must be entirely undef. EXTRQ is attempted first, then
     /// INSERTQ. Bit lengths and bit indices handed to the nodes are reduced to
     /// their low six bits (`& 0x3f`).
     // NOTE(review): this listing omits many short source lines in this function
     // (declarations of Len, Src, Idx, Base and Insert, most branch bodies, the
     // `continue`/`return` statements inside both lambdas, the returns of ExtrQ
     // and InsertQ, the final return and closing braces). Comments below
     // describe only what is visible.
8706 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
8707 SDValue V2, ArrayRef<int> Mask,
8708 const SmallBitVector &Zeroable,
8709 SelectionDAG &DAG) {
8710 int Size = Mask.size();
8711 int HalfSize = Size / 2;
8712 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8713 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
8715 // Upper half must be undefined.
8716 if (!isUndefInRange(Mask, HalfSize, HalfSize))
8719 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
8720 // Remainder of lower half result is zero and upper half is all undef.
8721 auto LowerAsEXTRQ = [&]() {
8722 // Determine the extraction length from the part of the
8723 // lower half that isn't zeroable.
     // Walk Len downwards until element Len - 1 is not zeroable.
8725 for (; Len > 0; --Len)
8726 if (!Zeroable[Len - 1])
8728 assert(Len > 0 && "Zeroable shuffle mask");
8730 // Attempt to match first Len sequential elements from the lower half.
8733 for (int i = 0; i != Len; ++i) {
8737 SDValue &V = (M < Size ? V1 : V2);
8740 // The extracted elements must start at a valid index and all mask
8741 // elements must be in the lower half.
8742 if (i > M || M >= HalfSize)
     // Either no start index has been chosen yet, or this element is consistent
     // with the source and start index chosen so far.
8745 if (Idx < 0 || (Src == V && Idx == (M - i))) {
8756 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
8757 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8758 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8759 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
8760 DAG.getConstant(BitLen, DL, MVT::i8),
8761 DAG.getConstant(BitIdx, DL, MVT::i8));
8764 if (SDValue ExtrQ = LowerAsEXTRQ())
8767 // INSERTQ: Extract lowest Len elements from lower half of second source and
8768 // insert over first source, starting at Idx.
8769 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
8770 auto LowerAsInsertQ = [&]() {
     // Try every insertion point Idx in the lower half.
8771 for (int Idx = 0; Idx != HalfSize; ++Idx) {
8774 // Attempt to match first source from mask before insertion point.
8775 if (isUndefInRange(Mask, 0, Idx)) {
8777 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
8779 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
8785 // Extend the extraction length looking to match both the insertion of
8786 // the second source and the remaining elements of the first.
8787 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8792 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8794 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8800 // Match the remaining elements of the lower half.
8801 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8803 } else if ((!Base || (Base == V1)) &&
8804 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8806 } else if ((!Base || (Base == V2)) &&
8807 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8814 // We may not have a base (first source) - this can safely be undefined.
8816 Base = DAG.getUNDEF(VT);
8818 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8819 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8820 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8821 DAG.getConstant(BitLen, DL, MVT::i8),
8822 DAG.getConstant(BitIdx, DL, MVT::i8));
8829 if (SDValue InsertQ = LowerAsInsertQ())
8835 /// \brief Lower a vector shuffle as a zero or any extension.
///
8837 /// Given a specific number of elements, element bit width, and extension
8838 /// stride, produce either a zero or any extension based on the available
8839 /// features of the subtarget. The extended elements are consecutive and
8840 /// can start from an offset element index in the input; to
8841 /// avoid excess shuffling the offset must either be in the bottom lane
8842 /// or at the start of a higher lane. All extended elements must be from
/// the same 128-bit lane as that offset.
///
/// \param Scale   Extension factor; asserted to be > 1 and to satisfy
///                Scale * element-bits <= 64.
/// \param Offset  Index of the first input element to extend; asserted to be
///                non-negative and either inside the first 128-bit lane or at
///                the start of a later lane.
/// \param AnyExt  When true the padding elements are filled from an undef
///                vector instead of a zero vector (see the UNPCK fallback).
/// \param InputV  The vector whose elements are being extended.
/// \param Mask    The original shuffle mask; only consulted by the SSE4A
///                EXTRQ path below.
///
/// Strategies are tried in this order: X86ISD::VZEXT when SSE4.1 is
/// available; otherwise (128-bit vectors only) PSHUFD-based any-extends,
/// SSE4A EXTRQI, PSHUFB for wide i8 extensions, and finally a sequence of
/// UNPCKL/UNPCKH nodes.
8844 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8845 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8846 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8847 assert(Scale > 1 && "Need a scale to extend.");
8848 int EltBits = VT.getScalarSizeInBits();
8849 int NumElements = VT.getVectorNumElements();
// Number of elements that fit in one 128-bit lane, and the lane that the
// base offset falls into.
8850 int NumEltsPerLane = 128 / EltBits;
8851 int OffsetLane = Offset / NumEltsPerLane;
8852 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
8853 "Only 8, 16, and 32 bit elements can be extended.");
8854 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
8855 assert(0 <= Offset && "Extension offset must be positive.");
8856 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
8857 "Extension offset must be in the first lane or start an upper lane.");
8859 // Check that an index is in same lane as the base offset.
8860 auto SafeOffset = [&](int Idx) {
8861 return OffsetLane == (Idx / NumEltsPerLane);
8864 // Shift along an input so that the offset base moves to the first element.
// Source indices that would leave the offset's lane are left undef (-1).
8865 auto ShuffleOffset = [&](SDValue V) {
8869 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8870 for (int i = 0; i * Scale < NumElements; ++i) {
8871 int SrcIdx = i + Offset;
8872 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
8874 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
8877 // Found a valid zext mask! Try various lowering strategies based on the
8878 // input type and available ISA extensions.
8879 if (Subtarget.hasSSE41()) {
8880 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
8881 // PUNPCK will catch this in a later shuffle match.
8882 if (Offset && Scale == 2 && VT.is128BitVector())
// The extended type has Scale-times wider integer elements and
// correspondingly fewer of them.
8884 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
8885 NumElements / Scale);
8886 InputV = ShuffleOffset(InputV);
8888 // For 256-bit vectors, we only need the lower (128-bit) input half.
8889 // For 512-bit vectors, we only need the lower input half or quarter.
8890 if (VT.getSizeInBits() > 128)
8891 InputV = extractSubVector(InputV, 0, DAG, DL,
8892 std::max(128, (int)VT.getSizeInBits() / Scale));
8894 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
8895 return DAG.getBitcast(VT, InputV);
// Everything below is a pre-SSE4.1 fallback and only handles 128-bit vectors.
8898 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
8900 // For any extends we can cheat for larger element sizes and use shuffle
8901 // instructions that can fold with a load and/or copy.
8902 if (AnyExt && EltBits == 32) {
8903 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
8905 return DAG.getBitcast(
8906 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8907 DAG.getBitcast(MVT::v4i32, InputV),
8908 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
// 16-bit any-extend by more than 2x: first move the containing 32-bit dwords
// into place with PSHUFD, then fix up the word position with PSHUFLW/PSHUFHW.
8910 if (AnyExt && EltBits == 16 && Scale > 2) {
8911 int PSHUFDMask[4] = {Offset / 2, -1,
8912 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
8913 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
8914 DAG.getBitcast(MVT::v4i32, InputV),
8915 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
8916 int PSHUFWMask[4] = {1, -1, -1, -1};
8917 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
8918 return DAG.getBitcast(
8919 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
8920 DAG.getBitcast(MVT::v8i16, InputV),
8921 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
8924 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
// (elements Offset and Offset + 1) to 64 bits each.
8926 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
8927 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
8928 assert(VT.is128BitVector() && "Unexpected vector width!");
// Extract EltBits bits starting at the bit position of element Offset.
8930 int LoIdx = Offset * EltBits;
8931 SDValue Lo = DAG.getBitcast(
8932 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8933 DAG.getConstant(EltBits, DL, MVT::i8),
8934 DAG.getConstant(LoIdx, DL, MVT::i8)));
// A single EXTRQI is enough when the upper half of the mask is undef or the
// next element would fall outside the offset's lane.
8936 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
8937 !SafeOffset(Offset + 1))
8938 return DAG.getBitcast(VT, Lo);
// Otherwise extract the following element too and combine both 64-bit
// results with UNPCKL.
8940 int HiIdx = (Offset + 1) * EltBits;
8941 SDValue Hi = DAG.getBitcast(
8942 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
8943 DAG.getConstant(EltBits, DL, MVT::i8),
8944 DAG.getConstant(HiIdx, DL, MVT::i8)));
8945 return DAG.getBitcast(VT,
8946 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
8949 // If this would require more than 2 unpack instructions to expand, use
8950 // pshufb when available. We can only use more than 2 unpack instructions
8951 // when zero extending i8 elements which also makes it easier to use pshufb.
8952 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
8953 assert(NumElements == 16 && "Unexpected byte vector width!");
8954 SDValue PSHUFBMask[16];
// Every Scale-th control byte selects the next source byte (if it is still in
// the offset's lane); all other control bytes are set to 0x80.
8955 for (int i = 0; i < 16; ++i) {
8956 int Idx = Offset + (i / Scale);
8957 PSHUFBMask[i] = DAG.getConstant(
8958 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
8960 InputV = DAG.getBitcast(MVT::v16i8, InputV);
8961 return DAG.getBitcast(
8962 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
8963 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
8966 // If we are extending from an offset, ensure we start on a boundary that
8967 // we can unpack from.
8968 int AlignToUnpack = Offset % (NumElements / Scale);
8969 if (AlignToUnpack) {
// Shift the input down by AlignToUnpack elements; the vacated top elements
// stay undef.
8970 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
8971 for (int i = AlignToUnpack; i < NumElements; ++i)
8972 ShMask[i - AlignToUnpack] = i;
8973 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
8974 Offset -= AlignToUnpack;
8977 // Otherwise emit a sequence of unpacks.
// Each round interleaves the input with an undef (any-extend) or zero
// (zero-extend) vector; an offset in the upper half selects UNPCKH.
8979 unsigned UnpackLoHi = X86ISD::UNPCKL;
8980 if (Offset >= (NumElements / 2)) {
8981 UnpackLoHi = X86ISD::UNPCKH;
8982 Offset -= (NumElements / 2);
8985 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
8986 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
8987 : getZeroVector(InputVT, Subtarget, DAG, DL);
8988 InputV = DAG.getBitcast(InputVT, InputV);
8989 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
8993 } while (Scale > 1);
8994 return DAG.getBitcast(VT, InputV);
8997 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
///
8999 /// This routine will try to do everything in its power to cleverly lower
9000 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9001 /// check for the profitability of this lowering, it tries to aggressively
9002 /// match this pattern. It will use all of the micro-architectural details it
9003 /// can to emit an efficient lowering. It handles both blends with all-zero
9004 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9005 /// masking out later).
///
9007 /// The reason we have dedicated lowering for zext-style shuffles is that they
9008 /// are both incredibly common and often quite performance sensitive.
///
/// Extension scales are tried from the widest (64-bit results) downwards; when
/// none matches, a final check looks for a "copy low 64 bits, zero the rest"
/// pattern that maps onto X86ISD::VZEXT_MOVL.
9009 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9010 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9011 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9012 SelectionDAG &DAG) {
9013 int Bits = VT.getSizeInBits();
9014 int NumLanes = Bits / 128;
9015 int NumElements = VT.getVectorNumElements();
9016 int NumEltsPerLane = NumElements / NumLanes;
9017 assert(VT.getScalarSizeInBits() <= 32 &&
9018 "Exceeds 32-bit integer zero extension limit");
9019 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9021 // Define a helper function to check a particular ext-scale and lower to it if
// the mask matches that scale.
9023 auto Lower = [&](int Scale) -> SDValue {
9028 for (int i = 0; i < NumElements; ++i) {
9031 continue; // Valid anywhere but doesn't tell us anything.
9032 if (i % Scale != 0) {
9033 // Each of the extended elements needs to be zeroable.
9037 // We no longer are in the anyext case.
9042 // Each of the base elements needs to be consecutive indices into the
9043 // same input vector.
9044 SDValue V = M < NumElements ? V1 : V2;
9045 M = M % NumElements;
// The base offset is derived from the mask index and this element's position
// in the extended result.
9048 Offset = M - (i / Scale);
9049 } else if (InputV != V)
9050 return SDValue(); // Flip-flopping inputs.
9052 // Offset must start in the lowest 128-bit lane or at the start of an
// upper lane.
9054 // FIXME: Is it ever worth allowing a negative base offset?
9055 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9056 (Offset % NumEltsPerLane) == 0))
9059 // If we are offsetting, all referenced entries must come from the same
// lane.
9061 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9064 if ((M % NumElements) != (Offset + (i / Scale)))
9065 return SDValue(); // Non-consecutive strided elements.
9069 // If we fail to find an input, we have a zero-shuffle which should always
9070 // have already been handled.
9071 // FIXME: Maybe handle this here in case during blending we end up with one?
9075 // If we are offsetting, don't extend if we only match a single input, we
9076 // can always do better by using a basic PSHUF or PUNPCK.
9077 if (Offset != 0 && Matches < 2)
9080 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9081 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9084 // The widest scale possible for extending is to a 64-bit integer.
9085 assert(Bits % 64 == 0 &&
9086 "The number of bits in a vector must be divisible by 64 on x86!");
9087 int NumExtElements = Bits / 64;
9089 // Each iteration, try extending the elements half as much, but into twice as
// many elements.
9091 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9092 assert(NumElements % NumExtElements == 0 &&
9093 "The input vector size must be divisible by the extended size.");
9094 if (SDValue V = Lower(NumElements / NumExtElements))
9098 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9102 // Returns one of the source operands if the shuffle can be reduced to a
9103 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
// The lower half of the mask must be sequential (or undef) starting at
// element 0 of the first source or element 0 of the second source.
9104 auto CanZExtLowHalf = [&]() {
9105 for (int i = NumElements / 2; i != NumElements; ++i)
9108 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9110 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9115 if (SDValue V = CanZExtLowHalf()) {
9116 V = DAG.getBitcast(MVT::v2i64, V);
9117 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9118 return DAG.getBitcast(VT, V);
9121 // No viable ext lowering found.
9125 /// \brief Try to get a scalar value for a specific element of a vector.
///
9127 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
///
/// Bitcasts on \p V are peeled first. The scalar is produced only when the
/// underlying node is a BUILD_VECTOR, or a SCALAR_TO_VECTOR with \p Idx == 0,
/// and the operand has the same bit width as the element type of \p V; it is
/// then bitcast to that element type.
/// NOTE(review): the fall-through results are not visible in this listing;
/// callers test the returned SDValue for emptiness.
9128 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9129 SelectionDAG &DAG) {
9130 MVT VT = V.getSimpleValueType();
9131 MVT EltVT = VT.getVectorElementType();
9132 V = peekThroughBitcasts(V);
9134 // If the bitcasts shift the element size, we can't extract an equivalent
// element from it.
9136 MVT NewVT = V.getSimpleValueType();
9137 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9140 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9141 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9142 // Ensure the scalar operand is the same size as the destination.
9143 // FIXME: Add support for scalar truncation where possible.
9144 SDValue S = V.getOperand(Idx);
9145 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9146 return DAG.getBitcast(EltVT, S);
9152 /// \brief Helper to test for a load that can be folded with x86 shuffles.
///
9154 /// This is particularly important because the set of instructions varies
9155 /// significantly based on whether the operand is a load or not.
///
/// \returns the result of ISD::isNON_EXTLoad on the node found after looking
/// through any bitcasts wrapped around \p V.
9156 static bool isShuffleFoldableLoad(SDValue V) {
9157 V = peekThroughBitcasts(V);
9158 return ISD::isNON_EXTLoad(V.getNode());
9161 /// \brief Try to lower insertion of a single element into a zero vector.
///
9163 /// This is a common pattern that we have especially efficient patterns to lower
9164 /// across all subtarget feature sets.
///
/// The inserted element is the first mask entry that refers to \p V2. When
/// the remaining elements are not all zeroable, only the floating-point
/// "insert into element 0" form is handled, via MOVSS/MOVSD.
9165 static SDValue lowerVectorShuffleAsElementInsertion(
9166 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9167 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9168 SelectionDAG &DAG) {
9170 MVT EltVT = VT.getVectorElementType();
// Locate the first mask entry that selects from V2 (mask value >= Mask.size()).
9173 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
// V1 counts as zeroable only if every element other than the inserted one is
// marked zeroable.
9175 bool IsV1Zeroable = true;
9176 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9177 if (i != V2Index && !Zeroable[i]) {
9178 IsV1Zeroable = false;
9182 // Check for a single input from a SCALAR_TO_VECTOR node.
9183 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9184 // all the smarts here sunk into that routine. However, the current
9185 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9186 // vector shuffle lowering is dead.
9187 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9189 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9190 // We need to zext the scalar if it is smaller than an i32.
9191 V2S = DAG.getBitcast(EltVT, V2S);
9192 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9193 // Using zext to expand a narrow element won't work for non-zero
9198 // Zero-extend directly to i32.
9200 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9202 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9203 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9204 EltVT == MVT::i16) {
9205 // Either not inserting from the low element of the input or the input
9206 // element size is too small to use VZEXT_MOVL to clear the high bits.
9210 if (!IsV1Zeroable) {
9211 // If V1 can't be treated as a zero vector we have fewer options to lower
9212 // this. We can't support integer vectors or non-zero targets cheaply, and
9213 // the V1 elements can't be permuted in any way.
9214 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9215 if (!VT.isFloatingPoint() || V2Index != 0)
// Apart from the inserted slot, V1 must already be in place (a no-op mask).
9217 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9218 V1Mask[V2Index] = -1;
9219 if (!isNoopShuffleMask(V1Mask))
9221 // This is essentially a special case blend operation, but if we have
9222 // general purpose blend operations, they are always faster. Bail and let
9223 // the rest of the lowering handle these as blends.
9224 if (Subtarget.hasSSE41())
9227 // Otherwise, use MOVSD or MOVSS.
9228 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9229 "Only two types of floating point element types to handle!");
9230 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9234 // This lowering only works for the low element with floating point vectors.
9235 if (VT.isFloatingPoint() && V2Index != 0)
// Build the VZEXT_MOVL form of V2 in the extended type, then cast back to VT.
9238 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9240 V2 = DAG.getBitcast(VT, V2);
9243 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9244 // the desired position. Otherwise it is more efficient to do a vector
9245 // shift left. We know that we can do a vector shift left because all
9246 // the inputs are zero.
9247 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
// Every lane reads element 1 of V2 except the target lane, which reads
// element 0.
9248 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9249 V2Shuffle[V2Index] = 0;
9250 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9252 V2 = DAG.getBitcast(MVT::v16i8, V2);
// Byte shift amount = target element index * element size in bytes.
9254 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9255 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9256 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9257 DAG.getDataLayout(), VT)));
9258 V2 = DAG.getBitcast(VT, V2);
9264 /// Try to lower broadcast of a single - truncated - integer element,
9265 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
///
/// \p BroadcastIdx is expressed in elements of \p VT; it is mapped onto the
/// wider elements of \p V0 by dividing by the size ratio, and the remainder
/// selects which narrow slice of that wide scalar is broadcast.
///
9267 /// This assumes we have AVX2.
9268 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9269 SDValue V0, int BroadcastIdx,
9270 const X86Subtarget &Subtarget,
9271 SelectionDAG &DAG) {
9272 assert(Subtarget.hasAVX2() &&
9273 "We can only lower integer broadcasts with AVX2!");
9275 EVT EltVT = VT.getVectorElementType();
9276 EVT V0VT = V0.getValueType();
9278 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9279 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9281 EVT V0EltVT = V0VT.getVectorElementType();
9282 if (!V0EltVT.isInteger())
9285 const unsigned EltSize = EltVT.getSizeInBits();
9286 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9288 // This is only a truncation if the original element type is larger.
9289 if (V0EltSize <= EltSize)
9292 assert(((V0EltSize % EltSize) == 0) &&
9293 "Scalar type sizes must all be powers of 2 on x86!");
// Scale is how many VT elements fit in one V0 element; V0BroadcastIdx is the
// V0 element that contains the requested VT element.
9295 const unsigned V0Opc = V0.getOpcode();
9296 const unsigned Scale = V0EltSize / EltSize;
9297 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
// Only BUILD_VECTOR, or element 0 of a SCALAR_TO_VECTOR, gives direct access
// to the scalar operand.
9299 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9300 V0Opc != ISD::BUILD_VECTOR)
9303 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9305 // If we're extracting non-least-significant bits, shift so we can truncate.
9306 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9307 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9308 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9309 if (const int OffsetIdx = BroadcastIdx % Scale)
9310 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9311 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9313 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9314 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9317 /// \brief Try to lower broadcast of a single element.
///
9319 /// For convenience, this code also bundles all of the subtarget feature set
9320 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9321 /// a convenient way to factor it out.
///
/// Supported combinations (see the guard at the top of the body): v2f64 with
/// SSE3 (lowered to MOVDDUP), floating-point vectors with AVX, and integer
/// vectors with AVX2 (both lowered to VBROADCAST).
///
9322 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
9323 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9324 SDValue V1, SDValue V2,
9326 const X86Subtarget &Subtarget,
9327 SelectionDAG &DAG) {
9328 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9329 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9330 (Subtarget.hasAVX2() && VT.isInteger())))
9333 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9334 // we can only broadcast from a register with AVX2.
9335 unsigned NumElts = Mask.size();
9336 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9337 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9339 // Check that the mask is a broadcast.
// Each candidate index i is tested by comparing the mask against an all-i
// mask.
9340 int BroadcastIdx = -1;
9341 for (int i = 0; i != (int)NumElts; ++i) {
9342 SmallVector<int, 8> BroadcastMask(NumElts, i);
9343 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9349 if (BroadcastIdx < 0)
9351 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9352 "a sorted mask where the broadcast "
9355 // Go up the chain of (vector) values to find a scalar load that we can
9356 // combine with the broadcast.
// BroadcastIdx is re-based whenever we step into a sub-operand.
9359 switch (V.getOpcode()) {
9360 case ISD::BITCAST: {
9361 SDValue VSrc = V.getOperand(0);
9362 MVT SrcVT = VSrc.getSimpleValueType();
9363 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9368 case ISD::CONCAT_VECTORS: {
// Step into the concatenated operand that holds the broadcast element.
9369 int OperandSize = Mask.size() / V.getNumOperands();
9370 V = V.getOperand(BroadcastIdx / OperandSize);
9371 BroadcastIdx %= OperandSize;
9374 case ISD::INSERT_SUBVECTOR: {
9375 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9376 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
// [BeginIdx, EndIdx) is the element range covered by the inserted subvector.
9380 int BeginIdx = (int)ConstantIdx->getZExtValue();
9382 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9383 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9384 BroadcastIdx -= BeginIdx;
9395 // Check if this is a broadcast of a scalar. We special case lowering
9396 // for scalars so that we can more effectively fold with loads.
9397 // First, look through bitcast: if the original value has a larger element
9398 // type than the shuffle, the broadcast element is in essence truncated.
9399 // Make that explicit to ease folding.
9400 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9401 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9402 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9403 return TruncBroadcast;
9405 MVT BroadcastVT = VT;
9407 // Peek through any bitcast (only useful for loads).
9408 SDValue BC = peekThroughBitcasts(V);
9410 // Also check the simpler case, where we can directly reuse the scalar.
9411 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9412 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9413 V = V.getOperand(BroadcastIdx);
9415 // If we can't broadcast from a register, check that the input is a load.
9416 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9418 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9419 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9420 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9421 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9422 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9425 // If we are broadcasting a load that is only used by the shuffle
9426 // then we can reduce the vector load to the broadcasted scalar load.
9427 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9428 SDValue BaseAddr = Ld->getOperand(1);
9429 EVT SVT = BroadcastVT.getScalarType();
// Byte offset of the broadcast element from the start of the original load.
9430 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9431 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9432 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9433 DAG.getMachineFunction().getMachineMemOperand(
9434 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
9436 // Make sure the newly-created LOAD is in the same position as Ld in
9437 // terms of dependency. We create a TokenFactor for Ld and V,
9438 // and update uses of Ld's output chain to use the TokenFactor.
9439 if (Ld->hasAnyUseOfValue(1)) {
9440 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9441 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9442 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
// The replacement above also rewrote NewChain's own operand; restore it so the
// TokenFactor still refers to Ld's chain output.
9443 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9444 SDValue(V.getNode(), 1));
9446 } else if (!BroadcastFromReg) {
9447 // We can't broadcast from a vector register.
9449 } else if (BroadcastIdx != 0) {
9450 // We can only broadcast from the zero-element of a vector register,
9451 // but it can be advantageous to broadcast from the zero-element of a
// subvector.
9453 if (!VT.is256BitVector() && !VT.is512BitVector())
9456 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9457 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9460 // Only broadcast the zero-element of a 128-bit subvector.
9461 unsigned EltSize = VT.getScalarSizeInBits();
9462 if (((BroadcastIdx * EltSize) % 128) != 0)
9465 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
9466 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
9467 DAG.getIntPtrConstant(BroadcastIdx, DL));
// MOVDDUP takes a vector operand, so wrap a bare scalar in a v2f64.
9470 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9471 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9472 DAG.getBitcast(MVT::f64, V));
9474 // Bitcast back to the same scalar type as BroadcastVT.
9475 MVT SrcVT = V.getSimpleValueType();
9476 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9477 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9478 "Unexpected vector element size");
9479 if (SrcVT.isVector()) {
9480 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9481 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
9483 SrcVT = BroadcastVT.getScalarType();
9485 V = DAG.getBitcast(SrcVT, V);
9488 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9489 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9490 V = DAG.getBitcast(MVT::f64, V);
9491 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
9492 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
// Emit the broadcast in BroadcastVT and cast the result back to the requested
// type.
9495 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
9498 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9499 // INSERTPS when the V1 elements are already in the correct locations
9500 // because otherwise we can just always use two SHUFPS instructions which
9501 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9502 // perform INSERTPS if a single V1 element is out of place and all V2
9503 // elements are zeroable.
//
// V1, V2 and InsertPSMask are in/out parameters. The immediate is assembled
// as (source index << 6) | (destination index << 4) | zero mask. The match is
// attempted with the operands as given and then again with them commuted.
9504 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
9505 unsigned &InsertPSMask,
9506 const SmallBitVector &Zeroable,
9508 SelectionDAG &DAG) {
9509 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9510 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9511 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9513 // Attempt to match INSERTPS with one element from VA or VB being
9514 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
// are updated.
9516 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
9517 ArrayRef<int> CandidateMask) {
// Destination lane of the single element to insert, tracked separately
// depending on whether it comes from VA or VB; -1 means none found yet.
9519 int VADstIndex = -1;
9520 int VBDstIndex = -1;
9521 bool VAUsedInPlace = false;
9523 for (int i = 0; i < 4; ++i) {
9524 // Synthesize a zero mask from the zeroable elements (includes undefs).
9530 // Flag if we use any VA inputs in place.
9531 if (i == CandidateMask[i]) {
9532 VAUsedInPlace = true;
9536 // We can only insert a single non-zeroable element.
9537 if (VADstIndex >= 0 || VBDstIndex >= 0)
9540 if (CandidateMask[i] < 4) {
9541 // VA input out of place for insertion.
9544 // VB input for insertion.
9549 // Don't bother if we have no (non-zeroable) element for insertion.
9550 if (VADstIndex < 0 && VBDstIndex < 0)
9553 // Determine element insertion src/dst indices. The src index is from the
9554 // start of the inserted vector, not the start of the concatenated vector.
9555 unsigned VBSrcIndex = 0;
9556 if (VADstIndex >= 0) {
9557 // If we have a VA input out of place, we use VA as the V2 element
9558 // insertion and don't use the original V2 at all.
9559 VBSrcIndex = CandidateMask[VADstIndex];
9560 VBDstIndex = VADstIndex;
// Mask values 4..7 refer to VB, so subtract 4 to index within VB.
9563 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
9566 // If no V1 inputs are used in place, then the result is created only from
9567 // the zero mask and the V2 insertion - so remove V1 dependency.
9569 VA = DAG.getUNDEF(MVT::v4f32);
9571 // Update V1, V2 and InsertPSMask accordingly.
9575 // Insert the V2 element into the desired position.
9576 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
9577 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9581 if (matchAsInsertPS(V1, V2, Mask))
9584 // Commute and try again.
9585 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
9586 ShuffleVectorSDNode::commuteMask(CommutedMask);
9587 if (matchAsInsertPS(V2, V1, CommutedMask))
/// Lower a v4f32 shuffle to a single X86ISD::INSERTPS node when
/// matchVectorShuffleAsInsertPS recognises the mask. V1 and V2 are taken by
/// value, so the operands chosen by the matcher only affect the node built
/// here.
9593 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
9594 SDValue V2, ArrayRef<int> Mask,
9595 const SmallBitVector &Zeroable,
9596 SelectionDAG &DAG) {
9597 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9598 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9600 // Attempt to match the insertps pattern.
9601 unsigned InsertPSMask;
9602 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
9605 // Insert the V2 element into the desired position.
// The immediate is passed as an i8 constant operand.
9606 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9607 DAG.getConstant(InsertPSMask, DL, MVT::i8));
9610 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
9611 /// UNPCK instruction.
///
9613 /// This specifically targets cases where we end up with alternating between
9614 /// the two inputs, and so can permute them into something that feeds a single
9615 /// UNPCK instruction. Note that this routine only targets integer vectors
9616 /// because for floating point vectors we have a generalized SHUFPS lowering
9617 /// strategy that handles everything that doesn't *exactly* match an unpack,
9618 /// making this clever lowering unnecessary.
///
/// Preconditions (asserted): \p VT is a 128-bit integer vector, \p V2 is not
/// undef, and the mask has at least two elements.
9619 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
9620 SDValue V1, SDValue V2,
9622 SelectionDAG &DAG) {
9623 assert(!VT.isFloatingPoint() &&
9624 "This routine only supports integer vectors.");
9625 assert(VT.is128BitVector() &&
9626 "This routine only works on 128-bit vectors.");
9627 assert(!V2.isUndef() &&
9628 "This routine should only be used when blending two inputs.");
9629 assert(Mask.size() >= 2 && "Single element masks are invalid.");
9631 int Size = Mask.size();
// Count the mask entries that read from the low half of either input. Undef
// entries (negative) are excluded explicitly.
9634 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
// Count the mask entries that read from the high half of either input.
9636 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
// Prefer the unpack variant that matches where most inputs already live.
9638 bool UnpackLo = NumLoInputs >= NumHiInputs;
// Try to express the shuffle as "permute V1, permute V2, then UNPCK" with
// unpack elements of ScalarSize bits, each made of Scale mask elements.
9640 auto TryUnpack = [&](int ScalarSize, int Scale) {
9641 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
9642 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
9644 for (int i = 0; i < Size; ++i) {
9648 // Each element of the unpack contains Scale elements from this mask.
9649 int UnpackIdx = i / Scale;
9651 // We only handle the case where V1 feeds the first slots of the unpack.
9652 // We rely on canonicalization to ensure this is the case.
9653 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
9656 // Setup the mask for this input. The indexing is tricky as we have to
9657 // handle the unpack stride.
9658 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
9659 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
9663 // If we will have to shuffle both inputs to use the unpack, check whether
9664 // we can just unpack first and shuffle the result. If so, skip this unpack.
9665 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
9666 !isNoopShuffleMask(V2Mask))
9669 // Shuffle the inputs into place.
9670 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9671 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9673 // Cast the inputs to the type we will use to unpack them.
9674 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
9675 V1 = DAG.getBitcast(UnpackVT, V1);
9676 V2 = DAG.getBitcast(UnpackVT, V2);
9678 // Unpack the inputs and cast the result back to the desired type.
9679 return DAG.getBitcast(
9680 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9684 // We try each unpack from the largest to the smallest to try and find one
9685 // that fits this mask.
9686 int OrigScalarSize = VT.getScalarSizeInBits();
9687 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
9688 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
9691 // If none of the unpack-rooted lowerings worked (or were profitable) try an
// initial unpack followed by a permute of the unpacked result.
9693 if (NumLoInputs == 0 || NumHiInputs == 0) {
9694 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
9695 "We have to have *some* inputs!");
// All inputs come from one half; HalfOffset is the first element index of
// that half.
9696 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
9698 // FIXME: We could consider the total complexity of the permute of each
9699 // possible unpacking. Or at the least we should consider how many
9700 // half-crossings are created.
9701 // FIXME: We could consider commuting the unpacks.
9703 SmallVector<int, 32> PermMask((unsigned)Size, -1);
9704 for (int i = 0; i < Size; ++i) {
9708 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
// After the unpack, element k of the chosen half of V1 sits at position 2*k
// and the matching V2 element at 2*k + 1.
9711 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
9713 return DAG.getVectorShuffle(
9714 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
9716 DAG.getUNDEF(VT), PermMask);
9722 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
///
9724 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
9725 /// support for floating point shuffles but not integer shuffles. These
9726 /// instructions will incur a domain crossing penalty on some chips though so
9727 /// it is better to avoid lowering through this for integer vectors where
/// possible.
///
/// \p V1 and \p V2 must both be v2f64 and \p Mask must have exactly two
/// elements (asserted below). Mask indices 0-1 refer to V1 and 2-3 to V2.
/// \p Zeroable is forwarded to the element-insertion and blend helpers.
9729 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9730 const SmallBitVector &Zeroable,
9731 SDValue V1, SDValue V2,
9732 const X86Subtarget &Subtarget,
9733 SelectionDAG &DAG) {
9734 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9735 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9736 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
// NOTE(review): the VPERMILPI/SHUFP nodes built in the next section take V1
// as their only vector input, i.e. this is the single-input path. The
// condition that selects this path is not visible in this excerpt.
9739 // Check for being able to broadcast a single element.
9740 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9741 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
9744 // Straight shuffle of a single input vector. Simulate this by using the
9745 // single input as both of the "inputs" to this instruction.
// Bit i of the immediate is set when result lane i takes element 1.
9746 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
9748 if (Subtarget.hasAVX()) {
9749 // If we have AVX, we can use VPERMILPD (X86ISD::VPERMILPI on v2f64) which
9750 // will allow folding a load into the shuffle.
9751 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
9752 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
// Without AVX use SHUFP with V1 as both operands; a lane whose mask entry is
// undef gets an UNDEF operand instead of V1.
9756 X86ISD::SHUFP, DL, MVT::v2f64,
9757 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9758 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9759 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
// Two-input path: lane 0 is expected to come from V1 (index 0 or 1) and
// lane 1 from V2 (index 2 or 3).
9761 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
9762 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
9764 // If we have a single input, insert that into V1 if we can do so cheaply.
9765 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
9766 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9767 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9769 // Try inverting the insertion since for v2 masks it is easy to do and we
9770 // can't reliably sort the mask one way or the other.
// XOR with 2 maps a V1 index (0-1) to the matching V2 index (2-3) and vice
// versa, so the mask stays consistent with the swapped operands below.
9771 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
9772 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
9773 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9774 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9778 // Try to use one of the special instruction patterns to handle two common
9779 // blend patterns if a zero-blend above didn't work.
9780 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
9781 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
9782 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
9783 // We can either use a special instruction to load over the low double or
9784 // to move just the low double.
// MOVLPD is chosen when the scalar is a foldable load, MOVSD otherwise.
9786 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
9788 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
// Blend lowering is only attempted when the subtarget has SSE4.1.
9790 if (Subtarget.hasSSE41())
9791 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
9792 Zeroable, Subtarget, DAG))
9795 // Use dedicated unpack instructions for masks that match their pattern.
9797 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
// Fallback: a two-input SHUFP. Bit 0 of the immediate picks the element of V1
// for lane 0; bit 1 picks the element of V2 (Mask[1] rebased by 2) for lane 1.
9800 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
9801 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
9802 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9805 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
///
9807 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
9808 /// the integer unit to minimize domain crossing penalties. However, for blends
9809 /// it falls back to the floating point shuffle operation with appropriate bit
/// casting.
///
/// \p V1 and \p V2 must both be v2i64 and \p Mask must have exactly two
/// elements (asserted below). Mask indices 0-1 refer to V1 and 2-3 to V2.
9811 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9812 const SmallBitVector &Zeroable,
9813 SDValue V1, SDValue V2,
9814 const X86Subtarget &Subtarget,
9815 SelectionDAG &DAG) {
9816 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9817 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9818 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
// NOTE(review): the PSHUFD built in the next section uses V1 as its only
// vector input, i.e. this is the single-input path. The condition that
// selects this path is not visible in this excerpt.
9821 // Check for being able to broadcast a single element.
9822 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9823 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9826 // Straight shuffle of a single input vector. For everything from SSE2
9827 // onward this has a single fast instruction with no scary immediates.
9828 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9829 V1 = DAG.getBitcast(MVT::v4i32, V1);
// Each i64 lane index M expands to the i32 lane pair {2*M, 2*M+1};
// std::max(M, 0) turns an undef (negative) lane into lane 0.
9830 int WidenedMask[4] = {
9831 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9832 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
9833 return DAG.getBitcast(
9835 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9836 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
// Two-input path: both lanes must be defined, lane 0 from V1 and lane 1 from
// V2.
9838 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9839 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9840 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9841 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9843 // If we have a blend of two same-type PACKUS operations and the blend aligns
9844 // with the low and high halves, we can just merge the PACKUS operations.
9845 // This is particularly important as it lets us merge shuffles that this
9846 // routine itself creates.
// GetPackNode looks through bitcasts and yields the PACKUS node, or an empty
// SDValue when V is not a PACKUS.
9847 auto GetPackNode = [](SDValue V) {
9848 V = peekThroughBitcasts(V);
9849 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
9851 if (SDValue V1Pack = GetPackNode(V1))
9852 if (SDValue V2Pack = GetPackNode(V2)) {
9853 EVT PackVT = V1Pack.getValueType();
9854 if (PackVT == V2Pack.getValueType())
// The merged PACKUS takes as its first operand the operand of V1's PACKUS
// selected by Mask[0], and as its second operand the operand of V2's PACKUS
// selected by Mask[1].
9855 return DAG.getBitcast(MVT::v2i64,
9856 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
9857 Mask[0] == 0 ? V1Pack.getOperand(0)
9858 : V1Pack.getOperand(1),
9859 Mask[1] == 2 ? V2Pack.getOperand(0)
9860 : V2Pack.getOperand(1)));
9863 // Try to use shift instructions.
9864 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
9865 Zeroable, Subtarget, DAG))
9868 // When loading a scalar and then shuffling it into a vector we can often do
9869 // the insertion cheaply.
9870 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9871 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9873 // Try inverting the insertion since for v2 masks it is easy to do and we
9874 // can't reliably sort the mask one way or the other.
// XOR with 2 swaps V1 indices (0-1) with V2 indices (2-3). No undef handling
// is needed here because the asserts above rule out undef lanes.
9875 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
9876 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9877 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9880 // We have different paths for blend lowering, but they all must use the
9881 // *exact* same predicate.
9882 bool IsBlendSupported = Subtarget.hasSSE41();
9883 if (IsBlendSupported)
9884 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
9885 Zeroable, Subtarget, DAG))
9888 // Use dedicated unpack instructions for masks that match their pattern.
9890 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
9893 // Try to use byte rotation instructions.
9894 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
9895 if (Subtarget.hasSSSE3())
9896 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
9897 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9900 // If we have direct support for blends, we should lower by decomposing into
9901 // a permute. That will be faster than the domain cross.
9902 if (IsBlendSupported)
9903 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
9906 // We implement this with SHUFPD which is pretty lame because it will likely
9907 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
9908 // However, all the alternatives are still more cycles and newer chips don't
9909 // have this problem. It would be really nice if x86 had better shuffles here.
// Bitcast both inputs to v2f64, shuffle in that type with the same mask, and
// bitcast the result back to v2i64.
9910 V1 = DAG.getBitcast(MVT::v2f64, V1);
9911 V2 = DAG.getBitcast(MVT::v2f64, V2);
9912 return DAG.getBitcast(MVT::v2i64,
9913 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
9916 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
///
9918 /// This is used to disable more specialized lowerings when the shufps lowering
9919 /// will happen to be efficient.
///
/// \p Mask must have exactly four elements, each in the range [-1, 8)
/// (asserted below); negative entries are treated as undef lanes.
9920 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
9921 // This routine only handles 128-bit shufps.
9922 assert(Mask.size() == 4 && "Unsupported mask size!");
9923 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
9924 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
9925 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
9926 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
9928 // To lower with a single SHUFPS we need to have the low half and high half
9929 // each requiring a single input.
// Low half (lanes 0-1): both lanes are defined but one index is below 4 and
// the other is not, so the half mixes the two inputs.
// NOTE(review): the statements executed when these checks fire, and the
// final return, are not visible in this excerpt.
9930 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
// High half (lanes 2-3): the same mixed-input test.
9932 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
9938 /// \brief Lower a vector shuffle using the SHUFPS instruction.
///
9940 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
9941 /// It makes no assumptions about whether this is the *best* lowering, it simply
/// uses it.
///
/// Only Mask[0..3] are read. Indices below 4 refer to \p V1 and indices 4 and
/// above to \p V2. The result is a single X86ISD::SHUFP node of type \p VT,
/// possibly fed by one extra SHUFP that pre-blends the inputs.
9943 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
9944 ArrayRef<int> Mask, SDValue V1,
9945 SDValue V2, SelectionDAG &DAG) {
// LowV/HighV are the operands of the final SHUFP and NewMask its lane
// selection; they start out as V1/V2 and a copy of Mask.
9946 SDValue LowV = V1, HighV = V2;
9947 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
9949 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
9951 if (NumV2Elements == 1) {
// Position of the single lane that reads from V2.
9952 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
9954 // Compute the index adjacent to V2Index and in the same half by toggling
// the low bit.
9956 int V2AdjIndex = V2Index ^ 1;
9958 if (Mask[V2AdjIndex] < 0) {
9959 // Handles all the cases where we have a single V2 element and an undef.
9960 // This will only ever happen in the high lanes because we commute the
9961 // vector otherwise.
// NOTE(review): any statements between this comment and the swap are not
// visible in this excerpt.
9963 std::swap(LowV, HighV);
// Rebase the V2 index into the 0-3 range.
9964 NewMask[V2Index] -= 4;
9966 // Handle the case where the V2 element ends up adjacent to a V1 element.
9967 // To make this work, blend them together as the first step.
9968 int V1Index = V2AdjIndex;
// The blend is SHUFP(V2, V1): lane 0 receives the wanted V2 element and
// lane 2 the wanted V1 element; lanes 1 and 3 are given index 0 and are not
// referenced by the NewMask entries set below.
9969 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
9970 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
9971 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
9973 // Now proceed to reconstruct the final blend as we have the necessary
9974 // high or low half formed.
// NOTE(review): the statements that pick LowV/HighV for this case are not
// visible in this excerpt.
9981 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
9982 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
9984 } else if (NumV2Elements == 2) {
9985 if (Mask[0] < 4 && Mask[1] < 4) {
9986 // Handle the easy case where we have V1 in the low lanes and V2 in the
// high lanes.
// NOTE(review): the statements for this branch are not visible in this
// excerpt.
9990 } else if (Mask[2] < 4 && Mask[3] < 4) {
9991 // We also handle the reversed case because this utility may get called
9992 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
9993 // arrange things in the right direction.
// NOTE(review): the statements for this branch are not visible in this
// excerpt.
9999 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10000 // trying to place elements directly, just blend them and set up the final
10001 // shuffle to place them.
10003 // The first two blend mask elements are for V1, the second two are for
// V2. Blend lanes 0 and 2 hold the elements wanted in the low half of the
// result, blend lanes 1 and 3 those wanted in the high half.
10005 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10006 Mask[2] < 4 ? Mask[2] : Mask[3],
10007 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10008 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10009 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10010 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10012 // Now we do a normal shuffle of V1 by giving V1 as both operands to
// the final SHUFP.
// NOTE(review): the assignment that makes V1 both operands is not visible in
// this excerpt.
// The entries below pick each result lane out of the blended vector: the V1
// elements sit in blend lanes 0 and 1, the V2 elements in lanes 2 and 3.
10015 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10016 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10017 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10018 NewMask[3] = Mask[2] < 4 ? 3 : 1;
// Emit the final SHUFP using whichever operands and mask were chosen above.
10021 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10022 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10025 /// \brief Lower 4-lane 32-bit floating point shuffles.
///
10027 /// Uses instructions exclusively from the floating point unit to minimize
10028 /// domain crossing penalties, as these are sufficient to implement all v4f32
/// shuffles.
///
/// \p V1 and \p V2 must both be v4f32 and \p Mask must have exactly four
/// elements (asserted below). Mask indices 0-3 refer to V1 and 4-7 to V2.
10030 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10031 const SmallBitVector &Zeroable,
10032 SDValue V1, SDValue V2,
10033 const X86Subtarget &Subtarget,
10034 SelectionDAG &DAG) {
10035 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10036 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10037 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
// Number of result lanes that read from V2.
10039 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
// Single-input case: every lane is undef or reads from V1.
10041 if (NumV2Elements == 0) {
10042 // Check for being able to broadcast a single element.
10043 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10044 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10047 // Use even/odd duplicate instructions for masks that match their pattern.
10048 if (Subtarget.hasSSE3()) {
10049 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10050 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10051 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10052 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10055 if (Subtarget.hasAVX()) {
10056 // If we have AVX, we can use VPERMILPS which will allow folding a load
10057 // into the shuffle.
10058 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10059 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10062 // Otherwise, use a straight shuffle of a single input vector. We pass the
10063 // input vector to both operands to simulate this with a SHUFPS.
10064 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10065 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10068 // There are special ways we can lower some single-element blends. However, we
10069 // have custom ways we can lower more complex single-element blends below that
10070 // we defer to if both this and BLENDPS fail to match, so restrict this to
10071 // when the V2 input is targeting element 0 of the mask -- that is the fast
// case here.
10073 if (NumV2Elements == 1 && Mask[0] >= 4)
10074 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10075 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
// The blend, INSERTPS and blend-and-permute attempts below are only made
// when the subtarget has SSE4.1.
10078 if (Subtarget.hasSSE41()) {
10079 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10080 Zeroable, Subtarget, DAG))
10083 // Use INSERTPS if we can complete the shuffle efficiently.
10085 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
// Only try blend-and-permute when a single SHUFPS would not already do.
10088 if (!isSingleSHUFPSMask(Mask))
10089 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10090 DL, MVT::v4f32, V1, V2, Mask, DAG))
10094 // Use low/high mov instructions.
10095 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10096 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
// Note the swapped operand order (V2, V1) for MOVHLPS.
10097 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10098 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10100 // Use dedicated unpack instructions for masks that match their pattern.
10102 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10105 // Otherwise fall back to a SHUFPS lowering strategy.
10106 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10109 /// \brief Lower 4-lane i32 vector shuffles.
///
10111 /// We try to handle these with integer-domain shuffles where we can, but for
10112 /// blends we use the floating point domain blend instructions.
///
/// \p V1 and \p V2 must both be v4i32 and \p Mask must have exactly four
/// elements (asserted below). Mask indices 0-3 refer to V1 and 4-7 to V2.
10113 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10114 const SmallBitVector &Zeroable,
10115 SDValue V1, SDValue V2,
10116 const X86Subtarget &Subtarget,
10117 SelectionDAG &DAG) {
10118 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10119 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10120 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10122 // Whenever we can lower this as a zext, that instruction is strictly faster
10123 // than any alternative. It also allows us to fold memory operands into the
10124 // shuffle in many cases.
10125 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10126 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
// Number of result lanes that read from V2.
10129 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
// Single-input case: every lane is undef or reads from V1.
10131 if (NumV2Elements == 0) {
10132 // Check for being able to broadcast a single element.
10133 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10134 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10137 // Straight shuffle of a single input vector. For everything from SSE2
10138 // onward this has a single fast instruction with no scary immediates.
10139 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10140 // but we aren't actually going to use the UNPCK instruction because doing
10141 // so prevents folding a load into this instruction or making a copy.
// Mask is an ArrayRef, so the assignments below rebind it to these local
// arrays; the caller's mask is not modified.
10142 const int UnpackLoMask[] = {0, 0, 1, 1};
10143 const int UnpackHiMask[] = {2, 2, 3, 3};
10144 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10145 Mask = UnpackLoMask;
10146 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10147 Mask = UnpackHiMask;
10149 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10150 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10153 // Try to use shift instructions.
10154 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10155 Zeroable, Subtarget, DAG))
10158 // There are special ways we can lower some single-element blends.
10159 if (NumV2Elements == 1)
10160 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10161 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10164 // We have different paths for blend lowering, but they all must use the
10165 // *exact* same predicate.
10166 bool IsBlendSupported = Subtarget.hasSSE41();
10167 if (IsBlendSupported)
10168 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10169 Zeroable, Subtarget, DAG))
// Try lowering as a bit mask; this is attempted regardless of blend support.
10172 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10176 // Use dedicated unpack instructions for masks that match their pattern.
10178 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10181 // Try to use byte rotation instructions.
10182 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10183 if (Subtarget.hasSSSE3())
10184 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10185 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10188 // Assume that a single SHUFPS is faster than an alternative sequence of
10189 // multiple instructions (even if the CPU has a domain penalty).
10190 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10191 if (!isSingleSHUFPSMask(Mask)) {
10192 // If we have direct support for blends, we should lower by decomposing into
10193 // a permute. That will be faster than the domain cross.
10194 if (IsBlendSupported)
10195 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10198 // Try to lower by permuting the inputs into an unpack instruction.
10199 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10200 DL, MVT::v4i32, V1, V2, Mask, DAG))
10204 // We implement this with SHUFPS because it can blend from two vectors.
10205 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10206 // up the inputs, bypassing domain shift penalties that we would incur if we
10207 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
// a problem.
// Bitcast both inputs to v4f32, shuffle in that type with the same mask, and
// bitcast the result back to v4i32.
10209 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10210 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10211 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10212 return DAG.getBitcast(MVT::v4i32, ShufPS);
10215 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10216 /// shuffle lowering, and the most complex part.
10218 /// The lowering strategy is to try to form pairs of input lanes which are
10219 /// targeted at the same half of the final vector, and then use a dword shuffle
10220 /// to place them onto the right half, and finally unpack the paired lanes into
10221 /// their final position.
10223 /// The exact breakdown of how to form these dword pairs and align them on the
10224 /// correct sides is really tricky. See the comments within the function for
10225 /// more of the details.
10227 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10228 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10229 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10230 /// vector, form the analogous 128-bit 8-element Mask.
10231 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10232 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10233 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10234 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10235 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10237 assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
10238 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10239 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10241 SmallVector<int, 4> LoInputs;
10242 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
10243 [](int M) { return M >= 0; });
10244 std::sort(LoInputs.begin(), LoInputs.end());
10245 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10246 SmallVector<int, 4> HiInputs;
10247 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
10248 [](int M) { return M >= 0; });
10249 std::sort(HiInputs.begin(), HiInputs.end());
10250 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10252 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10253 int NumHToL = LoInputs.size() - NumLToL;
10255 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10256 int NumHToH = HiInputs.size() - NumLToH;
10257 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10258 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10259 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10260 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10262 // If we are splatting two values from one half - one to each half, then
10263 // we can shuffle that half so each is splatted to a dword, then splat those
10264 // to their respective halves.
10265 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10267 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10268 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10269 V = DAG.getNode(ShufWOp, DL, VT, V,
10270 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10271 V = DAG.getBitcast(PSHUFDVT, V);
10272 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10273 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10274 return DAG.getBitcast(VT, V);
10277 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10278 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10279 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10280 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
10282 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10283 // such inputs we can swap two of the dwords across the half mark and end up
10284 // with <=2 inputs to each half in each half. Once there, we can fall through
10285 // to the generic code below. For example:
10287 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10288 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10290 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10291 // and an existing 2-into-2 on the other half. In this case we may have to
10292 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10293 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10294 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10295 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10296 // half than the one we target for fixing) will be fixed when we re-enter this
10297 // path. We will also combine away any sequence of PSHUFD instructions that
10298 // result into a single instruction. Here is an example of the tricky case:
10300 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10301 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10303 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10305 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10306 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10308 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10309 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10311 // The result is fine to be handled by the generic logic.
10312 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10313 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10314 int AOffset, int BOffset) {
10315 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10316 "Must call this with A having 3 or 1 inputs from the A half.");
10317 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10318 "Must call this with B having 1 or 3 inputs from the B half.");
10319 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10320 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10322 bool ThreeAInputs = AToAInputs.size() == 3;
10324 // Compute the index of dword with only one word among the three inputs in
10325 // a half by taking the sum of the half with three inputs and subtracting
10326 // the sum of the actual three inputs. The difference is the remaining
10328 int ADWord, BDWord;
10329 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10330 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10331 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10332 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10333 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10334 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10335 int TripleNonInputIdx =
10336 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10337 TripleDWord = TripleNonInputIdx / 2;
10339 // We use xor with one to compute the adjacent DWord to whichever one the
10341 OneInputDWord = (OneInput / 2) ^ 1;
10343 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10344 // and BToA inputs. If there is also such a problem with the BToB and AToB
10345 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10346 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10347 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10348 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10349 // Compute how many inputs will be flipped by swapping these DWords. We
10351 // to balance this to ensure we don't form a 3-1 shuffle in the other
10353 int NumFlippedAToBInputs =
10354 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10355 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10356 int NumFlippedBToBInputs =
10357 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10358 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10359 if ((NumFlippedAToBInputs == 1 &&
10360 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10361 (NumFlippedBToBInputs == 1 &&
10362 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10363 // We choose whether to fix the A half or B half based on whether that
10364 // half has zero flipped inputs. At zero, we may not be able to fix it
10365 // with that half. We also bias towards fixing the B half because that
10366 // will more commonly be the high half, and we have to bias one way.
10367 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10368 ArrayRef<int> Inputs) {
10369 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10370 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10371 // Determine whether the free index is in the flipped dword or the
10372 // unflipped dword based on where the pinned index is. We use this bit
10373 // in an xor to conditionally select the adjacent dword.
10374 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10375 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10376 if (IsFixIdxInput == IsFixFreeIdxInput)
10378 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10379 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10380 "We need to be changing the number of flipped inputs!");
10381 int PSHUFHalfMask[] = {0, 1, 2, 3};
10382 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10383 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10385 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10387 for (int &M : Mask)
10388 if (M >= 0 && M == FixIdx)
10390 else if (M >= 0 && M == FixFreeIdx)
10393 if (NumFlippedBToBInputs != 0) {
10395 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10396 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10398 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10399 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10400 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10405 int PSHUFDMask[] = {0, 1, 2, 3};
10406 PSHUFDMask[ADWord] = BDWord;
10407 PSHUFDMask[BDWord] = ADWord;
10408 V = DAG.getBitcast(
10410 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10411 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10413 // Adjust the mask to match the new locations of A and B.
10414 for (int &M : Mask)
10415 if (M >= 0 && M/2 == ADWord)
10416 M = 2 * BDWord + M % 2;
10417 else if (M >= 0 && M/2 == BDWord)
10418 M = 2 * ADWord + M % 2;
10420 // Recurse back into this routine to re-compute state now that this isn't
10421 // a 3 and 1 problem.
10422 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10425 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10426 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10427 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10428 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10430 // At this point there are at most two inputs to the low and high halves from
10431 // each half. That means the inputs can always be grouped into dwords and
10432 // those dwords can then be moved to the correct half with a dword shuffle.
10433 // We use at most one low and one high word shuffle to collect these paired
10434 // inputs into dwords, and finally a dword shuffle to place them.
10435 int PSHUFLMask[4] = {-1, -1, -1, -1};
10436 int PSHUFHMask[4] = {-1, -1, -1, -1};
10437 int PSHUFDMask[4] = {-1, -1, -1, -1};
10439 // First fix the masks for all the inputs that are staying in their
10440 // original halves. This will then dictate the targets of the cross-half
10442 auto fixInPlaceInputs =
10443 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10444 MutableArrayRef<int> SourceHalfMask,
10445 MutableArrayRef<int> HalfMask, int HalfOffset) {
10446 if (InPlaceInputs.empty())
10448 if (InPlaceInputs.size() == 1) {
10449 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10450 InPlaceInputs[0] - HalfOffset;
10451 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10454 if (IncomingInputs.empty()) {
10455 // Just fix all of the in place inputs.
10456 for (int Input : InPlaceInputs) {
10457 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10458 PSHUFDMask[Input / 2] = Input / 2;
10463 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10464 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10465 InPlaceInputs[0] - HalfOffset;
10466 // Put the second input next to the first so that they are packed into
10467 // a dword. We find the adjacent index by toggling the low bit.
10468 int AdjIndex = InPlaceInputs[0] ^ 1;
10469 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10470 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10471 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10473 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10474 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10476 // Now gather the cross-half inputs and place them into a free dword of
10477 // their target half.
10478 // FIXME: This operation could almost certainly be simplified dramatically to
10479 // look more like the 3-1 fixing operation.
10480 auto moveInputsToRightHalf = [&PSHUFDMask](
10481 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10482 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10483 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10485 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10486 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10488 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10490 int LowWord = Word & ~1;
10491 int HighWord = Word | 1;
10492 return isWordClobbered(SourceHalfMask, LowWord) ||
10493 isWordClobbered(SourceHalfMask, HighWord);
10496 if (IncomingInputs.empty())
10499 if (ExistingInputs.empty()) {
10500 // Map any dwords with inputs from them into the right half.
10501 for (int Input : IncomingInputs) {
10502 // If the source half mask maps over the inputs, turn those into
10503 // swaps and use the swapped lane.
10504 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10505 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10506 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10507 Input - SourceOffset;
10508 // We have to swap the uses in our half mask in one sweep.
10509 for (int &M : HalfMask)
10510 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10512 else if (M == Input)
10513 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10515 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10516 Input - SourceOffset &&
10517 "Previous placement doesn't match!");
10519 // Note that this correctly re-maps both when we do a swap and when
10520 // we observe the other side of the swap above. We rely on that to
10521 // avoid swapping the members of the input list directly.
10522 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10525 // Map the input's dword into the correct half.
10526 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10527 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10529 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10531 "Previous placement doesn't match!");
10534 // And just directly shift any other-half mask elements to be same-half
10535 // as we will have mirrored the dword containing the element into the
10536 // same position within that half.
10537 for (int &M : HalfMask)
10538 if (M >= SourceOffset && M < SourceOffset + 4) {
10539 M = M - SourceOffset + DestOffset;
10540 assert(M >= 0 && "This should never wrap below zero!");
10545 // Ensure we have the input in a viable dword of its current half. This
10546 // is particularly tricky because the original position may be clobbered
10547 // by inputs being moved and *staying* in that half.
10548 if (IncomingInputs.size() == 1) {
10549 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10550 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
10552 SourceHalfMask[InputFixed - SourceOffset] =
10553 IncomingInputs[0] - SourceOffset;
10554 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
10556 IncomingInputs[0] = InputFixed;
10558 } else if (IncomingInputs.size() == 2) {
10559 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
10560 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10561 // We have two non-adjacent or clobbered inputs we need to extract from
10562 // the source half. To do this, we need to map them into some adjacent
10563 // dword slot in the source mask.
10564 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
10565 IncomingInputs[1] - SourceOffset};
10567 // If there is a free slot in the source half mask adjacent to one of
10568 // the inputs, place the other input in it. We use (Index XOR 1) to
10569 // compute an adjacent index.
10570 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
10571 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
10572 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
10573 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10574 InputsFixed[1] = InputsFixed[0] ^ 1;
10575 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
10576 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
10577 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
10578 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
10579 InputsFixed[0] = InputsFixed[1] ^ 1;
10580 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
10581 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
10582 // The two inputs are in the same DWord but it is clobbered and the
10583 // adjacent DWord isn't used at all. Move both inputs to the free
10585 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
10586 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
10587 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
10588 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
10590 // The only way we hit this point is if there is no clobbering
10591 // (because there are no off-half inputs to this half) and there is no
10592 // free slot adjacent to one of the inputs. In this case, we have to
10593 // swap an input with a non-input.
10594 for (int i = 0; i < 4; ++i)
10595 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
10596 "We can't handle any clobbers here!");
10597 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
10598 "Cannot have adjacent inputs here!");
10600 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10601 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
10603 // We also have to update the final source mask in this case because
10604 // it may need to undo the above swap.
10605 for (int &M : FinalSourceHalfMask)
10606 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
10607 M = InputsFixed[1] + SourceOffset;
10608 else if (M == InputsFixed[1] + SourceOffset)
10609 M = (InputsFixed[0] ^ 1) + SourceOffset;
10611 InputsFixed[1] = InputsFixed[0] ^ 1;
10614 // Point everything at the fixed inputs.
10615 for (int &M : HalfMask)
10616 if (M == IncomingInputs[0])
10617 M = InputsFixed[0] + SourceOffset;
10618 else if (M == IncomingInputs[1])
10619 M = InputsFixed[1] + SourceOffset;
10621 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
10622 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
10625 llvm_unreachable("Unhandled input size!");
10628 // Now hoist the DWord down to the right half.
10629 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
10630 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
10631 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
10632 for (int &M : HalfMask)
10633 for (int Input : IncomingInputs)
10635 M = FreeDWord * 2 + Input % 2;
10637 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
10638 /*SourceOffset*/ 4, /*DestOffset*/ 0);
10639 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
10640 /*SourceOffset*/ 0, /*DestOffset*/ 4);
10642 // Now enact all the shuffles we've computed to move the inputs into their
10644 if (!isNoopShuffleMask(PSHUFLMask))
10645 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10646 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
10647 if (!isNoopShuffleMask(PSHUFHMask))
10648 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10649 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
10650 if (!isNoopShuffleMask(PSHUFDMask))
10651 V = DAG.getBitcast(
10653 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10654 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10656 // At this point, each half should contain all its inputs, and we can then
10657 // just shuffle them into their final position.
10658 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
10659 "Failed to lift all the high half inputs to the low mask!");
10660 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
10661 "Failed to lift all the low half inputs to the high mask!");
10663 // Do a half shuffle for the low mask.
10664 if (!isNoopShuffleMask(LoMask))
10665 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10666 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
10668 // Do a half shuffle with the high mask after shifting its values down.
10669 for (int &M : HiMask)
10672 if (!isNoopShuffleMask(HiMask))
10673 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10674 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
10679 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
10680 /// blend if only one input is used.
///
/// Every entry of \p Mask is expanded into `16 / Mask.size()` byte selectors,
/// the inputs are shuffled as v16i8 bitcasts, and the result is bitcast back
/// to \p VT before it is returned.
///
/// \param Mask      Shuffle mask. Negative entries are undef, entries below
///                  `Mask.size()` index \p V1, larger entries index \p V2.
/// \param Zeroable  Indexed per mask element; a set bit forces the 0x80
///                  selector for both inputs.
/// \param V1InUse   Out-flag, OR-ed with true for every V1 byte selector that
///                  differs from the 0x80 selector.
/// \param V2InUse   Same as \p V1InUse but for \p V2.
///                  NOTE(review): where the two flags are reset before the
///                  loop is not visible in this copy of the file - confirm
///                  against the callers, one of which passes them
///                  uninitialised.
10681 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
10682 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10683 const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
// One byte selector per result byte, kept separately for each input.
10685 SDValue V1Mask[16];
10686 SDValue V2Mask[16];
10690 int Size = Mask.size();
// Number of bytes covered by a single element of Mask.
10691 int Scale = 16 / Size;
10692 for (int i = 0; i < 16; ++i) {
// Undef mask element: both byte selectors stay undef.
10693 if (Mask[i / Scale] < 0) {
10694 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
// A PSHUFB selector with bit 7 set produces a zero byte instead of copying
// one from the source.
10696 const int ZeroMask = 0x80;
// NOTE(review): the ZeroMask arms of the two conditionals below are not
// visible in this listing. The OR-based blend further down is only correct
// if the input that does not supply this byte receives ZeroMask - confirm.
10697 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
10699 int V2Idx = Mask[i / Scale] < Size
10701 : (Mask[i / Scale] - Size) * Scale + i % Scale;
// A zeroable element is zeroed in both shuffles regardless of its source.
10702 if (Zeroable[i / Scale])
10703 V1Idx = V2Idx = ZeroMask;
10704 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
10705 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
// An input counts as used as soon as one byte is really read from it.
10706 V1InUse |= (ZeroMask != V1Idx);
10707 V2InUse |= (ZeroMask != V2Idx);
// Shuffle each input with its own selector vector.
// NOTE(review): presumably each PSHUFB is guarded by the matching *InUse
// flag; the guards are not visible in this listing.
10712 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10713 DAG.getBitcast(MVT::v16i8, V1),
10714 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
10716 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10717 DAG.getBitcast(MVT::v16i8, V2),
10718 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
10720 // If we need shuffled inputs from both, blend the two.
10722 if (V1InUse && V2InUse)
10723 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
// Otherwise the single input that was actually used is the result.
10725 V = V1InUse ? V1 : V2;
10727 // Cast the result back to the correct type.
10728 return DAG.getBitcast(VT, V);
10731 /// \brief Generic lowering of 8-lane i16 shuffles.
10733 /// This handles both single-input shuffles and combined shuffle/blends with
10734 /// two inputs. The single input shuffles are immediately delegated to
10735 /// a dedicated lowering routine.
10737 /// The blends are lowered in one of three fundamental ways. If there are few
10738 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
10739 /// of the input is significantly cheaper when lowered as an interleaving of
10740 /// the two inputs, try to interleave them. Otherwise, blend the low and high
10741 /// halves of the inputs separately (making them have relatively few inputs)
10742 /// and then concatenate them.
///
/// \param Mask      8-entry shuffle mask (asserted). Entries >= 8 are counted
///                  as V2 inputs.
/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
/// \param V1, V2    Shuffle operands; both must be v8i16 (asserted).
/// \param Subtarget Queried for SSE4A, SSE4.1 and SSSE3 to pick a strategy.
///
/// The strategies are tried in the order written; the order encodes the
/// preference between them.
/// NOTE(review): the `return` that follows each successful attempt is not
/// visible in this copy of the file.
10743 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10744 const SmallBitVector &Zeroable,
10745 SDValue V1, SDValue V2,
10746 const X86Subtarget &Subtarget,
10747 SelectionDAG &DAG) {
10748 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10749 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10750 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10752 // Whenever we can lower this as a zext, that instruction is strictly faster
10753 // than any alternative.
10754 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10755 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
// Mask entries in [8, 16) refer to V2.
10758 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
// Single-input path: nothing is read from V2.
10760 if (NumV2Inputs == 0) {
10761 // Check for being able to broadcast a single element.
10762 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10763 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10766 // Try to use shift instructions.
// V1 is passed for both operands because V2 contributes no element here.
10767 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
10768 Zeroable, Subtarget, DAG))
10771 // Use dedicated unpack instructions for masks that match their pattern.
10773 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10776 // Try to use byte rotation instructions.
10777 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
10778 Mask, Subtarget, DAG))
10781 // Make a copy of the mask so it can be modified.
10782 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
10783 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
10784 MutableMask, Subtarget,
// From here on the shuffle reads V2; it must still read at least one element
// of V1, otherwise it should have been canonicalized before reaching here.
10788 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
10789 "All single-input shuffles should be canonicalized to be V1-input "
10792 // Try to use shift instructions.
10793 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
10794 Zeroable, Subtarget, DAG))
10797 // See if we can use SSE4A Extraction / Insertion.
10798 if (Subtarget.hasSSE4A())
10799 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
10803 // There are special ways we can lower some single-element blends.
10804 if (NumV2Inputs == 1)
10805 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10806 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10809 // We have different paths for blend lowering, but they all must use the
10810 // *exact* same predicate.
10811 bool IsBlendSupported = Subtarget.hasSSE41();
10812 if (IsBlendSupported)
10813 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
10814 Zeroable, Subtarget, DAG))
10817 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
10821 // Use dedicated unpack instructions for masks that match their pattern.
10823 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10826 // Try to use byte rotation instructions.
10827 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10828 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10831 if (SDValue BitBlend =
10832 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10835 // Try to lower by permuting the inputs into an unpack instruction.
10836 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10840 // If we can't directly blend but can use PSHUFB, that will be better as it
10841 // can both shuffle and set up the inefficient blend.
10842 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
// The two flags only satisfy the helper's out-parameters; their values are
// not read afterwards.
10843 bool V1InUse, V2InUse;
10844 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
10845 Zeroable, DAG, V1InUse, V2InUse);
10848 // We can always bit-blend if we have to so the fallback strategy is to
10849 // decompose into single-input permutes and blends.
10850 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
10854 /// \brief Check whether a compaction lowering can be done by dropping even
10855 /// elements and compute how many times even elements must be dropped.
10857 /// This handles shuffles which take every (2^N)th element, i.e. shuffles whose
10858 /// stride is a power of two. Example shuffle masks (N is the exponent):
10860 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
10861 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
10862 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
10863 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
10864 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
10865 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
10867 /// Any of these lanes can of course be undef.
10869 /// This routine only supports N <= 3.
10870 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
10873 /// \returns N above, or the number of times even elements must be dropped if
10874 /// there is such a number. Otherwise returns zero.
///
/// \param Mask           Shuffle mask to test. Its size, doubled for the
///                       two-input case, must be a power of two (asserted).
/// \param IsSingleInput  When true, mask entries are compared modulo
///                       `Mask.size()`; otherwise modulo `2 * Mask.size()`.
10875 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
10876 bool IsSingleInput) {
10877 // The modulus for the shuffle vector entries is based on whether this is
10878 // a single input or not.
10879 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
10880 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
10881 "We should only be called with masks with a power-of-2 size!");
// Because ShuffleModulus is a power of two, "x & ModMask" below is the same
// as "x % ShuffleModulus".
10883 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
10885 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
10886 // and 2^3 simultaneously. This is because we may have ambiguity with
10887 // partially undef inputs.
// ViableForN[j] corresponds to the exponent N = j + 1.
10888 bool ViableForN[3] = {true, true, true};
10890 for (int i = 0, e = Mask.size(); i < e; ++i) {
10891 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
// NOTE(review): the undef check this comment refers to is not visible in
// this copy of the file.
// Set as soon as one still-viable exponent matches lane i.
10896 bool IsAnyViable = false;
10897 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10898 if (ViableForN[j]) {
10899 uint64_t N = j + 1;
10901 // The shuffle mask must be equal to (i * 2^N) % M.
10902 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
10903 IsAnyViable = true;
// A mismatch rules this exponent out for good.
// NOTE(review): the `else` that should precede the next statement is not
// visible in this listing.
10905 ViableForN[j] = false;
10907 // Early exit if we exhaust the possible powers of two.
// Scan the exponents in increasing order.
// NOTE(review): the loop body is not visible here; per the \returns text it
// presumably returns j + 1 for the first viable entry.
10912 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
10916 // Return 0 as there is no viable power of two.
10920 /// \brief Generic lowering of v16i8 shuffles.
10922 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
10923 /// detect any complexity reducing interleaving. If that doesn't help, it uses
10924 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
10925 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
///
/// \param Mask      16-entry shuffle mask (asserted). Entries >= 16 are
///                  counted as V2 elements.
/// \param Zeroable  Forwarded unchanged to the helpers that accept it.
/// \param V1, V2    Shuffle operands; both must be v16i8 (asserted).
/// \param Subtarget Queried for SSE4A, SSSE3 and SSE4.1 to pick a strategy.
///
/// NOTE(review): the `return` that follows each successful attempt, and a few
/// other short statements, are not visible in this copy of the file.
10927 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10928 const SmallBitVector &Zeroable,
10929 SDValue V1, SDValue V2,
10930 const X86Subtarget &Subtarget,
10931 SelectionDAG &DAG) {
10932 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10933 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
10934 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
10936 // Try to use shift instructions.
10937 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
10938 Zeroable, Subtarget, DAG))
10941 // Try to use byte rotation instructions.
10942 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10943 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10946 // Try to use a zext lowering.
10947 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10948 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
10951 // See if we can use SSE4A Extraction / Insertion.
10952 if (Subtarget.hasSSE4A())
10953 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
// Mask entries in [16, 32) refer to V2.
10957 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
10959 // For single-input shuffles, there are some nicer lowering tricks we can use.
10960 if (NumV2Elements == 0) {
10961 // Check for being able to broadcast a single element.
10962 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10963 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
10966 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
10967 // Notably, this handles splat and partial-splat shuffles more efficiently.
10968 // However, it only makes sense if the pre-duplication shuffle simplifies
10969 // things significantly. Currently, this means we need to be able to
10970 // express the pre-duplication shuffle as an i16 shuffle.
10972 // FIXME: We should check for other patterns which can be widened into an
10973 // i16 shuffle as well.
// Inspects each pair of adjacent result bytes (2k, 2k+1) and looks for pairs
// where both bytes are defined but come from different source bytes.
// NOTE(review): the return statements of this lambda are not visible here;
// presumably such a pair makes it return false.
10974 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
10975 for (int i = 0; i < 16; i += 2)
10976 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
10981 auto tryToWidenViaDuplication = [&]() -> SDValue {
10982 if (!canWidenViaDuplication(Mask))
// Sorted, de-duplicated source bytes taken from the low (0-7) and the high
// (8-15) half of V1.
10984 SmallVector<int, 4> LoInputs;
10985 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
10986 [](int M) { return M >= 0 && M < 8; });
10987 std::sort(LoInputs.begin(), LoInputs.end());
10988 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
10990 SmallVector<int, 4> HiInputs;
10991 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
10992 [](int M) { return M >= 8; });
10993 std::sort(HiInputs.begin(), HiInputs.end());
10994 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
// Gather everything in the half that already holds more of the inputs, so
// that fewer inputs have to be moved. Ties go to the low half.
10997 bool TargetLo = LoInputs.size() >= HiInputs.size();
10998 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
10999 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
// v8i16 shuffle that collects all needed words in the target half. LaneMap
// records, per source byte, the byte position it occupies afterwards.
11001 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11002 SmallDenseMap<int, int, 8> LaneMap;
// Inputs already in the target half keep their word.
11003 for (int I : InPlaceInputs) {
11004 PreDupI16Shuffle[I/2] = I/2;
// [j, je) is the range of word slots that make up the target half.
11007 int j = TargetLo ? 0 : 4, je = j + 4;
11008 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11009 // Check if j is already a shuffle of this input. This happens when
11010 // there are two adjacent bytes after we move the low one.
11011 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11012 // If we haven't yet mapped the input, search for a slot into which
11014 while (j < je && PreDupI16Shuffle[j] >= 0)
11018 // We can't place the inputs into a single half with a simple i16
// shuffle, so bail.
11021 // Map this input with the i16 shuffle.
11022 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11025 // Update the lane map based on the mapping we ended up with.
11026 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
// Apply the gathering shuffle on the v8i16 view of V1.
11028 V1 = DAG.getBitcast(
11030 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11031 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11033 // Unpack the bytes to form the i16s that will be shuffled into place.
// Unpacking V1 with itself duplicates every byte of the chosen half.
11034 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11035 MVT::v16i8, V1, V1);
// After the duplication, a byte that sat at position p of the target half is
// the word at index (p - half offset); build the final word shuffle from it.
11037 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11038 for (int i = 0; i < 16; ++i)
11039 if (Mask[i] >= 0) {
11040 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11041 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11042 if (PostDupI16Shuffle[i / 2] < 0)
11043 PostDupI16Shuffle[i / 2] = MappedMask;
// Both bytes of a result word must agree on the duplicated word they read.
11045 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11046 "Conflicting entrties in the original shuffle!");
11048 return DAG.getBitcast(
11050 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11051 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11053 if (SDValue V = tryToWidenViaDuplication())
11057 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11061 // Use dedicated unpack instructions for masks that match their pattern.
11063 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11066 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11067 // with PSHUFB. It is important to do this before we attempt to generate any
11068 // blends but after all of the single-input lowerings. If the single input
11069 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11070 // want to preserve that and we can DAG combine any longer sequences into
11071 // a PSHUFB in the end. But once we start blending from multiple inputs,
11072 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11073 // and there are *very* few patterns that would actually be faster than the
11074 // PSHUFB approach because of its ability to zero lanes.
11076 // FIXME: The only exceptions to the above are blends which are exact
11077 // interleavings with direct instructions supporting them. We currently don't
11078 // handle those well here.
11079 if (Subtarget.hasSSSE3()) {
11080 bool V1InUse = false;
11081 bool V2InUse = false;
// The PSHUFB form is built first because building it also reports, through
// the two flags, which inputs contribute non-zeroed bytes.
11083 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11084 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11086 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11087 // do so. This avoids using them to handle blends-with-zero which is
11088 // important as a single pshufb is significantly faster for that.
11089 if (V1InUse && V2InUse) {
11090 if (Subtarget.hasSSE41())
11091 if (SDValue Blend = lowerVectorShuffleAsBlend(
11092 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11095 // We can use an unpack to do the blending rather than an or in some
11096 // cases. Even though the or may be (very minorly) more efficient, we
11097 // preference this lowering because there are common cases where part of
11098 // the complexity of the shuffles goes away when we do the final blend as
11100 // FIXME: It might be worth trying to detect if the unpack-feeding
11101 // shuffles will both be pshufb, in which case we shouldn't bother with
11103 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11104 DL, MVT::v16i8, V1, V2, Mask, DAG))
11111 // There are special ways we can lower some single-element blends.
11112 if (NumV2Elements == 1)
11113 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11114 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11117 if (SDValue BitBlend =
11118 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11121 // Check whether a compaction lowering can be done. This handles shuffles
11122 // which take every Nth element for some even N. See the helper function for
11125 // We special case these as they can be particularly efficiently handled with
11126 // the PACKUSB instruction on x86 and they show up in common patterns of
11127 // rearranging bytes to truncate wide elements.
11128 bool IsSingleInput = V2.isUndef();
11129 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11130 // NumEvenDrops is the power of two stride of the elements. Another way of
11131 // thinking about it is that we need to drop the even elements this many
11132 // times to get the original input.
11134 // First we need to zero all the dropped bytes.
11135 assert(NumEvenDrops <= 3 &&
11136 "No support for dropping even elements more than 3 times.");
11137 // We use the mask type to pick which bytes are preserved based on how many
11138 // elements are dropped.
// Splatting 0xFF into 16-, 32- or 64-bit elements and viewing the result as
// bytes yields a mask that keeps only the lowest byte of each such element.
11139 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11140 SDValue ByteClearMask = DAG.getBitcast(
11141 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
11142 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11143 if (!IsSingleInput)
11144 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11146 // Now pack things back together.
// One pack per drop: the first combines V1 and V2, every further one packs
// the intermediate result with itself.
11147 V1 = DAG.getBitcast(MVT::v8i16, V1);
11148 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11149 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11150 for (int i = 1; i < NumEvenDrops; ++i) {
11151 Result = DAG.getBitcast(MVT::v8i16, Result);
11152 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11158 // Handle multi-input cases by blending single-input shuffles.
11159 if (NumV2Elements > 0)
11160 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11163 // The fallback path for single-input shuffles widens this into two v8i16
11164 // vectors with unpacks, shuffles those, and then pulls them back together
// Result bytes 0-7 are described by LoBlendMask, bytes 8-15 by HiBlendMask.
11168 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11169 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11170 for (int i = 0; i < 16; ++i)
11172 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11174 SDValue VLoHalf, VHiHalf;
11175 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11176 // them out and avoid using UNPCK{L,H} to extract the elements of V as
// NOTE(review): the declaration of V used below is not visible in this copy
// of the file; presumably it is the single input V1.
11178 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11179 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11180 // Use a mask to drop the high bytes.
11181 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11182 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11183 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11185 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11186 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11188 // Squash the masks to point directly into VLoHalf.
11189 for (int &M : LoBlendMask)
11192 for (int &M : HiBlendMask)
11196 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11197 // VHiHalf so that we can blend them as i16s.
// Interleaving with a zero vector turns every byte into a 16-bit word whose
// high byte is zero.
11198 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11200 VLoHalf = DAG.getBitcast(
11201 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11202 VHiHalf = DAG.getBitcast(
11203 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
// Shuffle the widened words for each result half, then pack the two v8i16
// results back into a single v16i8.
11206 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11207 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11209 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11212 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11214 /// This routine breaks down the specific type of 128-bit shuffle and
11215 /// dispatches to the lowering routines accordingly.
///
/// \param VT  Vector type of the shuffle; `VT.SimpleTy` selects the routine.
///
/// All remaining arguments are forwarded unchanged to the selected routine,
/// and its result is returned as is. Types without a dedicated routine end
/// in llvm_unreachable.
/// NOTE(review): the `case` labels are not visible in this copy of the file;
/// presumably each return sits under the MVT named in its callee (v2i64,
/// v2f64, v4i32, v4f32, v8i16, v16i8) - confirm.
11216 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11217 MVT VT, SDValue V1, SDValue V2,
11218 const SmallBitVector &Zeroable,
11219 const X86Subtarget &Subtarget,
11220 SelectionDAG &DAG) {
11221 switch (VT.SimpleTy) {
11223 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11225 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11227 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11229 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11231 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11233 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11236 llvm_unreachable("Unimplemented!");
11240 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11242 /// This routine just extracts two subvectors, shuffles them independently, and
11243 /// then concatenates them back together. This should work effectively with all
11244 /// AVX vector shuffle types.
11245 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11246 SDValue V2, ArrayRef<int> Mask,
11247 SelectionDAG &DAG) {
11248 assert(VT.getSizeInBits() >= 256 &&
11249 "Only for 256-bit or wider vector shuffles!");
11250 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11251 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11253 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11254 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11256 int NumElements = VT.getVectorNumElements();
11257 int SplitNumElements = NumElements / 2;
11258 MVT ScalarVT = VT.getVectorElementType();
11259 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11261 // Rather than splitting build-vectors, just build two narrower build
11262 // vectors. This helps shuffling with splats and zeros.
11263 auto SplitVector = [&](SDValue V) {
11264 V = peekThroughBitcasts(V);
11266 MVT OrigVT = V.getSimpleValueType();
11267 int OrigNumElements = OrigVT.getVectorNumElements();
11268 int OrigSplitNumElements = OrigNumElements / 2;
11269 MVT OrigScalarVT = OrigVT.getVectorElementType();
11270 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11274 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11276 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11277 DAG.getIntPtrConstant(0, DL));
11278 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11279 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11282 SmallVector<SDValue, 16> LoOps, HiOps;
11283 for (int i = 0; i < OrigSplitNumElements; ++i) {
11284 LoOps.push_back(BV->getOperand(i));
11285 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11287 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11288 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11290 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11291 DAG.getBitcast(SplitVT, HiV));
11294 SDValue LoV1, HiV1, LoV2, HiV2;
11295 std::tie(LoV1, HiV1) = SplitVector(V1);
11296 std::tie(LoV2, HiV2) = SplitVector(V2);
11298 // Now create two 4-way blends of these half-width vectors.
11299 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11300 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11301 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11302 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11303 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11304 for (int i = 0; i < SplitNumElements; ++i) {
11305 int M = HalfMask[i];
11306 if (M >= NumElements) {
11307 if (M >= NumElements + SplitNumElements)
11311 V2BlendMask[i] = M - NumElements;
11312 BlendMask[i] = SplitNumElements + i;
11313 } else if (M >= 0) {
11314 if (M >= SplitNumElements)
11318 V1BlendMask[i] = M;
11323 // Because the lowering happens after all combining takes place, we need to
11324 // manually combine these blend masks as much as possible so that we create
11325 // a minimal number of high-level vector shuffle nodes.
11327 // First try just blending the halves of V1 or V2.
11328 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11329 return DAG.getUNDEF(SplitVT);
11330 if (!UseLoV2 && !UseHiV2)
11331 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11332 if (!UseLoV1 && !UseHiV1)
11333 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11335 SDValue V1Blend, V2Blend;
11336 if (UseLoV1 && UseHiV1) {
11338 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11340 // We only use half of V1 so map the usage down into the final blend mask.
11341 V1Blend = UseLoV1 ? LoV1 : HiV1;
11342 for (int i = 0; i < SplitNumElements; ++i)
11343 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11344 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11346 if (UseLoV2 && UseHiV2) {
11348 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11350 // We only use half of V2 so map the usage down into the final blend mask.
11351 V2Blend = UseLoV2 ? LoV2 : HiV2;
11352 for (int i = 0; i < SplitNumElements; ++i)
11353 if (BlendMask[i] >= SplitNumElements)
11354 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11356 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11358 SDValue Lo = HalfBlend(LoMask);
11359 SDValue Hi = HalfBlend(HiMask);
11360 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11363 /// \brief Either split a vector in halves or decompose the shuffles and the
11366 /// This is provided as a good fallback for many lowerings of non-single-input
11367 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11368 /// between splitting the shuffle into 128-bit components and stitching those
11369 /// back together vs. extracting the single-input shuffles and blending those
11371 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11372 SDValue V1, SDValue V2,
11373 ArrayRef<int> Mask,
11374 SelectionDAG &DAG) {
11375 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11376 "shuffles as it could then recurse on itself.");
11377 int Size = Mask.size();
11379 // If this can be modeled as a broadcast of two elements followed by a blend,
11380 // prefer that lowering. This is especially important because broadcasts can
11381 // often fold with memory operands.
11382 auto DoBothBroadcast = [&] {
11383 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11386 if (V2BroadcastIdx < 0)
11387 V2BroadcastIdx = M - Size;
11388 else if (M - Size != V2BroadcastIdx)
11390 } else if (M >= 0) {
11391 if (V1BroadcastIdx < 0)
11392 V1BroadcastIdx = M;
11393 else if (M != V1BroadcastIdx)
11398 if (DoBothBroadcast())
11399 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11402 // If the inputs all stem from a single 128-bit lane of each input, then we
11403 // split them rather than blending because the split will decompose to
11404 // unusually few instructions.
11405 int LaneCount = VT.getSizeInBits() / 128;
11406 int LaneSize = Size / LaneCount;
11407 SmallBitVector LaneInputs[2];
11408 LaneInputs[0].resize(LaneCount, false);
11409 LaneInputs[1].resize(LaneCount, false);
11410 for (int i = 0; i < Size; ++i)
11412 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11413 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11414 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11416 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11417 // that the decomposed single-input shuffles don't end up here.
11418 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11421 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11422 /// a permutation and blend of those lanes.
11424 /// This essentially blends the out-of-lane inputs to each lane into the lane
11425 /// from a permuted copy of the vector. This lowering strategy results in four
11426 /// instructions in the worst case for a single-input cross lane shuffle which
11427 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11428 /// of. Special cases for each particular shuffle pattern should be handled
11429 /// prior to trying this lowering.
11430 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11431 SDValue V1, SDValue V2,
11432 ArrayRef<int> Mask,
11433 SelectionDAG &DAG) {
11434 // FIXME: This should probably be generalized for 512-bit vectors as well.
11435 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11436 int Size = Mask.size();
11437 int LaneSize = Size / 2;
11439 // If there are only inputs from one 128-bit lane, splitting will in fact be
11440 // less expensive. The flags track whether the given lane contains an element
11441 // that crosses to another lane.
11442 bool LaneCrossing[2] = {false, false};
11443 for (int i = 0; i < Size; ++i)
11444 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11445 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11446 if (!LaneCrossing[0] || !LaneCrossing[1])
11447 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11449 assert(V2.isUndef() &&
11450 "This last part of this routine only works on single input shuffles");
11452 SmallVector<int, 32> FlippedBlendMask(Size);
11453 for (int i = 0; i < Size; ++i)
11454 FlippedBlendMask[i] =
11455 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11457 : Mask[i] % LaneSize +
11458 (i / LaneSize) * LaneSize + Size);
11460 // Flip the vector, and blend the results which should now be in-lane. The
11461 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11462 // 5 for the high source. The value 3 selects the high half of source 2 and
11463 // the value 2 selects the low half of source 2. We only use source 2 to
11464 // allow folding it into a memory operand.
11465 unsigned PERMMask = 3 | 2 << 4;
11466 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
11467 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
11468 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
11471 /// \brief Handle lowering 2-lane 128-bit shuffles.
11472 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11473 SDValue V2, ArrayRef<int> Mask,
11474 const SmallBitVector &Zeroable,
11475 const X86Subtarget &Subtarget,
11476 SelectionDAG &DAG) {
11477 SmallVector<int, 4> WidenedMask;
11478 if (!canWidenShuffleElements(Mask, WidenedMask))
11481 // TODO: If minimizing size and one of the inputs is a zero vector and the
11482 // zero vector has only one use, we could use a VPERM2X128 to save the
11483 // instruction bytes needed to explicitly generate the zero vector.
11485 // Blends are faster and handle all the non-lane-crossing cases.
11486 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
11487 Zeroable, Subtarget, DAG))
11490 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
11491 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
11493 // If either input operand is a zero vector, use VPERM2X128 because its mask
11494 // allows us to replace the zero input with an implicit zero.
11495 if (!IsV1Zero && !IsV2Zero) {
11496 // Check for patterns which can be matched with a single insert of a 128-bit
11498 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
11499 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
11500 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
11501 if (Subtarget.hasAVX2() && V2.isUndef())
11504 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
11505 VT.getVectorNumElements() / 2);
11506 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
11507 DAG.getIntPtrConstant(0, DL));
11508 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
11509 OnlyUsesV1 ? V1 : V2,
11510 DAG.getIntPtrConstant(0, DL));
11511 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
11515 // Otherwise form a 128-bit permutation. After accounting for undefs,
11516 // convert the 64-bit shuffle mask selection values into 128-bit
11517 // selection bits by dividing the indexes by 2 and shifting into positions
11518 // defined by a vperm2*128 instruction's immediate control byte.
11520 // The immediate permute control byte looks like this:
11521 // [1:0] - select 128 bits from sources for low half of destination
11523 // [3] - zero low half of destination
11524 // [5:4] - select 128 bits from sources for high half of destination
11526 // [7] - zero high half of destination
11528 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
11529 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
11531 unsigned PermMask = MaskLO | (MaskHI << 4);
11533 // If either input is a zero vector, replace it with an undef input.
11534 // Shuffle mask values < 4 are selecting elements of V1.
11535 // Shuffle mask values >= 4 are selecting elements of V2.
11536 // Adjust each half of the permute mask by clearing the half that was
11537 // selecting the zero vector and setting the zero mask bit.
11539 V1 = DAG.getUNDEF(VT);
11541 PermMask = (PermMask & 0xf0) | 0x08;
11543 PermMask = (PermMask & 0x0f) | 0x80;
11546 V2 = DAG.getUNDEF(VT);
11548 PermMask = (PermMask & 0xf0) | 0x08;
11550 PermMask = (PermMask & 0x0f) | 0x80;
11553 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
11554 DAG.getConstant(PermMask, DL, MVT::i8));
11557 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
11558 /// shuffling each lane.
11560 /// This will only succeed when the result of fixing the 128-bit lanes results
11561 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
11562 /// each 128-bit lane. This handles many cases where we can quickly blend away
11563 /// the lane crosses early and then use simpler shuffles within each lane.
11565 /// FIXME: It might be worthwhile at some point to support this without
11566 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
11567 /// in x86 only floating point has interesting non-repeating shuffles, and even
11568 /// those are still *marginally* more expensive.
11569 static SDValue lowerVectorShuffleByMerging128BitLanes(
11570 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11571 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11572 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
11574 int Size = Mask.size();
11575 int LaneSize = 128 / VT.getScalarSizeInBits();
11576 int NumLanes = Size / LaneSize;
11577 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
11579 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
11580 // check whether the in-128-bit lane shuffles share a repeating pattern.
11581 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
11582 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
11583 for (int i = 0; i < Size; ++i) {
11587 int j = i / LaneSize;
11589 if (Lanes[j] < 0) {
11590 // First entry we've seen for this lane.
11591 Lanes[j] = Mask[i] / LaneSize;
11592 } else if (Lanes[j] != Mask[i] / LaneSize) {
11593 // This doesn't match the lane selected previously!
11597 // Check that within each lane we have a consistent shuffle mask.
11598 int k = i % LaneSize;
11599 if (InLaneMask[k] < 0) {
11600 InLaneMask[k] = Mask[i] % LaneSize;
11601 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
11602 // This doesn't fit a repeating in-lane mask.
11607 // First shuffle the lanes into place.
11608 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
11609 VT.getSizeInBits() / 64);
11610 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
11611 for (int i = 0; i < NumLanes; ++i)
11612 if (Lanes[i] >= 0) {
11613 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
11614 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
11617 V1 = DAG.getBitcast(LaneVT, V1);
11618 V2 = DAG.getBitcast(LaneVT, V2);
11619 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
11621 // Cast it back to the type we actually want.
11622 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
11624 // Now do a simple shuffle that isn't lane crossing.
11625 SmallVector<int, 8> NewMask((unsigned)Size, -1);
11626 for (int i = 0; i < Size; ++i)
11628 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
11629 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
11630 "Must not introduce lane crosses at this point!");
11632 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
11635 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
11636 /// This allows for fast cases such as subvector extraction/insertion
11637 /// or shuffling smaller vector types which can lower more efficiently.
11638 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
11639 SDValue V1, SDValue V2,
11640 ArrayRef<int> Mask,
11641 const X86Subtarget &Subtarget,
11642 SelectionDAG &DAG) {
11643 assert(VT.is256BitVector() && "Expected 256-bit vector");
11645 unsigned NumElts = VT.getVectorNumElements();
11646 unsigned HalfNumElts = NumElts / 2;
11647 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
11649 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
11650 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
11651 if (!UndefLower && !UndefUpper)
11654 // Upper half is undef and lower half is whole upper subvector.
11655 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11657 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
11658 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11659 DAG.getIntPtrConstant(HalfNumElts, DL));
11660 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11661 DAG.getIntPtrConstant(0, DL));
11664 // Lower half is undef and upper half is whole lower subvector.
11665 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11667 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
11668 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11669 DAG.getIntPtrConstant(0, DL));
11670 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11671 DAG.getIntPtrConstant(HalfNumElts, DL));
11674 // If the shuffle only uses two of the four halves of the input operands,
11675 // then extract them and perform the 'half' shuffle at half width.
11676 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
11677 int HalfIdx1 = -1, HalfIdx2 = -1;
11678 SmallVector<int, 8> HalfMask(HalfNumElts);
11679 unsigned Offset = UndefLower ? HalfNumElts : 0;
11680 for (unsigned i = 0; i != HalfNumElts; ++i) {
11681 int M = Mask[i + Offset];
11687 // Determine which of the 4 half vectors this element is from.
11688 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
11689 int HalfIdx = M / HalfNumElts;
11691 // Determine the element index into its half vector source.
11692 int HalfElt = M % HalfNumElts;
11694 // We can shuffle with up to 2 half vectors, set the new 'half'
11695 // shuffle mask accordingly.
11696 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
11697 HalfMask[i] = HalfElt;
11698 HalfIdx1 = HalfIdx;
11701 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
11702 HalfMask[i] = HalfElt + HalfNumElts;
11703 HalfIdx2 = HalfIdx;
11707 // Too many half vectors referenced.
11710 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
11712 // Only shuffle the halves of the inputs when useful.
11713 int NumLowerHalves =
11714 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
11715 int NumUpperHalves =
11716 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
11718 // uuuuXXXX - don't extract uppers just to insert again.
11719 if (UndefLower && NumUpperHalves != 0)
11722 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
11723 if (UndefUpper && NumUpperHalves == 2)
11726 // AVX2 - XXXXuuuu - always extract lowers.
11727 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
11728 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
11729 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11731 // AVX2 supports variable 32-bit element cross-lane shuffles.
11732 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
11733 // XXXXuuuu - don't extract lowers and uppers.
11734 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
11739 auto GetHalfVector = [&](int HalfIdx) {
11741 return DAG.getUNDEF(HalfVT);
11742 SDValue V = (HalfIdx < 2 ? V1 : V2);
11743 HalfIdx = (HalfIdx % 2) * HalfNumElts;
11744 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
11745 DAG.getIntPtrConstant(HalfIdx, DL));
11748 SDValue Half1 = GetHalfVector(HalfIdx1);
11749 SDValue Half2 = GetHalfVector(HalfIdx2);
11750 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
11751 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
11752 DAG.getIntPtrConstant(Offset, DL));
11755 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
11758 /// This returns true if the elements from a particular input are already in the
11759 /// slot required by the given mask and require no permutation.
11760 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
11761 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
11762 int Size = Mask.size();
11763 for (int i = 0; i < Size; ++i)
11764 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
11770 /// Handle case where shuffle sources are coming from the same 128-bit lane and
11771 /// every lane can be represented as the same repeating mask - allowing us to
11772 /// shuffle the sources with the repeating shuffle and then permute the result
11773 /// to the destination lanes.
11774 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
11775 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11776 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11777 int NumElts = VT.getVectorNumElements();
11778 int NumLanes = VT.getSizeInBits() / 128;
11779 int NumLaneElts = NumElts / NumLanes;
11781 // On AVX2 we may be able to just shuffle the lowest elements and then
11782 // broadcast the result.
11783 if (Subtarget.hasAVX2()) {
11784 for (unsigned BroadcastSize : {16, 32, 64}) {
11785 if (BroadcastSize <= VT.getScalarSizeInBits())
11787 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11789 // Attempt to match a repeating pattern every NumBroadcastElts,
11790 // accounting for UNDEFs but only referencing the lowest 128-bit
11791 // lane of the inputs.
11792 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11793 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11794 for (int j = 0; j != NumBroadcastElts; ++j) {
11795 int M = Mask[i + j];
11798 int &R = RepeatMask[j];
11799 if (0 != ((M % NumElts) / NumLaneElts))
11801 if (0 <= R && R != M)
11808 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11809 if (!FindRepeatingBroadcastMask(RepeatMask))
11812 // Shuffle the (lowest) repeated elements in place for broadcast.
11813 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11815 // Shuffle the actual broadcast.
11816 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11817 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11818 for (int j = 0; j != NumBroadcastElts; ++j)
11819 BroadcastMask[i + j] = j;
11820 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11825 // Bail if the shuffle mask doesn't cross 128-bit lanes.
11826 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11829 // Bail if we already have a repeated lane shuffle mask.
11830 SmallVector<int, 8> RepeatedShuffleMask;
11831 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11834 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11835 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11836 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11837 int NumSubLanes = NumLanes * SubLaneScale;
11838 int NumSubLaneElts = NumLaneElts / SubLaneScale;
11840 // Check that all the sources are coming from the same lane and see if we can
11841 // form a repeating shuffle mask (local to each sub-lane). At the same time,
11842 // determine the source sub-lane for each destination sub-lane.
11843 int TopSrcSubLane = -1;
11844 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11845 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11846 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11847 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11849 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
11850 // Extract the sub-lane mask, check that it all comes from the same lane
11851 // and normalize the mask entries to come from the first lane.
11853 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
11854 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11855 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
11858 int Lane = (M % NumElts) / NumLaneElts;
11859 if ((0 <= SrcLane) && (SrcLane != Lane))
11862 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
11863 SubLaneMask[Elt] = LocalM;
11866 // Whole sub-lane is UNDEF.
11870 // Attempt to match against the candidate repeated sub-lane masks.
11871 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
11872 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
11873 for (int i = 0; i != NumSubLaneElts; ++i) {
11874 if (M1[i] < 0 || M2[i] < 0)
11876 if (M1[i] != M2[i])
11882 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
11883 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
11886 // Merge the sub-lane mask into the matching repeated sub-lane mask.
11887 for (int i = 0; i != NumSubLaneElts; ++i) {
11888 int M = SubLaneMask[i];
11891 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
11892 "Unexpected mask element");
11893 RepeatedSubLaneMask[i] = M;
11896 // Track the top most source sub-lane - by setting the remaining to UNDEF
11897 // we can greatly simplify shuffle matching.
11898 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
11899 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
11900 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
11904 // Bail if we failed to find a matching repeated sub-lane mask.
11905 if (Dst2SrcSubLanes[DstSubLane] < 0)
11908 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
11909 "Unexpected source lane");
11911 // Create a repeating shuffle mask for the entire vector.
11912 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
11913 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
11914 int Lane = SubLane / SubLaneScale;
11915 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
11916 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
11917 int M = RepeatedSubLaneMask[Elt];
11920 int Idx = (SubLane * NumSubLaneElts) + Elt;
11921 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
11924 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
11926 // Shuffle each source sub-lane to its destination.
11927 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
11928 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
11929 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
11930 if (SrcSubLane < 0)
11932 for (int j = 0; j != NumSubLaneElts; ++j)
11933 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
11936 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
11940 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
11941 unsigned &ShuffleImm,
11942 ArrayRef<int> Mask) {
11943 int NumElts = VT.getVectorNumElements();
11944 assert(VT.getScalarType() == MVT::f64 &&
11945 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
11946 "Unexpected data type for VSHUFPD");
11948 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
11949 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
11951 bool ShufpdMask = true;
11952 bool CommutableMask = true;
11953 for (int i = 0; i < NumElts; ++i) {
11954 if (Mask[i] == SM_SentinelUndef)
11958 int Val = (i & 6) + NumElts * (i & 1);
11959 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
11960 if (Mask[i] < Val || Mask[i] > Val + 1)
11961 ShufpdMask = false;
11962 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
11963 CommutableMask = false;
11964 ShuffleImm |= (Mask[i] % 2) << i;
11969 if (CommutableMask) {
11977 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
11978 ArrayRef<int> Mask, SDValue V1,
11979 SDValue V2, SelectionDAG &DAG) {
11980 unsigned Immediate = 0;
11981 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
11984 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
11985 DAG.getConstant(Immediate, DL, MVT::i8));
11988 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
11989 ArrayRef<int> Mask, SDValue V1,
11990 SDValue V2, SelectionDAG &DAG) {
11991 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
11992 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
11994 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
11996 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
11998 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12001 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12003 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12004 /// isn't available.
12005 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12006 const SmallBitVector &Zeroable,
12007 SDValue V1, SDValue V2,
12008 const X86Subtarget &Subtarget,
12009 SelectionDAG &DAG) {
12010 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12011 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12012 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12014 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12015 Zeroable, Subtarget, DAG))
12018 if (V2.isUndef()) {
12019 // Check for being able to broadcast a single element.
12020 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12021 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12024 // Use low duplicate instructions for masks that match their pattern.
12025 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12026 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12028 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12029 // Non-half-crossing single input shuffles can be lowered with an
12030 // interleaved permutation.
12031 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12032 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
12033 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12034 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12037 // With AVX2 we have direct support for this permutation.
12038 if (Subtarget.hasAVX2())
12039 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12040 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12042 // Try to create an in-lane repeating shuffle mask and then shuffle the
12043 // results into the target lanes.
12044 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12045 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12048 // Otherwise, fall back.
12049 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12053 // Use dedicated unpack instructions for masks that match their pattern.
12055 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12058 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12059 Zeroable, Subtarget, DAG))
12062 // Check if the blend happens to exactly fit that of SHUFPD.
12064 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12067 // Try to create an in-lane repeating shuffle mask and then shuffle the
12068 // results into the target lanes.
12069 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12070 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12073 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12074 // shuffle. However, if we have AVX2 and either inputs are already in place,
12075 // we will be able to shuffle even across lanes the other input in a single
12076 // instruction so skip this pattern.
12077 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12078 isShuffleMaskInputInPlace(1, Mask))))
12079 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12080 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12083 // If we have AVX2 then we always want to lower with a blend because on v4 we
12084 // can fully permute the elements.
12085 if (Subtarget.hasAVX2())
12086 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12089 // Otherwise fall back on generic lowering.
12090 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12093 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12095 /// This routine is only called when we have AVX2 and thus a reasonable
12096 /// instruction set for v4i64 shuffling.
///
/// \p V1 and \p V2 must both be v4i64 and \p Mask must have four entries;
/// these preconditions, together with the AVX2 requirement, are asserted on
/// entry. \p Zeroable is forwarded to the 2 x 128-bit, blend and shift
/// lowerings. A sequence of specialised lowerings is attempted in order and
/// the routine finally falls back to a decomposed shuffle followed by a blend.
12097 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12098 const SmallBitVector &Zeroable,
12099 SDValue V1, SDValue V2,
12100 const X86Subtarget &Subtarget,
12101 SelectionDAG &DAG) {
12102 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12103 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12104 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12105 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
// First give the 2 x 128-bit lowering helper a chance to handle the mask.
12107 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12108 Zeroable, Subtarget, DAG))
12111 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12112 Zeroable, Subtarget, DAG))
12115 // Check for being able to broadcast a single element.
12116 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12117 Mask, Subtarget, DAG))
// Single-input case: V2 is undef, so only V1 is passed to the nodes below.
12120 if (V2.isUndef()) {
12121 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12122 // can use lower latency instructions that will operate on both lanes.
12123 SmallVector<int, 2> RepeatedMask;
12124 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
// PSHUFD is built on the v8i32 bitcast of V1, so the per-lane 64-bit mask is
// scaled by 2 (presumably yielding one entry per 32-bit element) before it
// is encoded as an immediate.
12125 SmallVector<int, 4> PSHUFDMask;
12126 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12127 return DAG.getBitcast(
12129 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12130 DAG.getBitcast(MVT::v8i32, V1),
12131 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12134 // AVX2 provides a direct instruction for permuting a single input across
12136 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12137 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12140 // Try to use shift instructions.
12141 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12142 Zeroable, Subtarget, DAG))
12145 // If we have VLX support, we can use VALIGN.
12146 if (Subtarget.hasVLX())
12147 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12148 Mask, Subtarget, DAG))
12151 // Try to use PALIGNR.
12152 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12153 Mask, Subtarget, DAG))
12156 // Use dedicated unpack instructions for masks that match their pattern.
12158 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12161 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12162 // shuffle. However, if we have AVX2 and either input is already in place,
12163 // we will be able to shuffle the other input, even across lanes, in a single
12164 // instruction, so skip this pattern.
12165 if (!isShuffleMaskInputInPlace(0, Mask) &&
12166 !isShuffleMaskInputInPlace(1, Mask))
12167 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12168 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12171 // Otherwise fall back on generic blend lowering.
12172 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12176 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12178 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12179 /// isn't available.
///
/// \p V1 and \p V2 must both be v8f32 and \p Mask must have eight entries
/// (asserted on entry). \p Zeroable is forwarded to the blend lowering only.
/// Blend and broadcast lowerings are tried first, then lowerings that rely on
/// the mask repeating in every 128-bit lane, then single-input permutes; the
/// final fallbacks depend on whether AVX2 is available.
12180 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12181 const SmallBitVector &Zeroable,
12182 SDValue V1, SDValue V2,
12183 const X86Subtarget &Subtarget,
12184 SelectionDAG &DAG) {
12185 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12186 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12187 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12189 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12190 Zeroable, Subtarget, DAG))
12193 // Check for being able to broadcast a single element.
12194 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12195 Mask, Subtarget, DAG))
12198 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12199 // options to efficiently lower the shuffle.
12200 SmallVector<int, 4> RepeatedMask;
12201 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12202 assert(RepeatedMask.size() == 4 &&
12203 "Repeated masks must be half the mask width!");
12205 // Use even/odd duplicate instructions for masks that match their pattern.
12206 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12207 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12208 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12209 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12212 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12213 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12215 // Use dedicated unpack instructions for masks that match their pattern.
12217 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12220 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12221 // have already handled any direct blends.
12222 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12225 // Try to create an in-lane repeating shuffle mask and then shuffle the
12226 // results into the target lanes.
12227 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12228 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12231 // If we have a single input shuffle with different shuffle patterns in the
12232 // two 128-bit lanes use the variable mask to VPERMILPS.
12233 if (V2.isUndef()) {
// Build the mask as a v8i32 constant vector; it is the variable-mask operand
// of both the VPERMILPV and the VPERMV nodes created below.
12234 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12235 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12236 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12238 if (Subtarget.hasAVX2())
12239 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12241 // Otherwise, fall back.
12242 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12246 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12248 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12249 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12252 // If we have AVX2 then we always want to lower with a blend because at v8 we
12253 // can fully permute the elements.
12254 if (Subtarget.hasAVX2())
12255 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12258 // Otherwise fall back on generic lowering.
12259 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12262 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12264 /// This routine is only called when we have AVX2 and thus a reasonable
12265 /// instruction set for v8i32 shuffling.
///
/// \p V1 and \p V2 must both be v8i32 and \p Mask must have eight entries;
/// these preconditions and the AVX2 requirement are asserted on entry.
/// \p Zeroable is forwarded to the zero/any-extend, blend and shift lowerings.
/// If no specialised lowering applies, the routine falls back to a decomposed
/// shuffle followed by a blend.
12266 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12267 const SmallBitVector &Zeroable,
12268 SDValue V1, SDValue V2,
12269 const X86Subtarget &Subtarget,
12270 SelectionDAG &DAG) {
12271 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12272 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12273 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12274 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12276 // Whenever we can lower this as a zext, that instruction is strictly faster
12277 // than any alternative. It also allows us to fold memory operands into the
12278 // shuffle in many cases.
12279 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12280 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12283 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12284 Zeroable, Subtarget, DAG))
12287 // Check for being able to broadcast a single element.
12288 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12289 Mask, Subtarget, DAG))
12292 // If the shuffle mask is repeated in each 128-bit lane we can use more
12293 // efficient instructions that mirror the shuffles across the two 128-bit
12295 SmallVector<int, 4> RepeatedMask;
// The result of the repeated-lane test is kept because it is consulted again
// by the SHUFPS check further down.
12296 bool Is128BitLaneRepeatedShuffle =
12297 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12298 if (Is128BitLaneRepeatedShuffle) {
12299 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12301 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12302 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12304 // Use dedicated unpack instructions for masks that match their pattern.
12306 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12310 // Try to use shift instructions.
12311 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12312 Zeroable, Subtarget, DAG))
12315 // If we have VLX support, we can use VALIGN.
12316 if (Subtarget.hasVLX())
12317 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12318 Mask, Subtarget, DAG))
12321 // Try to use byte rotation instructions.
12322 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12323 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12326 // Try to create an in-lane repeating shuffle mask and then shuffle the
12327 // results into the target lanes.
12328 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12329 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12332 // If the shuffle patterns aren't repeated but it is a single input, directly
12333 // generate a cross-lane VPERMD instruction.
12334 if (V2.isUndef()) {
// The mask itself becomes a v8i32 constant vector operand of the VPERMV node.
12335 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12336 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12339 // Assume that a single SHUFPS is faster than an alternative sequence of
12340 // multiple instructions (even if the CPU has a domain penalty).
12341 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12342 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Bitcast both inputs to v8f32, shuffle there, and bitcast the result back
// to v8i32.
12343 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12344 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12345 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12346 CastV1, CastV2, DAG);
12347 return DAG.getBitcast(MVT::v8i32, ShufPS);
12350 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12352 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12353 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12356 // Otherwise fall back on generic blend lowering.
12357 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12361 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12363 /// This routine is only called when we have AVX2 and thus a reasonable
12364 /// instruction set for v16i16 shuffling.
///
/// \p V1 and \p V2 must both be v16i16 and \p Mask must have sixteen entries;
/// these preconditions and the AVX2 requirement are asserted on entry.
/// \p Zeroable is forwarded to the zero/any-extend, blend, shift and PSHUFB
/// lowerings. If no specialised lowering applies, the routine falls back to
/// lowerVectorShuffleAsSplitOrBlend.
12365 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12366 const SmallBitVector &Zeroable,
12367 SDValue V1, SDValue V2,
12368 const X86Subtarget &Subtarget,
12369 SelectionDAG &DAG) {
12370 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12371 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12372 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12373 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12375 // Whenever we can lower this as a zext, that instruction is strictly faster
12376 // than any alternative. It also allows us to fold memory operands into the
12377 // shuffle in many cases.
12378 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12379 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12382 // Check for being able to broadcast a single element.
12383 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12384 Mask, Subtarget, DAG))
12387 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12388 Zeroable, Subtarget, DAG))
12391 // Use dedicated unpack instructions for masks that match their pattern.
12393 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12396 // Try to use shift instructions.
12397 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12398 Zeroable, Subtarget, DAG))
12401 // Try to use byte rotation instructions.
12402 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12403 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12406 // Try to create an in-lane repeating shuffle mask and then shuffle the
12407 // results into the target lanes.
12408 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12409 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
// Single-input case: V2 is undef.
12412 if (V2.isUndef()) {
12413 // There are no generalized cross-lane shuffle operations available on i16
12415 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12416 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12419 SmallVector<int, 8> RepeatedMask;
12420 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12421 // As this is a single-input shuffle, the repeated mask should be
12422 // a strictly valid v8i16 mask that we can pass through to the v8i16
12423 // lowering to handle even the v16 case.
12424 return lowerV8I16GeneralSingleInputVectorShuffle(
12425 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12429 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12430 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12433 // AVX512BWVL can lower to VPERMW.
12434 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12435 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12437 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12439 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12440 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12443 // Otherwise fall back on generic lowering.
12444 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12447 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12449 /// This routine is only called when we have AVX2 and thus a reasonable
12450 /// instruction set for v32i8 shuffling.
///
/// \p V1 and \p V2 must both be v32i8 and \p Mask must have 32 entries; these
/// preconditions and the AVX2 requirement are asserted on entry.
/// \p Zeroable is forwarded to the zero/any-extend, blend, shift and PSHUFB
/// lowerings. If no specialised lowering applies, the routine falls back to
/// lowerVectorShuffleAsSplitOrBlend.
12451 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12452 const SmallBitVector &Zeroable,
12453 SDValue V1, SDValue V2,
12454 const X86Subtarget &Subtarget,
12455 SelectionDAG &DAG) {
12456 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12457 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12458 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12459 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12461 // Whenever we can lower this as a zext, that instruction is strictly faster
12462 // than any alternative. It also allows us to fold memory operands into the
12463 // shuffle in many cases.
12464 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12465 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12468 // Check for being able to broadcast a single element.
12469 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12470 Mask, Subtarget, DAG))
12473 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12474 Zeroable, Subtarget, DAG))
12477 // Use dedicated unpack instructions for masks that match their pattern.
12479 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12482 // Try to use shift instructions.
12483 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12484 Zeroable, Subtarget, DAG))
12487 // Try to use byte rotation instructions.
12488 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12489 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12492 // Try to create an in-lane repeating shuffle mask and then shuffle the
12493 // results into the target lanes.
12494 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12495 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12498 // There are no generalized cross-lane shuffle operations available on i8
12500 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12501 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
12504 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12505 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12508 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12510 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12511 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12514 // Otherwise fall back on generic lowering.
12515 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
12518 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
12520 /// This routine either breaks down the specific type of a 256-bit x86 vector
12521 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
12522 /// together based on the available instructions.
///
/// \p VT selects the per-type routine in the final switch; reaching the
/// llvm_unreachable after it means \p VT was not a recognised 256-bit type.
/// Integer types without AVX2 never reach the switch: element widths below
/// 32 bits try the bit-mask and bit-blend lowerings and otherwise split the
/// shuffle, while wider elements are bitcast to the floating point vector type
/// with the same element width and count and shuffled there.
12523 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12524 MVT VT, SDValue V1, SDValue V2,
12525 const SmallBitVector &Zeroable,
12526 const X86Subtarget &Subtarget,
12527 SelectionDAG &DAG) {
12528 // If we have a single input to the zero element, insert that into V1 if we
12529 // can do so cheaply.
12530 int NumElts = VT.getVectorNumElements();
// Mask entries >= NumElts are the ones counted as coming from V2.
12531 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12533 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12534 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12535 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12538 // Handle special cases where the lower or upper half is UNDEF.
12540 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
12543 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
12544 // can check for those subtargets here and avoid much of the subtarget
12545 // querying in the per-vector-type lowering routines. With AVX1 we have
12546 // essentially *zero* ability to manipulate a 256-bit vector with integer
12547 // types. Since we'll use floating point types there eventually, just
12548 // immediately cast everything to a float and operate entirely in that domain.
12549 if (VT.isInteger() && !Subtarget.hasAVX2()) {
12550 int ElementBits = VT.getScalarSizeInBits();
12551 if (ElementBits < 32) {
12552 // No floating point type available, if we can't use the bit operations
12553 // for masking/blending then decompose into 128-bit vectors.
12555 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
12557 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12559 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
// Build the floating point vector type with the same element width and
// element count, shuffle the bitcast inputs in that type, and bitcast the
// result back to VT.
12562 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
12563 VT.getVectorNumElements());
12564 V1 = DAG.getBitcast(FpVT, V1);
12565 V2 = DAG.getBitcast(FpVT, V2);
12566 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
12569 switch (VT.SimpleTy) {
12571 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12573 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12575 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12577 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12579 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12581 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12584 llvm_unreachable("Not a valid 256-bit x86 vector type!");
12588 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
///
/// Requires a 512-bit \p VT with 64-bit scalar elements (both asserted).
/// The mask is widened with canWidenShuffleElements and the widened mask is
/// asserted to have four entries. Three forms are recognised: a concatenation
/// of the low 256 bits of \p V1 with the low 256 bits of \p V1 or \p V2, an
/// insertion of the low 128 bits of \p V2 into \p V1, and a general
/// X86ISD::SHUF128 node with an 8-bit immediate.
12589 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
12590 ArrayRef<int> Mask, SDValue V1,
12591 SDValue V2, SelectionDAG &DAG) {
12592 assert(VT.getScalarSizeInBits() == 64 &&
12593 "Unexpected element type size for 128bit shuffle.");
12595 // To handle 256 bit vector requires VLX and most probably
12596 // function lowerV2X128VectorShuffle() is better solution.
12597 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
12599 SmallVector<int, 4> WidenedMask;
12600 if (!canWidenShuffleElements(Mask, WidenedMask))
12603 // Check for patterns which can be matched with a single insert of a 256-bit
12605 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
12606 {0, 1, 2, 3, 0, 1, 2, 3});
12607 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
12608 {0, 1, 2, 3, 8, 9, 10, 11})) {
// Both halves are 4-element subvectors extracted at index 0; the high half
// comes from V1 again or from V2 depending on which pattern matched.
12609 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
12610 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12611 DAG.getIntPtrConstant(0, DL));
12612 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12613 OnlyUsesV1 ? V1 : V2,
12614 DAG.getIntPtrConstant(0, DL));
12615 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12618 assert(WidenedMask.size() == 4);
12620 // See if this is an insertion of the lower 128-bits of V2 into V1.
12621 bool IsInsert = true;
12623 for (int i = 0; i < 4; ++i) {
12624 assert(WidenedMask[i] >= -1);
12625 if (WidenedMask[i] < 0)
12628 // Make sure all V1 subvectors are in place.
12629 if (WidenedMask[i] < 4) {
12630 if (WidenedMask[i] != i) {
12635 // Make sure we only have a single V2 index and it's the lowest 128-bits.
12636 if (V2Index >= 0 || WidenedMask[i] != 4) {
12643 if (IsInsert && V2Index >= 0) {
12644 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12645 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
12646 DAG.getIntPtrConstant(0, DL));
// Elements are 64 bits wide (asserted above), so a 128-bit lane spans two
// elements; presumably V2Index is a widened-lane index, hence the * 2.
12647 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
12650 // Try to lower to vshuf64x2/vshuf32x4.
12651 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12652 unsigned PermMask = 0;
12653 // Ensure elements came from the same Op.
12654 for (int i = 0; i < 4; ++i) {
12655 assert(WidenedMask[i] >= -1);
12656 if (WidenedMask[i] < 0)
12659 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
// Destination lanes 0-1 are tracked in Ops[0] and lanes 2-3 in Ops[1].
12660 unsigned OpIndex = i / 2;
12661 if (Ops[OpIndex].isUndef())
12663 else if (Ops[OpIndex] != Op)
12666 // Convert the 128-bit shuffle mask selection values into 128-bit selection
12667 // bits defined by a vshuf64x2 instruction's immediate control byte.
// Each destination lane contributes two bits at position i * 2.
12668 PermMask |= (WidenedMask[i] % 4) << (i * 2);
12671 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
12672 DAG.getConstant(PermMask, DL, MVT::i8));
12675 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
///
/// \p V1 and \p V2 must both be v8f64 and \p Mask must have eight entries
/// (asserted on entry). MOVDDUP, VPERMILPI and VPERMI nodes that read only
/// \p V1 are considered first, followed by the 128-bit lane shuffle, UNPCK and
/// SHUFPD lowerings; the final fallback is lowerVectorShuffleWithPERMV.
12676 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12677 SDValue V1, SDValue V2,
12678 const X86Subtarget &Subtarget,
12679 SelectionDAG &DAG) {
12680 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12681 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12682 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12684 if (V2.isUndef()) {
12685 // Use low duplicate instructions for masks that match their pattern.
12686 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
12687 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
12689 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
12690 // Non-half-crossing single input shuffles can be lowered with an
12691 // interleaved permutation.
// One immediate bit per result element: bit i is set when Mask[i] equals the
// odd element index of the pair containing element i (1, 3, 5 or 7).
12692 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12693 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
12694 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
12695 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
12696 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
12697 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12700 SmallVector<int, 4> RepeatedMask;
12701 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
12702 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
12703 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12706 if (SDValue Shuf128 =
12707 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
12710 if (SDValue Unpck =
12711 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
12714 // Check if the blend happens to exactly fit that of SHUFPD.
12716 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
12719 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
12722 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
///
/// \p V1 and \p V2 must both be v16f32 and \p Mask must have sixteen entries
/// (asserted on entry). Masks that repeat in every 128-bit lane are lowered
/// with MOVSLDUP / MOVSHDUP / VPERMILPI, UNPCK or a SHUFPS sequence; anything
/// else falls back to lowerVectorShuffleWithPERMV.
///
/// \p DL is taken by const reference, matching the other shuffle lowering
/// routines in this file and avoiding a copy of the SDLoc on every call.
12723 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12724 SDValue V1, SDValue V2,
12725 const X86Subtarget &Subtarget,
12726 SelectionDAG &DAG) {
12727 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12728 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12729 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12731 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12732 // options to efficiently lower the shuffle.
12733 SmallVector<int, 4> RepeatedMask;
12734 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
12735 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12737 // Use even/odd duplicate instructions for masks that match their pattern.
12738 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12739 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
12740 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12741 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
12744 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
12745 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12747 // Use dedicated unpack instructions for masks that match their pattern.
12748 if (SDValue Unpck =
12749 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
12752 // Otherwise, fall back to a SHUFPS sequence.
12753 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
12756 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
12759 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
///
/// \p V1 and \p V2 must both be v8i64 and \p Mask must have eight entries
/// (asserted on entry). \p Zeroable is forwarded to the shift lowering only.
/// The 128-bit lane shuffle lowering is tried first, then single-input
/// PSHUFD / VPERMI forms, shifts, VALIGN, PALIGNR and UNPCK; the final
/// fallback is lowerVectorShuffleWithPERMV.
12760 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12761 const SmallBitVector &Zeroable,
12762 SDValue V1, SDValue V2,
12763 const X86Subtarget &Subtarget,
12764 SelectionDAG &DAG) {
12765 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12766 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12767 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12769 if (SDValue Shuf128 =
12770 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
12773 if (V2.isUndef()) {
12774 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12775 // can use lower latency instructions that will operate on all four
12777 SmallVector<int, 2> Repeated128Mask;
12778 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
// PSHUFD is built on the v16i32 bitcast of V1, so the per-lane 64-bit mask is
// scaled by 2 (presumably yielding one entry per 32-bit element) before it
// is encoded as an immediate.
12779 SmallVector<int, 4> PSHUFDMask;
12780 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
12781 return DAG.getBitcast(
12783 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
12784 DAG.getBitcast(MVT::v16i32, V1),
12785 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12788 SmallVector<int, 4> Repeated256Mask;
12789 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
12790 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
12791 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
12794 // Try to use shift instructions.
12795 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
12796 Zeroable, Subtarget, DAG))
12799 // Try to use VALIGN.
12800 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
12801 Mask, Subtarget, DAG))
12804 // Try to use PALIGNR.
12805 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
12806 Mask, Subtarget, DAG))
// Use dedicated unpack instructions for masks that match their pattern.
12809 if (SDValue Unpck =
12810 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
12813 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
12816 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
///
/// \p V1 and \p V2 must both be v16i32 and \p Mask must have sixteen entries
/// (asserted on entry). \p Zeroable is forwarded to the zero/any-extend and
/// shift lowerings. Byte rotation is only attempted when the subtarget has
/// BWI. The final fallback is lowerVectorShuffleWithPERMV.
12817 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12818 const SmallBitVector &Zeroable,
12819 SDValue V1, SDValue V2,
12820 const X86Subtarget &Subtarget,
12821 SelectionDAG &DAG) {
12822 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12823 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
12824 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12826 // Whenever we can lower this as a zext, that instruction is strictly faster
12827 // than any alternative. It also allows us to fold memory operands into the
12828 // shuffle in many cases.
12829 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12830 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12833 // If the shuffle mask is repeated in each 128-bit lane we can use more
12834 // efficient instructions that mirror the shuffles across the four 128-bit
12836 SmallVector<int, 4> RepeatedMask;
// The result of the repeated-lane test is kept because it is consulted again
// by the SHUFPS check further down.
12837 bool Is128BitLaneRepeatedShuffle =
12838 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
12839 if (Is128BitLaneRepeatedShuffle) {
12840 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12842 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
12843 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12845 // Use dedicated unpack instructions for masks that match their pattern.
12847 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
12851 // Try to use shift instructions.
12852 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
12853 Zeroable, Subtarget, DAG))
12856 // Try to use VALIGN.
12857 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
12858 Mask, Subtarget, DAG))
12861 // Try to use byte rotation instructions.
12862 if (Subtarget.hasBWI())
12863 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12864 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
12867 // Assume that a single SHUFPS is faster than using a permv shuffle.
12868 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12869 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
// Bitcast both inputs to v16f32, shuffle there, and bitcast the result back
// to v16i32.
12870 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
12871 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
12872 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
12873 CastV1, CastV2, DAG);
12874 return DAG.getBitcast(MVT::v16i32, ShufPS);
12877 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
12880 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
///
/// \p V1 and \p V2 must both be v32i16 and \p Mask must have 32 entries; these
/// preconditions and the BWI requirement are asserted on entry. \p Zeroable is
/// forwarded to the zero/any-extend and shift lowerings. The final fallback is
/// lowerVectorShuffleWithPERMV.
12881 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12882 const SmallBitVector &Zeroable,
12883 SDValue V1, SDValue V2,
12884 const X86Subtarget &Subtarget,
12885 SelectionDAG &DAG) {
12886 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12887 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
12888 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12889 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
12891 // Whenever we can lower this as a zext, that instruction is strictly faster
12892 // than any alternative. It also allows us to fold memory operands into the
12893 // shuffle in many cases.
12894 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12895 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12898 // Use dedicated unpack instructions for masks that match their pattern.
12900 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
12903 // Try to use shift instructions.
12904 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
12905 Zeroable, Subtarget, DAG))
12908 // Try to use byte rotation instructions.
12909 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12910 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
// Single-input case: V2 is undef.
12913 if (V2.isUndef()) {
12914 SmallVector<int, 8> RepeatedMask;
12915 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
12916 // As this is a single-input shuffle, the repeated mask should be
12917 // a strictly valid v8i16 mask that we can pass through to the v8i16
12918 // lowering to handle even the v32 case.
12919 return lowerV8I16GeneralSingleInputVectorShuffle(
12920 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
12924 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
12927 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
///
/// Both inputs must be v64i8, \p Mask must have 64 entries and the subtarget
/// must have BWI; all of these are asserted. The strategies are attempted in
/// the order they appear in the body: zero/any-extend, UNPCK, shift, byte
/// rotate and PSHUFB. With VBMI the shuffle is then lowered with PERMV;
/// otherwise it is handed to splitAndLowerVectorShuffle.
12928 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12929 const SmallBitVector &Zeroable,
12930 SDValue V1, SDValue V2,
12931 const X86Subtarget &Subtarget,
12932 SelectionDAG &DAG) {
12933 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12934 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
12935 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
12936 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
12938 // Whenever we can lower this as a zext, that instruction is strictly faster
12939 // than any alternative. It also allows us to fold memory operands into the
12940 // shuffle in many cases.
12941 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12942 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12945 // Use dedicated unpack instructions for masks that match their pattern.
12947 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
12950 // Try to use shift instructions.
12951 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
12952 Zeroable, Subtarget, DAG))
12955 // Try to use byte rotation instructions.
12956 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12957 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
// Try to lower with PSHUFB.
12960 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12961 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12964 // VBMI can use VPERMV/VPERMV3 byte shuffles.
12965 if (Subtarget.hasVBMI())
12966 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
12968 // FIXME: Implement direct support for this type!
12969 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
12972 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
12974 /// This routine either breaks down the specific type of a 512-bit x86 vector
12975 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
12976 /// together based on the available instructions.
///
/// AVX-512 support is asserted. Before dispatching on \p VT the routine tries
/// an element insertion (only when exactly one mask entry refers to \p V2 and
/// that entry is element 0) and then a broadcast. The floating-point routines
/// are called without \p Zeroable; the integer routines receive it.
12977 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12978 MVT VT, SDValue V1, SDValue V2,
12979 const SmallBitVector &Zeroable,
12980 const X86Subtarget &Subtarget,
12981 SelectionDAG &DAG) {
12982 assert(Subtarget.hasAVX512() &&
12983 "Cannot lower 512-bit vectors w/ basic ISA!");
12985 // If we have a single input to the zero element, insert that into V1 if we
12986 // can do so cheaply.
12987 int NumElts = Mask.size();
// Mask entries >= NumElts select from V2.
12988 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12990 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12991 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12992 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12995 // Check for being able to broadcast a single element.
12996 if (SDValue Broadcast =
12997 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13000 // Dispatch to each element type for lowering. If we don't have support for
13001 // specific element type shuffles at 512 bits, immediately split them and
13002 // lower them. Each lowering routine of a given type is allowed to assume that
13003 // the requisite ISA extensions for that element type are available.
13004 switch (VT.SimpleTy) {
13006 return lowerV8F64VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13008 return lowerV16F32VectorShuffle(DL, Mask, V1, V2, Subtarget, DAG);
13010 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13012 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13014 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13016 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13019 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13023 // Lower vXi1 vector shuffles.
13024 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13025 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13026 // vector, shuffle and then truncate it back.
//
// Each input is widened to ExtVT: all-zeros and all-ones build vectors are
// replaced by getZeroVector/getOnesVector, V2 may become UNDEF, and any other
// value is SIGN_EXTENDed. The shuffle is then performed in ExtVT with the
// unchanged Mask and converted back to VT.
13027 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13028 MVT VT, SDValue V1, SDValue V2,
13029 const X86Subtarget &Subtarget,
13030 SelectionDAG &DAG) {
// NOTE(review): the message mentions 512-bit vectors although this routine
// lowers vXi1 shuffles; it looks copied from the 512-bit routine - confirm.
13031 assert(Subtarget.hasAVX512() &&
13032 "Cannot lower 512-bit vectors w/o basic ISA!");
// Choose the wide vector type used to carry out the shuffle.
13034 switch (VT.SimpleTy) {
13036 llvm_unreachable("Expected a vector of i1 elements");
13038 ExtVT = MVT::v2i64;
13041 ExtVT = MVT::v4i32;
13044 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13047 ExtVT = MVT::v16i32;
13050 ExtVT = MVT::v32i16;
13053 ExtVT = MVT::v64i8;
// Widen V1.
13057 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13058 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13059 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13060 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13062 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
// Widen V2 the same way.
13065 V2 = DAG.getUNDEF(ExtVT);
13066 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13067 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13068 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13069 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13071 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13073 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13074 // i1 was sign extended, so we can use X86ISD::CVT2MASK.
// CVT2MASK is used with BWI for 32 or more elements and with DQI for fewer
// than 32 elements; every other combination uses TRUNCATE.
13075 int NumElems = VT.getVectorNumElements();
13076 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13077 (Subtarget.hasDQI() && (NumElems < 32)))
13078 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13080 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13083 /// Helper function that returns true if the shuffle mask should be
13084 /// commuted to improve canonicalization.
///
/// Mask entries below Mask.size() refer to V1 and entries at or above it refer
/// to V2; the tie-breaking loops below skip negative entries.
13085 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13086 int NumElements = Mask.size();
// Count how many mask entries come from each input.
13088 int NumV1Elements = 0, NumV2Elements = 0;
13092 else if (M < NumElements)
13097 // Commute the shuffle as needed such that more elements come from V1 than
13098 // V2. This allows us to match the shuffle pattern strictly on how many
13099 // elements come from V1 without handling the symmetric cases.
13100 if (NumV2Elements > NumV1Elements)
13103 assert(NumV1Elements > 0 && "No V1 indices");
13105 if (NumV2Elements == 0)
13108 // When the number of V1 and V2 elements are the same, try to minimize the
13109 // number of uses of V2 in the low half of the vector. When that is tied,
13110 // ensure that the sum of indices for V1 is equal to or lower than the sum
13111 // of indices for V2. When those are equal, try to ensure that the number of
13112 // odd indices for V1 is lower than the number of odd indices for V2.
13113 if (NumV1Elements == NumV2Elements) {
// First tie-break: uses of each input within the low half of the mask.
13114 int LowV1Elements = 0, LowV2Elements = 0;
13115 for (int M : Mask.slice(0, NumElements / 2))
13116 if (M >= NumElements)
13120 if (LowV2Elements > LowV1Elements)
13122 if (LowV2Elements == LowV1Elements) {
// Second tie-break: compare per-input sums over the mask positions.
13123 int SumV1Indices = 0, SumV2Indices = 0;
13124 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13125 if (Mask[i] >= NumElements)
13127 else if (Mask[i] >= 0)
13129 if (SumV2Indices < SumV1Indices)
13131 if (SumV2Indices == SumV1Indices) {
// Third tie-break: count the odd result positions fed by each input.
13132 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13133 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13134 if (Mask[i] >= NumElements)
13135 NumV2OddIndices += i % 2;
13136 else if (Mask[i] >= 0)
13137 NumV1OddIndices += i % 2;
13138 if (NumV2OddIndices < NumV1OddIndices)
13147 /// \brief Top-level lowering for x86 vector shuffles.
13149 /// This handles decomposition, canonicalization, and lowering of all x86
13150 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13151 /// above in helper routines. The canonicalization attempts to widen shuffles
13152 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13153 /// s.t. only one of the two inputs needs to be tested, etc.
///
/// \p Op is cast to ShuffleVectorSDNode. Several steps return a freshly built
/// or commuted shuffle node instead of a lowered result. After those steps
/// the shuffle is dispatched by vector width (128/256/512 bits) or to the
/// vXi1 routine.
13154 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13155 SelectionDAG &DAG) {
13156 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13157 ArrayRef<int> Mask = SVOp->getMask();
13158 SDValue V1 = Op.getOperand(0);
13159 SDValue V2 = Op.getOperand(1);
13160 MVT VT = Op.getSimpleValueType();
13161 int NumElements = VT.getVectorNumElements();
13163 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
// 64-bit vectors are only accepted here when they are vXi1 mask vectors.
13165 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13166 "Can't lower MMX shuffles");
13168 bool V1IsUndef = V1.isUndef();
13169 bool V2IsUndef = V2.isUndef();
// A shuffle of two undef inputs is itself undef.
13170 if (V1IsUndef && V2IsUndef)
13171 return DAG.getUNDEF(VT);
13173 // When we create a shuffle node we put the UNDEF node to second operand,
13174 // but in some cases the first operand may be transformed to UNDEF.
13175 // In this case we should just commute the node.
13177 return DAG.getCommutedVectorShuffle(*SVOp);
13179 // Check for non-undef masks pointing at an undef vector and make the masks
13180 // undef as well. This makes it easier to match the shuffle based solely on
13184 if (M >= NumElements) {
13185 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13186 for (int &M : NewMask)
13187 if (M >= NumElements)
13189 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13192 // Check for illegal shuffle mask element index values.
// With an undef V2 only indices into V1 are legal; otherwise both inputs are.
13193 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13194 assert(llvm::all_of(Mask,
13195 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13196 "Out of bounds shuffle index");
13198 // We actually see shuffles that are entirely re-arrangements of a set of
13199 // zero inputs. This mostly happens while decomposing complex shuffles into
13200 // simple ones. Directly lower these as a buildvector of zeros.
13201 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13202 if (Zeroable.all())
13203 return getZeroVector(VT, Subtarget, DAG, DL);
13205 // Try to collapse shuffles into using a vector type with fewer elements but
13206 // wider element types. We cap this to not form integers or floating point
13207 // elements wider than 64 bits, but it might be interesting to form i128
13208 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13209 SmallVector<int, 16> WidenedMask;
13210 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13211 canWidenShuffleElements(Mask, WidenedMask)) {
// The widened type has elements twice as wide and half as many of them.
13212 MVT NewEltVT = VT.isFloatingPoint()
13213 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13214 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13215 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13216 // Make sure that the new vector type is legal. For example, v2f64 isn't
13218 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13219 V1 = DAG.getBitcast(NewVT, V1);
13220 V2 = DAG.getBitcast(NewVT, V2);
13221 return DAG.getBitcast(
13222 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13226 // Commute the shuffle if it will improve canonicalization.
13227 if (canonicalizeShuffleMaskWithCommute(Mask))
13228 return DAG.getCommutedVectorShuffle(*SVOp);
13230 // For each vector width, delegate to a specialized lowering routine.
13231 if (VT.is128BitVector())
13232 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13235 if (VT.is256BitVector())
13236 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13239 if (VT.is512BitVector())
13240 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13244 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13246 llvm_unreachable("Unimplemented!");
13249 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
///
/// Only applies when the condition (operand 0) is a build vector of constant
/// nodes. A shuffle mask is built with one entry per result element and the
/// VSELECT is re-emitted as a vector shuffle of its two value operands.
13250 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13251 const X86Subtarget &Subtarget,
13252 SelectionDAG &DAG) {
13253 SDValue Cond = Op.getOperand(0);
13254 SDValue LHS = Op.getOperand(1);
13255 SDValue RHS = Op.getOperand(2);
13257 MVT VT = Op.getSimpleValueType();
13259 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13261 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13263 // Only non-legal VSELECTs reach this lowering, convert those into generic
13264 // shuffles and re-use the shuffle lowering path for blends.
13265 SmallVector<int, 32> Mask;
13266 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13267 SDValue CondElt = CondBV->getOperand(i);
// For a constant condition element: index i (taken from LHS) when it is
// non-zero, index i + Size (taken from RHS) when it is zero.
13269 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13272 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
/// Custom lowering for VSELECT. Checks, in order: the all-constant case, the
/// constant-condition case (via lowerVSELECTtoVectorShuffle), the SSE4.1
/// requirement for variable blends, and finally a per-type decision.
13275 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13276 // A vselect where all conditions and data are constants can be optimized into
13277 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13278 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13279 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13280 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13283 // Try to lower this to a blend-style vector shuffle. This can handle all
13284 // constant condition cases.
13285 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13288 // Variable blends are only legal from SSE4.1 onward.
13289 if (!Subtarget.hasSSE41())
13292 // Only some types will be legal on some subtargets. If we can emit a legal
13293 // VSELECT-matching blend, return Op, but if we need to expand, return
13295 switch (Op.getSimpleValueType().SimpleTy) {
13297 // Most of the vector types have blends past SSE4.1.
13301 // The byte blends for AVX vectors were introduced only in AVX2.
13302 if (Subtarget.hasAVX2())
13309 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13310 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13313 // FIXME: We should custom lower this by fixing the condition and using i8
/// Lowering of EXTRACT_VECTOR_ELT used on SSE4.1 subtargets; only 128-bit
/// source vectors are considered. 8-bit results are produced with
/// X86ISD::PEXTRB (an i32 result wrapped in AssertZext and truncated). f32
/// results are extracted as i32 from a v4i32 bitcast and bitcast back.
/// i32/i64 results are checked for a constant index.
13319 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13320 MVT VT = Op.getSimpleValueType();
13323 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13326 if (VT.getSizeInBits() == 8) {
// PEXTRB yields an i32; AssertZext records that only VT's bits may be set.
13327 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13328 Op.getOperand(0), Op.getOperand(1));
13329 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13330 DAG.getValueType(VT));
13331 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13334 if (VT == MVT::f32) {
13335 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13336 // the result back to FR32 register. It's only worth matching if the
13337 // result has a single use which is a store or a bitcast to i32. And in
13338 // the case of a store, it's not worth it if the index is a constant 0,
13339 // because a MOVSSmr can be used instead, which is smaller and faster.
13340 if (!Op.hasOneUse())
13342 SDNode *User = *Op.getNode()->use_begin();
// True when the user is neither a store of a non-zero-index extract nor a
// bitcast to i32.
13343 if ((User->getOpcode() != ISD::STORE ||
13344 isNullConstant(Op.getOperand(1))) &&
13345 (User->getOpcode() != ISD::BITCAST ||
13346 User->getValueType(0) != MVT::i32))
13348 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13349 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13351 return DAG.getBitcast(MVT::f32, Extract);
13354 if (VT == MVT::i32 || VT == MVT::i64) {
13355 // ExtractPS/pextrq works with constant index.
13356 if (isa<ConstantSDNode>(Op.getOperand(1)))
13363 /// Extract one bit from mask vector, like v16i1 or v8i1.
13364 /// AVX-512 feature.
///
/// Operand 0 is the mask vector and operand 1 the index; the result type must
/// be i1 (asserted). Vectors with more than 16 elements require BWI
/// (asserted). A variable index is handled by zero-extending the mask to a
/// wide vector, extracting there and truncating. A constant index is handled
/// by shifting the requested bit to position 0 and extracting element 0.
13366 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13367 SDValue Vec = Op.getOperand(0);
13369 MVT VecVT = Vec.getSimpleValueType();
13370 SDValue Idx = Op.getOperand(1);
13371 MVT EltVT = Op.getSimpleValueType();
13373 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13374 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13375 "Unexpected vector type in ExtractBitFromMaskVector");
13377 // variable index can't be handled in mask registers,
13378 // extend vector to VR512
13379 if (!isa<ConstantSDNode>(Idx)) {
// NOTE(review): every type other than v8i1 is widened to v16i32 here; confirm
// this is intended for mask types that do not have 16 elements.
13380 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13381 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13382 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13383 ExtVT.getVectorElementType(), Ext, Idx);
13384 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13387 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
// Vectors with fewer than 8 elements, and 8-element vectors without DQI, are
// first placed into a v16i1.
13388 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13389 (VecVT.getVectorNumElements() < 8)) {
13390 // Use kshiftlw/rw instruction.
13391 VecVT = MVT::v16i1;
13392 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
13393 DAG.getUNDEF(VecVT),
13395 DAG.getIntPtrConstant(0, dl));
// Shift left so the requested bit becomes the top bit (skipped when it already
// is), then shift right by MaxSift so it lands in position 0.
13397 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
13398 if (MaxSift - IdxVal)
13399 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13400 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
13401 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13402 DAG.getConstant(MaxSift, dl, MVT::i8));
13403 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13404 DAG.getIntPtrConstant(0, dl));
/// Custom lowering for EXTRACT_VECTOR_ELT. Operand 0 is the source vector and
/// operand 1 the index. i1 results are delegated to ExtractBitFromMaskVector.
/// Variable indices are handled with a VPERMV for 512-bit vectors and for
/// 256-bit vectors with 32-bit elements when the subtarget has Int256.
/// Constant indices into 256/512-bit vectors are narrowed to the containing
/// 128-bit chunk, and 128-bit vectors are then handled by element size.
13408 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13409 SelectionDAG &DAG) const {
13411 SDValue Vec = Op.getOperand(0);
13412 MVT VecVT = Vec.getSimpleValueType();
13413 SDValue Idx = Op.getOperand(1);
13415 if (Op.getSimpleValueType() == MVT::i1)
13416 return ExtractBitFromMaskVector(Op, DAG);
13418 if (!isa<ConstantSDNode>(Idx)) {
13419 if (VecVT.is512BitVector() ||
13420 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
13421 VecVT.getScalarSizeInBits() == 32)) {
// The permute mask uses integer elements of the same width as the source
// elements.
13424 MVT::getIntegerVT(VecVT.getScalarSizeInBits());
13425 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13426 MaskEltVT.getSizeInBits());
// Place the index in element 0 of an otherwise zero mask, permute, then
// extract element 0 of the permuted vector.
13428 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13429 auto PtrVT = getPointerTy(DAG.getDataLayout());
13430 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13431 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
13432 DAG.getConstant(0, dl, PtrVT));
13433 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13434 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
13435 DAG.getConstant(0, dl, PtrVT));
13440 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13442 // If this is a 256-bit or 512-bit vector, first extract the 128-bit vector
13443 // and then extract the element from the 128-bit vector.
13444 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13445 // Get the 128-bit vector.
13446 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
13447 MVT EltVT = VecVT.getVectorElementType();
13449 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13450 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
13452 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
13453 // this can be done with a mask.
13454 IdxVal &= ElemsPerChunk - 1;
13455 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13456 DAG.getConstant(IdxVal, dl, MVT::i32));
13459 assert(VecVT.is128BitVector() && "Unexpected vector length");
13461 MVT VT = Op.getSimpleValueType();
13463 if (VT.getSizeInBits() == 16) {
13464 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
13465 // we're going to zero extend the register or fold the store (SSE41 only).
13466 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
13467 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
13468 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13469 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13470 DAG.getBitcast(MVT::v4i32, Vec), Idx));
13472 // Transform it so it matches pextrw which produces a 32-bit result.
13473 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13474 Op.getOperand(0), Op.getOperand(1));
13475 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13476 DAG.getValueType(VT));
13477 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
// Give the SSE4.1-specific lowering a chance first.
13480 if (Subtarget.hasSSE41())
13481 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
13484 // TODO: handle v16i8.
13486 if (VT.getSizeInBits() == 32) {
13490 // SHUFPS the element to the lowest double word, then movss.
13491 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
13492 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13493 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13494 DAG.getIntPtrConstant(0, dl));
13497 if (VT.getSizeInBits() == 64) {
13498 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13499 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13500 // to match extract_elt for f64.
13504 // UNPCKHPD the element to the lowest double word, then movsd.
13505 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
13506 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
13507 int Mask[2] = { 1, -1 };
13508 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13509 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13510 DAG.getIntPtrConstant(0, dl));
13516 /// Insert one bit to mask vector, like v16i1 or v8i1.
13517 /// AVX-512 feature.
///
/// Operand 0 is the mask vector, operand 1 the element and operand 2 the
/// index. A variable index is handled by zero-extending the vector and the
/// element, inserting in the wide type and truncating. For a constant index
/// the element is turned into a vector with SCALAR_TO_VECTOR and combined
/// with the source using shifts and OR (first or last position) or a shuffle
/// (any other position).
13519 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13521 SDValue Vec = Op.getOperand(0);
13522 SDValue Elt = Op.getOperand(1);
13523 SDValue Idx = Op.getOperand(2);
13524 MVT VecVT = Vec.getSimpleValueType();
13526 if (!isa<ConstantSDNode>(Idx)) {
13527 // Non constant index. Extend source and destination,
13528 // insert element and then truncate the result.
// NOTE(review): every type other than v8i1 is widened to v16i32/i32 here;
// confirm this is intended for mask types that do not have 16 elements.
13529 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13530 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13531 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13532 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13533 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13534 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13537 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13538 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13539 unsigned NumElems = VecVT.getVectorNumElements();
// With an undef source vector only the new element has to be positioned.
13541 if(Vec.isUndef()) {
13543 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13544 DAG.getConstant(IdxVal, dl, MVT::i8));
13548 // Insertion of one bit into first or last position
13549 // can be done with two SHIFTs + OR.
13550 if (IdxVal == 0 ) {
13551 // EltInVec already at correct index and other bits are 0.
13552 // Clean the first bit in source vector.
13553 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13554 DAG.getConstant(1 , dl, MVT::i8));
13555 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13556 DAG.getConstant(1, dl, MVT::i8));
13558 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13560 if (IdxVal == NumElems -1) {
13561 // Move the bit to the last position inside the vector.
13562 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13563 DAG.getConstant(IdxVal, dl, MVT::i8));
13564 // Clean the last bit in the source vector.
13565 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13566 DAG.getConstant(1, dl, MVT::i8));
13567 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13568 DAG.getConstant(1 , dl, MVT::i8));
13570 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13573 // Use shuffle to insert element.
// Identity mask, except that position IdxVal takes element 0 of EltInVec
// (mask index NumElems).
13574 SmallVector<int, 64> MaskVec(NumElems);
13575 for (unsigned i = 0; i != NumElems; ++i)
13576 MaskVec[i] = (i == IdxVal) ? NumElems : i;
13578 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
/// Custom lowering for INSERT_VECTOR_ELT. Operand 0 is the vector, operand 1
/// the element and operand 2 the index. i1 element types are delegated to
/// InsertBitToMaskVector. With a constant index the cases are, in order:
/// insertion of zero as a blend shuffle, 256/512-bit vectors via their
/// 128-bit chunk, and 128-bit vectors by element type.
13581 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13582 SelectionDAG &DAG) const {
13583 MVT VT = Op.getSimpleValueType();
13584 MVT EltVT = VT.getVectorElementType();
13585 unsigned NumElts = VT.getVectorNumElements();
13587 if (EltVT == MVT::i1)
13588 return InsertBitToMaskVector(Op, DAG);
13591 SDValue N0 = Op.getOperand(0);
13592 SDValue N1 = Op.getOperand(1);
13593 SDValue N2 = Op.getOperand(2);
13594 if (!isa<ConstantSDNode>(N2))
13596 auto *N2C = cast<ConstantSDNode>(N2);
13597 unsigned IdxVal = N2C->getZExtValue();
13599 // If we are clearing out an element, we do this more efficiently with a
13600 // blend shuffle than a costly integer insertion.
13601 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
13602 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
13603 // be beneficial if we are inserting several zeros and can combine the masks.
13604 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
// Identity mask, except that position IdxVal selects the same lane of the
// zero vector (the second shuffle input).
13605 SmallVector<int, 8> ClearMask;
13606 for (unsigned i = 0; i != NumElts; ++i)
13607 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
13608 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
13609 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
13612 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13613 // into that, and then insert the subvector back into the result.
13614 if (VT.is256BitVector() || VT.is512BitVector()) {
13615 // With a 256-bit vector, we can insert into the zero element efficiently
13616 // using a blend if we have AVX or AVX2 and the right data type.
13617 if (VT.is256BitVector() && IdxVal == 0) {
13618 // TODO: It is worthwhile to cast integer to floating point and back
13619 // and incur a domain crossing penalty if that's what we'll end up
13620 // doing anyway after extracting to a 128-bit vector.
13621 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13622 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
13623 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
13624 N2 = DAG.getIntPtrConstant(1, dl);
13625 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
13629 // Get the desired 128-bit vector chunk.
13630 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
13632 // Insert the element into the desired chunk.
13633 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13634 assert(isPowerOf2_32(NumEltsIn128));
13635 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
13636 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
13638 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13639 DAG.getConstant(IdxIn128, dl, MVT::i32));
13641 // Insert the changed part back into the bigger vector
13642 return insert128BitVector(N0, V, IdxVal, DAG, dl);
13644 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13646 if (Subtarget.hasSSE41()) {
13647 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13649 if (VT == MVT::v8i16) {
13650 Opc = X86ISD::PINSRW;
13652 assert(VT == MVT::v16i8);
13653 Opc = X86ISD::PINSRB;
13656 // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
13658 if (N1.getValueType() != MVT::i32)
13659 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13660 if (N2.getValueType() != MVT::i32)
13661 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13662 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13665 if (EltVT == MVT::f32) {
13666 // Bits [7:6] of the constant are the source select. This will always be
13667 // zero here. The DAG Combiner may combine an extract_elt index into
13668 // these bits. For example (insert (extract, 3), 2) could be matched by
13669 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
13670 // Bits [5:4] of the constant are the destination select. This is the
13671 // value of the incoming immediate.
13672 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13673 // combine either bitwise AND or insert of float 0.0 to set these bits.
13675 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
13676 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
13677 // If this is an insertion of 32-bits into the low 32-bits of
13678 // a vector, we prefer to generate a blend with immediate rather
13679 // than an insertps. Blends are simpler operations in hardware and so
13680 // will always have equal or better performance than insertps.
13681 // But if optimizing for size and there's a load folding opportunity,
13682 // generate insertps because blendps does not have a 32-bit memory
13684 N2 = DAG.getIntPtrConstant(1, dl);
13685 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13686 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
// The destination select lives in bits [5:4], hence the shift by 4.
13688 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
13689 // Create this as a scalar to vector.
13690 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13691 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13694 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13695 // PINSR* works with constant index.
13700 if (EltVT == MVT::i8)
13703 if (EltVT.getSizeInBits() == 16) {
13704 // Transform it so it matches pinsrw which expects a 16-bit value in a GR32
13705 // as its second argument.
13706 if (N1.getValueType() != MVT::i32)
13707 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13708 if (N2.getValueType() != MVT::i32)
13709 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13710 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
/// Custom lowering for SCALAR_TO_VECTOR. Results that are not 128 bits wide
/// are built as a 128-bit SCALAR_TO_VECTOR inserted at index 0 of an undef
/// vector of the result type. 128-bit results are built by any-extending the
/// scalar to i32, forming a v4i32 SCALAR_TO_VECTOR and bitcasting to the
/// result type.
13715 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13717 MVT OpVT = Op.getSimpleValueType();
13719 // If this is not a 128-bit vector result (e.g. 256-bit), first insert into
13720 // a 128-bit vector and then insert that into the full-width vector.
13721 if (!OpVT.is128BitVector()) {
13722 // Insert into a 128-bit vector.
13723 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13724 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13725 OpVT.getVectorNumElements() / SizeFactor);
13727 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13729 // Insert the 128-bit vector.
13730 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13733 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13734 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13735 return DAG.getBitcast(
13736 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
13739 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13740 // a simple subregister reference or explicit instructions to grab
13741 // upper bits of a vector.
//
// Operand 0 is the source vector and operand 1 the index, which is cast to a
// ConstantSDNode. AVX is asserted, as is a 256-bit or 512-bit source. 128-bit
// and 256-bit results are forwarded to extract128BitVector and
// extract256BitVector respectively; any other result type is unreachable.
13742 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13743 SelectionDAG &DAG) {
13744 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
13747 SDValue In = Op.getOperand(0);
13748 SDValue Idx = Op.getOperand(1);
13749 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13750 MVT ResVT = Op.getSimpleValueType();
13752 assert((In.getSimpleValueType().is256BitVector() ||
13753 In.getSimpleValueType().is512BitVector()) &&
13754 "Can only extract from 256-bit or 512-bit vectors");
13756 if (ResVT.is128BitVector())
13757 return extract128BitVector(In, IdxVal, DAG, dl);
13758 if (ResVT.is256BitVector())
13759 return extract256BitVector(In, IdxVal, DAG, dl);
13761 llvm_unreachable("Unimplemented!");
/// Walks every use of \p N and tests whether the using node is absent from
/// \p ValidUsers, i.e. differs from the node of every value in that list.
/// Judging by the name, this presumably reports whether all users of \p N are
/// in \p ValidUsers - TODO confirm against the return statements.
13764 static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
13765 for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
// True when the current user matches none of the ValidUsers entries.
13766 if (llvm::all_of(ValidUsers,
13767 [&I](SDValue V) { return V.getNode() != *I; }))
13772 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13773 // simple superregister reference or explicit instructions to insert
13774 // the upper bits of a vector.
//
// Operand 0 is the destination vector, operand 1 the subvector to insert and
// operand 2 the element index (cast to a ConstantSDNode). Vectors of i1
// elements are delegated to insert1BitVector. Otherwise several folds are
// attempted when this node fills the upper half of a vector whose lower half
// was filled by another INSERT_SUBVECTOR, before falling back to
// insert128BitVector / insert256BitVector according to the subvector width.
13775 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13776 SelectionDAG &DAG) {
13777 assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
13780 SDValue Vec = Op.getOperand(0);
13781 SDValue SubVec = Op.getOperand(1);
13782 SDValue Idx = Op.getOperand(2);
13784 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13785 MVT OpVT = Op.getSimpleValueType();
13786 MVT SubVecVT = SubVec.getSimpleValueType();
// Mask (i1 element) vectors have their own lowering path.
13788 if (OpVT.getVectorElementType() == MVT::i1)
13789 return insert1BitVector(Op, DAG, Subtarget);
13791 assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13792 "Can only insert into 256-bit or 512-bit vectors");
13794 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
13796 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13797 // (load16 addr + 16), Elts/2)
13800 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13801 // (load32 addr + 32), Elts/2)
13803 // or a 16-byte or 32-byte broadcast:
13804 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13805 // (load16 addr), Elts/2)
13806 // --> X86SubVBroadcast(load16 addr)
13808 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13809 // (load32 addr), Elts/2)
13810 // --> X86SubVBroadcast(load32 addr)
// The folds only apply when this insert targets the upper half, the
// destination is itself an INSERT_SUBVECTOR, and the subvector is exactly half
// the width of the result.
13811 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
13812 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
13813 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
// The inner insert must have placed its subvector at constant index 0.
13814 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
13815 if (Idx2 && Idx2->getZExtValue() == 0) {
13816 SDValue SubVec2 = Vec.getOperand(1);
13817 // If needed, look through bitcasts to get to the load.
13818 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
// Only merge the two loads if a single full-width access at the first load's
// alignment and address space is both allowed and reported as fast.
13820 unsigned Alignment = FirstLd->getAlignment();
13821 unsigned AS = FirstLd->getAddressSpace();
13822 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
13823 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
13824 OpVT, AS, Alignment, &Fast) && Fast) {
13825 SDValue Ops[] = {SubVec2, SubVec};
13826 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
13830 // If lower/upper loads are the same and the only users of the load, then
13831 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
13832 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
13833 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
13834 areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
13835 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
13838 // If this is subv_broadcast insert into both halves, use a larger
13840 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
13841 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
13842 SubVec.getOperand(0));
// No fold applied: perform a plain insert sized by the subvector width.
13847 if (SubVecVT.is128BitVector())
13848 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
13850 if (SubVecVT.is256BitVector())
13851 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
13853 llvm_unreachable("Unimplemented!");
13856 // Returns the appropriate wrapper opcode for a global reference.
//
// GV may be null; in that case only the subtarget's PIC style and the code
// model are consulted. X86ISD::WrapperRIP is returned only for RIP-relative
// PIC style combined with the Small or Kernel code model; every other case,
// including any absolute symbol reference, yields X86ISD::Wrapper.
13857 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
13858 // References to absolute symbols are never PC-relative.
13859 if (GV && GV->isAbsoluteSymbolRef())
13860 return X86ISD::Wrapper;
13862 CodeModel::Model M = getTargetMachine().getCodeModel();
13863 if (Subtarget.isPICStyleRIPRel() &&
13864 (M == CodeModel::Small || M == CodeModel::Kernel))
13865 return X86ISD::WrapperRIP;
13867 return X86ISD::Wrapper;
13870 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
13871 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
13872 // one of the above mentioned nodes. It has to be wrapped because otherwise
13873 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
13874 // be used to form addressing mode. These wrapped nodes will be selected
// Lowers a ConstantPool node: builds the matching TargetConstantPool node
// (same constant, alignment and offset, tagged with the operand flag from
// Subtarget.classifyLocalReference), wraps it with the opcode chosen by
// getGlobalWrapperKind(), and may add the global base register to it.
// NOTE(review): the condition guarding the GlobalBaseReg addition and the
// final return are not visible in this excerpt.
13877 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
13878 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
13880 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13881 // global base reg.
13882 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13884 auto PtrVT = getPointerTy(DAG.getDataLayout());
13885 SDValue Result = DAG.getTargetConstantPool(
13886 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
13888 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13889 // With PIC, the address is actually $g + Offset.
13892 DAG.getNode(ISD::ADD, DL, PtrVT,
13893 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lowers a JumpTable node: builds the matching TargetJumpTable node for the
// same table index (tagged with the operand flag from
// Subtarget.classifyLocalReference), wraps it with the opcode chosen by
// getGlobalWrapperKind(), and may add the global base register to it.
// NOTE(review): the condition guarding the GlobalBaseReg addition and the
// final return are not visible in this excerpt.
13899 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
13900 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
13902 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13903 // global base reg.
13904 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
13906 auto PtrVT = getPointerTy(DAG.getDataLayout());
13907 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
13909 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13911 // With PIC, the address is actually $g + Offset.
13914 DAG.getNode(ISD::ADD, DL, PtrVT,
13915 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
// Lowers an ExternalSymbol node: builds the TargetExternalSymbol for the same
// symbol name, wraps it with the opcode chosen by getGlobalWrapperKind(), adds
// the global base register for position-independent 32-bit code, and loads
// the address through the GOT when the operand flag denotes a stub reference.
// The operand flag comes from classifyGlobalReference with a null GlobalValue
// and the current function's module.
13921 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
13922 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
13924 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
13925 // global base reg.
13926 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
13927 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
13929 auto PtrVT = getPointerTy(DAG.getDataLayout());
13930 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
13933 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
13935 // With PIC, the address is actually $g + Offset.
13936 if (isPositionIndependent() && !Subtarget.is64Bit()) {
13938 DAG.getNode(ISD::ADD, DL, PtrVT,
13939 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
13942 // For symbols that require a load from a stub to get the address, emit the
13944 if (isGlobalStubReference(OpFlag))
13945 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
13946 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
// Lowers a BlockAddress node: builds the TargetBlockAddress for the same
// block address and offset (tagged with the flags from
// Subtarget.classifyBlockAddressReference), wraps it with the opcode chosen by
// getGlobalWrapperKind(), and adds the global base register when the flags
// mark the reference as relative to the PIC base.
13952 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
13953 // Create the TargetBlockAddress node.
13954 unsigned char OpFlags =
13955 Subtarget.classifyBlockAddressReference();
13956 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
13957 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
13959 auto PtrVT = getPointerTy(DAG.getDataLayout());
13960 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
13961 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
13963 // With PIC, the address is actually $g + Offset.
13964 if (isGlobalRelativeToPICBase(OpFlags)) {
13965 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13966 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
// Builds the address of global GV plus Offset.
//
// When the reference needs no operand flags and the offset suits the code
// model, the offset is folded directly into the TargetGlobalAddress node.
// Otherwise the node is created with offset 0 and the classified flags. The
// node is then wrapped (getGlobalWrapperKind(GV)), made relative to the PIC
// base if the flags require it, and loaded through the GOT if the flags denote
// a stub reference. An unfolded offset is added explicitly at the end.
// NOTE(review): the guard on the final ADD and the return statement are not
// visible in this excerpt.
13972 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
13973 const SDLoc &dl, int64_t Offset,
13974 SelectionDAG &DAG) const {
13975 // Create the TargetGlobalAddress node, folding in the constant
13976 // offset if it is legal.
13977 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
13978 CodeModel::Model M = DAG.getTarget().getCodeModel();
13979 auto PtrVT = getPointerTy(DAG.getDataLayout());
13981 if (OpFlags == X86II::MO_NO_FLAG &&
13982 X86::isOffsetSuitableForCodeModel(Offset, M)) {
13983 // A direct static reference to a global.
13984 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
13987 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
13990 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
13992 // With PIC, the address is actually $g + Offset.
13993 if (isGlobalRelativeToPICBase(OpFlags)) {
13994 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
13995 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
13998 // For globals that require a load from a stub to get the address, emit the
14000 if (isGlobalStubReference(OpFlags))
14001 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14002 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14004 // If there was a non-zero offset that we didn't fold, create an explicit
14005 // addition for it.
14007 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14008 DAG.getConstant(Offset, dl, PtrVT));
// Node-based entry point: extracts the global and offset from the
// GlobalAddressSDNode and forwards to the (GV, SDLoc, Offset, DAG) overload.
14014 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14015 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14016 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14017 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
// Emits the TLS address call node for GA and returns the result copied out of
// ReturnReg.
//
// Chain is the incoming chain; InFlag, when used, supplies a glue operand
// (callers in this file pass either a pointer to a glue value or nullptr).
// The node opcode is X86ISD::TLSBASEADDR when LocalDynamic is set; the
// alternative opcode is on a line not visible in this excerpt. The node
// produces a chain and a glue value, and the glue feeds the final CopyFromReg.
14021 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14022 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14023 unsigned char OperandFlags, bool LocalDynamic = false) {
14024 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14025 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14027 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14028 GA->getValueType(0),
14032 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
// Two operand lists: with and without the incoming glue value.
14036 SDValue Ops[] = { Chain, TGA, *InFlag };
14037 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14039 SDValue Ops[] = { Chain, TGA };
14040 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14043 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14044 MFI.setAdjustsStack(true);
14045 MFI.setHasCalls(true);
// Value 1 of the call node is its glue result.
14047 SDValue Flag = Chain.getValue(1);
14048 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14051 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
//
// Copies the global base register into EBX, then emits the TLS address call
// via GetTLSADDR with the MO_TLSGD operand flag; the result is read from EAX.
14053 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14056 SDLoc dl(GA); // ? function entry point might be better
14057 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14058 DAG.getNode(X86ISD::GlobalBaseReg,
14059 SDLoc(), PtrVT), InFlag);
// Glue the EBX copy to the TLS call that follows.
14060 InFlag = Chain.getValue(1);
14062 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14065 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
//
// Emits the TLS address call via GetTLSADDR from the entry node with no
// incoming glue and the MO_TLSGD operand flag; the result is read from RAX.
14067 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14069 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14070 X86::RAX, X86II::MO_TLSGD);
// Lower ISD::GlobalTLSAddress using the "local dynamic" model.
//
// Records the access in X86MachineFunctionInfo, obtains the module's TLS block
// base through GetTLSADDR (LocalDynamic=true), then adds the variable's
// x@dtpoff offset to that base. Two base computations are visible: one reading
// RAX with MO_TLSLD and no glue, and one that first copies the global base
// register into EBX and reads EAX with MO_TLSLDM.
// NOTE(review): the branch selecting between them is not visible in this
// excerpt; presumably it is the 64-bit vs 32-bit split - confirm.
14073 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14079 // Get the start address of the TLS block for this module.
14080 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14081 .getInfo<X86MachineFunctionInfo>();
14082 MFI->incNumLocalDynamicTLSAccesses();
14086 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14087 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14090 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14091 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14092 InFlag = Chain.getValue(1);
14093 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14094 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14097 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
// Build the wrapped x@dtpoff reference for this variable.
14101 unsigned char OperandFlags = X86II::MO_DTPOFF;
14102 unsigned WrapperKind = X86ISD::Wrapper;
14103 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14104 GA->getValueType(0),
14105 GA->getOffset(), OperandFlags);
14106 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14108 // Add x@dtpoff with the base.
14109 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14112 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
//
// Loads the thread pointer from address 0 in a segment address space (257
// when is64Bit, 256 otherwise), builds the wrapped TLS offset for GA with
// operand flags chosen by model / is64Bit / isPIC, and returns their sum.
// For the initial-exec model the offset is itself loaded from the GOT, after
// adding the global base register for 32-bit PIC. Any model other than
// LocalExec or InitialExec is unreachable.
14113 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14114 const EVT PtrVT, TLSModel::Model model,
14115 bool is64Bit, bool isPIC) {
14118 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14119 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14120 is64Bit ? 257 : 256));
14122 SDValue ThreadPointer =
14123 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14124 MachinePointerInfo(Ptr));
14126 unsigned char OperandFlags = 0;
14127 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14129 unsigned WrapperKind = X86ISD::Wrapper;
14130 if (model == TLSModel::LocalExec) {
14131 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14132 } else if (model == TLSModel::InitialExec) {
// NOTE(review): the condition separating the two assignments below is not
// visible here; the RIP-relative wrapper presumably applies to 64-bit only.
14134 OperandFlags = X86II::MO_GOTTPOFF;
14135 WrapperKind = X86ISD::WrapperRIP;
14137 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14140 llvm_unreachable("Unexpected model");
14143 // emit "addl x@ntpoff,%eax" (local exec)
14144 // or "addl x@indntpoff,%eax" (initial exec)
14145 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14147 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14148 GA->getOffset(), OperandFlags);
14149 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14151 if (model == TLSModel::InitialExec) {
14152 if (isPIC && !is64Bit) {
14153 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14154 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
// Initial exec: the thread-pointer offset is stored in the GOT, so load it.
14158 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14159 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14162 // The address of the thread local variable is the add of the thread
14163 // pointer with the offset of the variable.
14164 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
// Lowers a thread-local global address.
//
// Emulated TLS (a target option) is handled first via LowerToTLSEmulatedModel.
// Otherwise the lowering depends on the target:
//  - ELF: dispatch on the TLS model to the general-dynamic, local-dynamic or
//    exec-model helpers.
//  - Darwin: wrap a TLVP reference and emit an X86ISD::TLSCALL bracketed by
//    CALLSEQ_START/CALLSEQ_END, reading the result from RAX or EAX.
//  - Windows (MSVC, Itanium, GNU): use the implicit TLS layout, combining the
//    thread's TLS array, the _tls_index slot and the variable's section-
//    relative offset.
// Any other target is unreachable.
14168 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14170 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14172 if (DAG.getTarget().Options.EmulatedTLS)
14173 return LowerToTLSEmulatedModel(GA, DAG);
14175 const GlobalValue *GV = GA->getGlobal();
14176 auto PtrVT = getPointerTy(DAG.getDataLayout());
14177 bool PositionIndependent = isPositionIndependent();
14179 if (Subtarget.isTargetELF()) {
14180 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14182 case TLSModel::GeneralDynamic:
14183 if (Subtarget.is64Bit())
14184 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14185 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14186 case TLSModel::LocalDynamic:
14187 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14188 Subtarget.is64Bit());
14189 case TLSModel::InitialExec:
14190 case TLSModel::LocalExec:
14191 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14192 PositionIndependent);
14194 llvm_unreachable("Unknown TLS model.");
14197 if (Subtarget.isTargetDarwin()) {
14198 // Darwin only has one model of TLS. Lower to that.
14199 unsigned char OpFlag = 0;
14200 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14201 X86ISD::WrapperRIP : X86ISD::Wrapper;
14203 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14204 // global base reg.
14205 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
// NOTE(review): the branch choosing between these two flags is not visible
// here; presumably it tests PIC32 - confirm.
14207 OpFlag = X86II::MO_TLVP_PIC_BASE;
14209 OpFlag = X86II::MO_TLVP;
14211 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14212 GA->getValueType(0),
14213 GA->getOffset(), OpFlag);
14214 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14216 // With PIC32, the address is actually $g + Offset.
14218 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14219 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14222 // Lowering the machine isd will make sure everything is in the right
14224 SDValue Chain = DAG.getEntryNode();
14225 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
// The TLS call is bracketed by a zero-sized call sequence.
14226 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14227 SDValue Args[] = { Chain, Offset };
14228 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14229 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14230 DAG.getIntPtrConstant(0, DL, true),
14231 Chain.getValue(1), DL);
14233 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14234 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14235 MFI.setAdjustsStack(true);
14237 // And our return value (tls address) is in the standard call return value
14239 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14240 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14243 if (Subtarget.isTargetKnownWindowsMSVC() ||
14244 Subtarget.isTargetWindowsItanium() ||
14245 Subtarget.isTargetWindowsGNU()) {
14246 // Just use the implicit TLS architecture
14247 // Need to generate something similar to:
14248 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14250 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14251 // mov rcx, qword [rdx+rcx*8]
14252 // mov eax, .tls$:tlsvar
14253 // [rax+rcx] contains the address
14254 // Windows 64bit: gs:0x58
14255 // Windows 32bit: fs:__tls_array
14258 SDValue Chain = DAG.getEntryNode();
14260 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14261 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14262 // use its literal value of 0x2C.
14263 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14264 ? Type::getInt8PtrTy(*DAG.getContext(),
14266 : Type::getInt32PtrTy(*DAG.getContext(),
14269 SDValue TlsArray = Subtarget.is64Bit()
14270 ? DAG.getIntPtrConstant(0x58, dl)
14271 : (Subtarget.isTargetWindowsGNU()
14272 ? DAG.getIntPtrConstant(0x2C, dl)
14273 : DAG.getExternalSymbol("_tls_array", PtrVT));
14275 SDValue ThreadPointer =
14276 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
// Local-exec variables use the TLS array pointer directly, with no
// _tls_index lookup.
14279 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14280 res = ThreadPointer;
14282 // Load the _tls_index variable
14283 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14284 if (Subtarget.is64Bit())
14285 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14286 MachinePointerInfo(), MVT::i32);
14288 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
// Scale the index by the pointer size (as a shift amount) to address the
// module's slot in the TLS array.
14290 auto &DL = DAG.getDataLayout();
14292 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14293 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
14295 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14298 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14300 // Get the offset of start of .tls section
14301 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14302 GA->getValueType(0),
14303 GA->getOffset(), X86II::MO_SECREL);
14304 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14306 // The address of the thread local variable is the add of the thread
14307 // pointer with the offset of the variable.
14308 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14311 llvm_unreachable("TLS not implemented for this target.");
14314 /// Lower SRA_PARTS and friends, which return two i32 values
14315 /// and take a 2 x i32 value to shift plus a shift amount.
///
/// Operands are (low part, high part, shift amount). Two candidate values are
/// computed per result part: a double-shift (SHLD/SHRD) result for amounts
/// below the part width and a single-part shift result for larger amounts.
/// A test of the VTBits bit of the shift amount drives CMOVs that pick the
/// right candidates; the {Lo, Hi} pair is returned as merged values.
14316 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14317 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14318 MVT VT = Op.getSimpleValueType();
14319 unsigned VTBits = VT.getSizeInBits();
14321 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14322 SDValue ShOpLo = Op.getOperand(0);
14323 SDValue ShOpHi = Op.getOperand(1);
14324 SDValue ShAmt = Op.getOperand(2);
14325 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14326 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14328 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14329 DAG.getConstant(VTBits - 1, dl, MVT::i8));
// Tmp1 is the fill value for the vacated part: the high part shifted right
// arithmetically by VTBits - 1 for SRA, zero otherwise.
14330 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14331 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14332 : DAG.getConstant(0, dl, VT);
// Tmp2 is the double-shift result, Tmp3 the single-part shift result.
14334 SDValue Tmp2, Tmp3;
14335 if (Op.getOpcode() == ISD::SHL_PARTS) {
14336 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14337 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14339 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14340 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14343 // If the shift amount is larger or equal than the width of a part we can't
14344 // rely on the results of shld/shrd. Insert a test and select the appropriate
14345 // values for large shift amounts.
14346 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14347 DAG.getConstant(VTBits, dl, MVT::i8));
14348 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14349 AndNode, DAG.getConstant(0, dl, MVT::i8));
// Both CMOVs share the COND_NE condition on the comparison above.
14352 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14353 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14354 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
// For left shifts the double-shift candidate feeds Hi; for right shifts it
// feeds Lo.
14356 if (Op.getOpcode() == ISD::SHL_PARTS) {
14357 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14358 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14360 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14361 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14364 SDValue Ops[2] = { Lo, Hi };
14365 return DAG.getMergeValues(Ops, dl);
// Lowers a signed integer to floating-point conversion.
//
// Vector sources: v2i32 -> v2f64 is widened to v4i32 with an undef upper half
// and converted with X86ISD::CVTSI2P; i1-element vectors are sign-extended to
// an integer vector (v2i64 for a legal v2i1, otherwise i32 elements) and
// converted again with ISD::SINT_TO_FP.
// Scalar sources (i16..i64): the i32, and on 64-bit targets i64, conversions
// to an SSE-register FP type are treated as legal. Everything else stores the
// integer to a fresh stack slot and converts it through BuildFILD.
// NOTE(review): the returns for the "really Legal" cases and the vector
// fall-through are on lines not visible in this excerpt.
14368 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14369 SelectionDAG &DAG) const {
14370 SDValue Src = Op.getOperand(0);
14371 MVT SrcVT = Src.getSimpleValueType();
14372 MVT VT = Op.getSimpleValueType();
14375 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14376 if (SrcVT.isVector()) {
14377 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14378 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14379 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14380 DAG.getUNDEF(SrcVT)));
14382 if (SrcVT.getVectorElementType() == MVT::i1) {
14383 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14384 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14385 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14386 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14387 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14388 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14393 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14394 "Unknown SINT_TO_FP to lower!");
14396 // These are really Legal; return the operand so the caller accepts it as
14398 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14400 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14401 Subtarget.is64Bit()) {
14405 SDValue ValueToStore = Op.getOperand(0);
14406 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14407 !Subtarget.is64Bit())
14408 // Bitcasting to f64 here allows us to do a single 64-bit store from
14409 // an SSE register, avoiding the store forwarding penalty that would come
14410 // with two 32-bit stores.
14411 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
// Spill the integer to a stack slot sized and aligned to the source type so
// BuildFILD can load it from memory.
14413 unsigned Size = SrcVT.getSizeInBits()/8;
14414 MachineFunction &MF = DAG.getMachineFunction();
14415 auto PtrVT = getPointerTy(MF.getDataLayout());
14416 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14417 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14418 SDValue Chain = DAG.getStore(
14419 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14420 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14421 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
// Builds an integer-load-and-convert memory intrinsic node that reads a value
// of type SrcVT from StackSlot on Chain and produces Op's FP type.
//
// StackSlot is either a frame index, for which a fixed-stack load memory
// operand is created, or an existing load, whose memory operand is reused and
// whose address operand replaces StackSlot. When the result type lives in an
// SSE register, the node produces f64 plus glue (X86ISD::FILD_FLAG); the value
// is then stored to a fresh stack slot with X86ISD::FST and reloaded as Op's
// type.
// NOTE(review): the conditions selecting these paths and the final return are
// on lines not visible in this excerpt.
14424 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14426 SelectionDAG &DAG) const {
14430 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14432 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14434 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14436 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14438 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14439 MachineMemOperand *MMO;
14441 int SSFI = FI->getIndex();
14442 MMO = DAG.getMachineFunction().getMachineMemOperand(
14443 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14444 MachineMemOperand::MOLoad, ByteSize, ByteSize);
// Not a frame index: StackSlot is a load, so reuse its memory operand and take
// its operand 1 as the address.
14446 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14447 StackSlot = StackSlot.getOperand(1);
14449 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14450 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14452 Tys, Ops, SrcVT, MMO);
// Value 1 is the output chain and value 2 the glue of the node above.
14455 Chain = Result.getValue(1);
14456 SDValue InFlag = Result.getValue(2);
14458 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14459 // shouldn't be necessary except that RFP cannot be live across
14460 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14461 MachineFunction &MF = DAG.getMachineFunction();
14462 unsigned SSFISize = Op.getValueSizeInBits()/8;
14463 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14464 auto PtrVT = getPointerTy(MF.getDataLayout());
14465 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14466 Tys = DAG.getVTList(MVT::Other);
14468 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14470 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14471 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14472 MachineMemOperand::MOStore, SSFISize, SSFISize);
14474 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14475 Ops, Op.getValueType(), MMO);
// Reload the stored value in the destination type.
14476 Result = DAG.getLoad(
14477 Op.getValueType(), DL, Chain, StackSlot,
14478 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14484 /// 64-bit unsigned integer to double expansion.
///
/// Interleaves the two 32-bit halves of the input with the constants
/// 0x43300000 and 0x45300000, reinterprets the result as two doubles,
/// subtracts the constant pair (bit patterns 0x4330000000000000 and
/// 0x4530000000000000), and sums the two lanes. The sum uses FHADD when SSE3
/// is available, otherwise a shuffle plus FADD. Lane 0 is returned as f64.
14485 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14486 SelectionDAG &DAG) const {
14487 // This algorithm is not obvious. Here it is what we're trying to output:
14490 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14491 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14493 haddpd %xmm0, %xmm0
14495 pshufd $0x4e, %xmm0, %xmm1
14501 LLVMContext *Context = DAG.getContext();
14503 // Build some magic constants.
14504 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14505 Constant *C0 = ConstantDataVector::get(*Context, CV0);
14506 auto PtrVT = getPointerTy(DAG.getDataLayout());
14507 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
14509 SmallVector<Constant*,2> CV1;
14511 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14512 APInt(64, 0x4330000000000000ULL))));
14514 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14515 APInt(64, 0x4530000000000000ULL))));
14516 Constant *C1 = ConstantVector::get(CV1);
14517 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
14519 // Load the 64-bit value into an XMM register.
14520 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14523 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14524 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14525 /* Alignment = */ 16);
14527 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
// The second constant load is chained after the first one.
14530 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14531 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14532 /* Alignment = */ 16);
14533 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
14534 // TODO: Are there any fast-math-flags to propagate here?
14535 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14538 if (Subtarget.hasSSE3()) {
14539 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14540 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
// Without SSE3: swap the two 64-bit lanes (as v4i32 {2,3,0,1}) and add.
14542 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
14543 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
14544 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14545 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
14548 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14549 DAG.getIntPtrConstant(0, dl));
14552 /// 32-bit unsigned integer to float expansion.
///
/// Places the 32-bit input in the low lane of a vector with the upper lanes
/// zeroed, ORs it with the bit pattern of the bias constant
/// 0x4330000000000000, subtracts the bias as an f64, and then rounds or
/// extends the f64 result to the destination type when that type is narrower
/// or wider than f64.
14553 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14554 SelectionDAG &DAG) const {
14556 // FP constant to bias correct the final result.
14557 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
14560 // Load the 32-bit value into an XMM register.
14561 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14564 // Zero out the upper parts of the register.
14565 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
// Reinterpret the low 64 bits of the vector as a scalar f64.
14567 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14568 DAG.getBitcast(MVT::v2f64, Load),
14569 DAG.getIntPtrConstant(0, dl));
14571 // Or the load with the bias.
14572 SDValue Or = DAG.getNode(
14573 ISD::OR, dl, MVT::v2i64,
14574 DAG.getBitcast(MVT::v2i64,
14575 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
14576 DAG.getBitcast(MVT::v2i64,
14577 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
14579 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14580 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
14582 // Subtract the bias.
14583 // TODO: Are there any fast-math-flags to propagate here?
14584 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14586 // Handle final rounding.
14587 MVT DestVT = Op.getSimpleValueType();
14589 if (DestVT.bitsLT(MVT::f64))
14590 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14591 DAG.getIntPtrConstant(0, dl));
14592 if (DestVT.bitsGT(MVT::f64))
14593 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14595 // Otherwise DestVT is neither narrower nor wider than f64.
// Lowers a v2i32 -> v2f64 unsigned integer to FP conversion.
//
// Only applies when the result type is v2f64 (the early exit for other types
// is on a line not visible in this excerpt). The input is widened to v4i32
// with an undef upper half. With AVX-512 a single X86ISD::CVTUI2P is used.
// Otherwise each element is split into its upper and lower 16 bits, both are
// converted with the signed X86ISD::CVTSI2P, the upper half is multiplied by
// 2^16, and the two are added.
14599 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
14600 const X86Subtarget &Subtarget, SDLoc &DL) {
14601 if (Op.getSimpleValueType() != MVT::v2f64)
14604 SDValue N0 = Op.getOperand(0);
14605 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
14607 // Legalize to v4i32 type.
14608 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
14609 DAG.getUNDEF(MVT::v2i32));
14611 if (Subtarget.hasAVX512())
14612 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
14614 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
14615 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
14616 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
14617 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
14619 // Two to the power of half-word-size.
14620 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
14622 // Clear upper part of LO, lower HI.
14623 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
14624 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
// Both halves are below 2^16, so the signed conversion is exact for them.
14626 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
14627 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
14628 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
14630 // Add the two halves.
14631 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
/// Custom lowering of a vector UINT_TO_FP whose integer operand is v4i32 or
/// v8i32 (asserted below); the float type used throughout is the same-width
/// v4f32 / v8f32. Each 32-bit lane is split into its low and high 16-bit
/// halves, each half is merged into a float bit pattern, and the two floats
/// are recombined with two FADDs, as spelled out in the pseudo-code below.
14634 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14635 const X86Subtarget &Subtarget) {
14636 // The algorithm is the following:
14637 // #ifdef __SSE4_1__
14638 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14639 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14640 // (uint4) 0x53000000, 0xaa);
14642 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14643 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14645 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14646 // return (float4) lo + fhi;
14648 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
14649 // reassociate the two FADDs, and if we do that, the algorithm fails
14650 // spectacularly (PR24512).
14651 // FIXME: If we ever have some kind of Machine FMF, this should be marked
14652 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
14653 // there's also the MachineCombiner reassociations happening on Machine IR.
// NOTE(review): the statement guarded by this check is not visible in this
// excerpt; per the comment above it is presumably the early bail-out.
14654 if (DAG.getTarget().Options.UnsafeFPMath)
14658 SDValue V = Op->getOperand(0);
14659 MVT VecIntVT = V.getSimpleValueType();
// The float vector type is chosen purely from the integer operand type:
// v4i32 pairs with v4f32, anything else is treated as the 256-bit v8f32 case.
14660 bool Is128 = VecIntVT == MVT::v4i32;
14661 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14662 // If we convert to something else than the supported type, e.g., to v4f64,
// NOTE(review): the rest of this comment and the statement guarded by the
// result-type check below are not visible in this excerpt.
14664 if (VecFloatVT != Op->getSimpleValueType(0))
14667 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14668 "Unsupported custom type");
14670 // In the #ifdef/#else code, we have in common:
14671 // - The vector of constants:
14677 // Create the splat vector for 0x4b000000.
14678 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
14679 // Create the splat vector for 0x53000000.
14680 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
14682 // Create the right shift.
// HighShift holds the upper 16 bits of every lane, moved down to bits 0..15.
14683 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
14684 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
// SSE4.1 form of the pseudo-code: the merge with the constants is done with
// 16-bit-element blends (immediate 0xaa) on the i16-bitcast vectors.
14687 if (Subtarget.hasSSE41()) {
14688 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14689 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14690 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
14691 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
14692 // Low will be bitcasted right away, so do not bother bitcasting back to its
14694 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14695 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14696 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14697 // (uint4) 0x53000000, 0xaa);
14698 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
14699 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
14700 // High will be bitcasted right away, so do not bother bitcasting back to
14701 // its original type.
14702 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14703 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
// Non-blend form of the pseudo-code (the #else variant): mask/shift the
// input and OR in the constants.
// NOTE(review): the line separating this from the SSE4.1 branch is not
// visible in this excerpt.
14705 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
14706 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14707 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14708 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14710 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14711 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14714 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
// The constant is built directly from its IEEE single bit pattern 0xD3000080.
14715 SDValue VecCstFAdd = DAG.getConstantFP(
14716 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
14718 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// The subtraction is expressed as an FADD of the negated constant above.
14719 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
14720 // TODO: Are there any fast-math-flags to propagate here?
14722 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14723 // return (float4) lo + fhi;
14724 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
14725 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
/// Vector part of the UINT_TO_FP custom lowering: inspects the type of
/// operand 0 and either widens the integer vector and re-emits a conversion
/// node, or forwards to one of the lowerUINT_TO_FP_v* helpers.
14728 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14729 SelectionDAG &DAG) const {
14730 SDValue N0 = Op.getOperand(0);
14731 MVT SrcVT = N0.getSimpleValueType();
// i1-element sources are zero-extended to a wider integer vector and a new
// UINT_TO_FP is emitted on the widened value: v2i1 goes to v2i64, every other
// i1 vector goes to a vector of i32 with the same element count.
14734 if (SrcVT.getVectorElementType() == MVT::i1) {
14735 if (SrcVT == MVT::v2i1)
14736 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14737 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
14738 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14739 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14740 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
// NOTE(review): the case labels of this switch are not visible in this
// excerpt, so which source types reach each arm below cannot be confirmed here.
14743 switch (SrcVT.SimpleTy) {
14745 llvm_unreachable("Custom UINT_TO_FP is not supported!");
// Zero-extend to i32 elements and use the *signed* conversion; presumably the
// sources reaching this arm are narrower than i32, so the extended value is
// never negative -- TODO confirm against the case labels.
14750 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14751 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14752 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14755 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
14758 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
// This arm requires AVX-512: the source is zero-extended to v16i32 and a new
// UINT_TO_FP node is emitted on the widened vector.
14761 assert(Subtarget.hasAVX512());
14762 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14763 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
/// Custom lowering entry point for ISD::UINT_TO_FP. In order it tries: a
/// rewrite to SINT_TO_FP when the sign bit is known zero, the vector path,
/// the AVX-512 / SSE specific scalar paths, and finally an x87 FILD-based
/// sequence that goes through a 64-bit stack temporary.
14767 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14768 SelectionDAG &DAG) const {
14769 SDValue N0 = Op.getOperand(0);
14771 auto PtrVT = getPointerTy(DAG.getDataLayout());
14773 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14774 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14775 // the optimization here.
14776 if (DAG.SignBitIsZero(N0))
14777 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14779 if (Op.getSimpleValueType().isVector())
14780 return lowerUINT_TO_FP_vec(Op, DAG);
14782 MVT SrcVT = N0.getSimpleValueType();
14783 MVT DstVT = Op.getSimpleValueType();
// NOTE(review): the body of this AVX-512 check is not visible in this
// excerpt beyond the comment it contains.
14785 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
14786 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
14787 // Conversions from unsigned i32 to f32/f64 are legal,
14788 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
// Scalar cases with dedicated helpers: i64 -> f64 and any i32 source, both
// only when X86ScalarSSEf64 is set.
14792 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14793 return LowerUINT_TO_FP_i64(Op, DAG);
14794 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14795 return LowerUINT_TO_FP_i32(Op, DAG);
// NOTE(review): the statement guarded by this check is not visible in this
// excerpt.
14796 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14799 // Make a 64-bit buffer, and use it to build an FILD.
14800 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
// i32 source: store the value at offset 0 and an i32 zero at offset 4, so the
// 8-byte slot holds the input zero-extended to 64 bits, then FILD it as i64.
14801 if (SrcVT == MVT::i32) {
14802 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
14803 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14804 StackSlot, MachinePointerInfo());
14805 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
14806 OffsetSlot, MachinePointerInfo());
14807 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
// From here on the source must be i64.
14811 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
14812 SDValue ValueToStore = Op.getOperand(0);
14813 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
14814 // Bitcasting to f64 here allows us to do a single 64-bit store from
14815 // an SSE register, avoiding the store forwarding penalty that would come
14816 // with two 32-bit stores.
14817 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14818 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14819 MachinePointerInfo());
14820 // For i64 source, we need to add the appropriate power of 2 if the input
14821 // was negative. This is the same as the optimization in
14822 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
14823 // we must be careful to do the computation in x87 extended precision, not
14824 // in SSE. (The generic code can't know it's OK to do this, or how to.)
// Memory operand describing the 8-byte, 8-byte-aligned load from the slot.
14825 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
14826 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14827 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14828 MachineMemOperand::MOLoad, 8, 8);
// The FILD reads the slot as i64 and produces an f80 value plus a chain.
14830 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
14831 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
14832 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
// 0x5F800000 is the IEEE single-precision bit pattern of 2^64, i.e. the
// correction needed when the i64 input was read as a negative signed value.
14835 APInt FF(32, 0x5F800000ULL);
14837 // Check whether the sign bit is set.
14838 SDValue SignSet = DAG.getSetCC(
14839 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
14840 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
14842 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
14843 SDValue FudgePtr = DAG.getConstantPool(
14844 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
14846 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
// NOTE(review): the value operands of the SELECT below are on a line that is
// not visible in this excerpt.
14847 SDValue Zero = DAG.getIntPtrConstant(0, dl);
14848 SDValue Four = DAG.getIntPtrConstant(4, dl);
14849 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
14851 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
14853 // Load the value out, extending it from f32 to f80.
14854 // FIXME: Avoid the extend by constructing the right constant pool?
14855 SDValue Fudge = DAG.getExtLoad(
14856 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
14857 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
14858 /* Alignment = */ 4);
14859 // Extend everything to 80 bits to force it to be done on x87.
14860 // TODO: Are there any fast-math-flags to propagate here?
14861 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
// Round the f80 sum down to the requested destination type.
14862 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
14863 DAG.getIntPtrConstant(0, dl));
14866 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
14867 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
14868 // just return an <SDValue(), SDValue()> pair.
14869 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
14870 // to i16, i32 or i64, and we lower it to a legal sequence.
14871 // If lowered to the final integer result we return a <result, SDValue()> pair.
14872 // Otherwise we lower it to a sequence ending with a FIST, return a
14873 // <FIST, StackSlot> pair, and the caller is responsible for loading
14874 // the final integer result from StackSlot.
//
// IsReplace only affects how the two 32-bit halves are packaged in the
// unsigned-fixup path on non-64-bit subtargets: BUILD_PAIR when true, merged
// values when false.
14875 std::pair<SDValue,SDValue>
14876 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
14877 bool IsSigned, bool IsReplace) const {
// DstTy is the integer result type, TheVT the floating-point source type.
14880 EVT DstTy = Op.getValueType();
14881 EVT TheVT = Op.getOperand(0).getValueType();
14882 auto PtrVT = getPointerTy(DAG.getDataLayout());
14884 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
14885 // f16 must be promoted before using the lowering in this routine.
14886 // fp128 does not use this lowering.
14887 return std::make_pair(SDValue(), SDValue());
14890 // If using FIST to compute an unsigned i64, we'll need some fixup
14891 // to handle values above the maximum signed i64. A FIST is always
14892 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
14893 bool UnsignedFixup = !IsSigned &&
14894 DstTy == MVT::i64 &&
14895 (!Subtarget.is64Bit() ||
14896 !isScalarFPTypeInSSEReg(TheVT));
// NOTE(review): only the comment and assert of this block are visible in this
// excerpt; the statement that performs the described replacement is not.
14898 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
14899 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
14900 // The low 32 bits of the fist result will have the correct uint32 result.
14901 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
14905 assert(DstTy.getSimpleVT() <= MVT::i64 &&
14906 DstTy.getSimpleVT() >= MVT::i16 &&
14907 "Unknown FP_TO_INT to lower!");
14909 // These are really Legal.
// i32 results from an SSE-register source, and i64 results from an
// SSE-register source on 64-bit subtargets, are reported as "nothing to do".
14910 if (DstTy == MVT::i32 &&
14911 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14912 return std::make_pair(SDValue(), SDValue());
14913 if (Subtarget.is64Bit() &&
14914 DstTy == MVT::i64 &&
14915 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
14916 return std::make_pair(SDValue(), SDValue());
14918 // We lower FP->int64 into FISTP64 followed by a load from a temporary
// The stack object is sized and aligned to the destination integer's byte
// size.
14920 MachineFunction &MF = DAG.getMachineFunction();
14921 unsigned MemSize = DstTy.getSizeInBits()/8;
14922 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
14923 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
// Pick the store-to-memory conversion opcode that matches the result width.
14926 switch (DstTy.getSimpleVT().SimpleTy) {
14927 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
14928 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
14929 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
14930 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
14933 SDValue Chain = DAG.getEntryNode();
14934 SDValue Value = Op.getOperand(0);
14935 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
14937 if (UnsignedFixup) {
14939 // Conversion to unsigned i64 is implemented with a select,
14940 // depending on whether the source value fits in the range
14941 // of a signed i64. Let Thresh be the FP equivalent of
14942 // 0x8000000000000000ULL.
14944 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
14945 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
14946 // Fist-to-mem64 FistSrc
14947 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
14948 // to XOR'ing the high 32 bits with Adjust.
14950 // Being a power of 2, Thresh is exactly representable in all FP formats.
14951 // For X87 we'd like to use the smallest FP type for this constant, but
14952 // for DAG type consistency we have to match the FP operand type.
// 0x5f000000 is the IEEE single-precision bit pattern of 2^63; it is widened
// below when the operand is f64 or f80.
14954 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
14955 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
14956 bool LosesInfo = false;
14957 if (TheVT == MVT::f64)
14958 // The rounding mode is irrelevant as the conversion should be exact.
14959 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
14961 else if (TheVT == MVT::f80)
14962 Status = Thresh.convert(APFloat::x87DoubleExtended(),
14963 APFloat::rmNearestTiesToEven, &LosesInfo);
14965 assert(Status == APFloat::opOK && !LosesInfo &&
14966 "FP conversion should have been exact");
14968 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
// Adjust = (Value < Thresh) ? 0 : 0x80000000.
14970 SDValue Cmp = DAG.getSetCC(DL,
14971 getSetCCResultType(DAG.getDataLayout(),
14972 *DAG.getContext(), TheVT),
14973 Value, ThreshVal, ISD::SETLT);
14974 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
14975 DAG.getConstant(0, DL, MVT::i32),
14976 DAG.getConstant(0x80000000, DL, MVT::i32));
// Value = (Value < Thresh) ? Value : (Value - Thresh).
// NOTE(review): this setcc is built with exactly the same operands and
// condition as the one above.
14977 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
14978 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
14979 *DAG.getContext(), TheVT),
14980 Value, ThreshVal, ISD::SETLT);
14981 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
14984 // FIXME This causes a redundant load/store if the SSE-class value is already
14985 // in memory, such as if it is on the callstack.
// A source living in an SSE register is first stored to the stack slot and
// read back through an X86ISD::FLD node; a second stack object is then
// created to receive the integer result.
14986 if (isScalarFPTypeInSSEReg(TheVT)) {
14987 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
14988 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
14989 MachinePointerInfo::getFixedStack(MF, SSFI));
14990 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
14992 Chain, StackSlot, DAG.getValueType(TheVT)
14995 MachineMemOperand *MMO =
14996 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
14997 MachineMemOperand::MOLoad, MemSize, MemSize);
14998 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
14999 Chain = Value.getValue(1);
15000 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15001 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
// Memory operand for the store performed by the FIST node built below.
15004 MachineMemOperand *MMO =
15005 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15006 MachineMemOperand::MOStore, MemSize, MemSize);
15008 if (UnsignedFixup) {
15010 // Insert the FIST, load its result as two i32's,
15011 // and XOR the high i32 with Adjust.
15013 SDValue FistOps[] = { Chain, Value, StackSlot };
15014 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15015 FistOps, DstTy, MMO);
// Both 32-bit loads are chained on the FIST; the high half is read from
// StackSlot + 4.
15018 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15019 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15022 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15023 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15025 if (Subtarget.is64Bit()) {
15026 // Join High32 and Low32 into a 64-bit result.
15027 // (High32 << 32) | Low32
15028 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15029 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15030 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15031 DAG.getConstant(32, DL, MVT::i8));
15032 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15033 return std::make_pair(Result, SDValue());
// Non-64-bit subtarget: hand back the two halves, either as one i64
// BUILD_PAIR (IsReplace) or as merged values.
15036 SDValue ResultOps[] = { Low32, High32 };
15038 SDValue pair = IsReplace
15039 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15040 : DAG.getMergeValues(ResultOps, DL);
15041 return std::make_pair(pair, SDValue());
15043 // Build the FP_TO_INT*_IN_MEM
// The caller receives the FIST node and the slot to load the result from.
15044 SDValue Ops[] = { Chain, Value, StackSlot };
15045 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15047 return std::make_pair(FIST, StackSlot);
/// Shared helper for extending a 128-bit integer vector to a 256-bit one. It
/// is called from both LowerANY_EXTEND and LowerZERO_EXTEND, and checks the
/// opcode of Op to decide whether the new high bits must be zero or may be
/// undef.
15051 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15052 const X86Subtarget &Subtarget) {
15053 MVT VT = Op->getSimpleValueType(0);
15054 SDValue In = Op->getOperand(0);
15055 MVT InVT = In.getSimpleValueType();
// 512-bit results and i1-element sources are re-emitted as a plain
// ZERO_EXTEND node instead of being handled here.
15058 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15059 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15061 // Optimize vectors in AVX mode:
15064 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15065 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15066 // Concat upper and lower parts.
15069 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15070 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15071 // Concat upper and lower parts.
// The condition is true when (VT, InVT) is none of (v16i16, v16i8),
// (v8i32, v8i16), (v4i64, v4i32).
// NOTE(review): the statement it guards is not visible in this excerpt.
15074 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15075 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15076 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
// With Int256 available a single X86ISD::VZEXT node is emitted.
15079 if (Subtarget.hasInt256())
15080 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
// Otherwise interleave the input with zeros (ZERO_EXTEND) or undef (any other
// opcode) using unpack-low / unpack-high, reinterpret each result as half of
// the destination type, and concatenate the two halves.
15082 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15083 SDValue Undef = DAG.getUNDEF(InVT);
15084 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15085 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15086 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
// HVT: destination element type with half the destination element count.
15088 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15089 VT.getVectorNumElements()/2);
15091 OpLo = DAG.getBitcast(HVT, OpLo);
15092 OpHi = DAG.getBitcast(HVT, OpHi);
15094 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
/// ZERO_EXTEND lowering used for 512-bit results and for i1-element (mask)
/// sources. Non-mask sources going to 512 bits become X86ISD::VZEXT; mask
/// sources become a VSELECT between splats of 1 and 0.
15097 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15098 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15099 MVT VT = Op->getSimpleValueType(0);
15100 SDValue In = Op->getOperand(0);
15101 MVT InVT = In.getSimpleValueType();
// NOTE(review): the statement guarded by this element-count / BWI check is
// not visible in this excerpt.
15103 unsigned NumElts = VT.getVectorNumElements();
15104 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
15107 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1)
15108 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
// Everything past this point handles i1-element sources only.
15110 assert(InVT.getVectorElementType() == MVT::i1);
15112 // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
// Without VLX the select is done in a 512-bit type with the same element
// count (element width 512/NumElts).
15114 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15115 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
// Splat constants 1 and 0 of the (possibly widened) type.
// NOTE(review): the declarations these two initializers belong to are on
// lines not visible in this excerpt.
15118 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15120 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
// Use the mask itself as the select condition.
15122 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
// NOTE(review): the condition choosing between the two returns below is not
// visible in this excerpt; the second one truncates the select result to VT.
15124 return SelectedVal;
15125 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
/// Custom lowering for ANY_EXTEND: when the subtarget reports hasFp256(),
/// try the shared LowerAVXExtend helper.
/// NOTE(review): the statements that consume Res and the fall-through return
/// are not visible in this excerpt.
15128 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15129 SelectionDAG &DAG) {
15130 if (Subtarget.hasFp256())
15131 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
/// Custom lowering for ZERO_EXTEND: 512-bit results and i1-element sources go
/// to LowerZERO_EXTEND_AVX512; otherwise, when the subtarget reports
/// hasFp256(), the shared LowerAVXExtend helper is tried.
15137 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15138 SelectionDAG &DAG) {
15140 MVT VT = Op.getSimpleValueType();
15141 SDValue In = Op.getOperand(0);
15142 MVT SVT = In.getSimpleValueType();
15144 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15145 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
// NOTE(review): the statement that consumes Res is not visible in this
// excerpt.
15147 if (Subtarget.hasFp256())
15148 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
// Sanity check: a 128-bit to 256-bit extend with matching element counts is
// not expected to reach this point.
15151 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15152 VT.getVectorNumElements() != SVT.getVectorNumElements());
15156 /// Helper to recursively truncate vector elements in half with PACKSS.
15157 /// It makes use of the fact that vector comparison results will be all-zeros
15158 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15159 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15160 /// within each 128-bit lane.
///
/// DstVT is the requested result type and In the vector to truncate; the
/// function calls itself until the source type equals DstVT.
/// NOTE(review): part of the parameter list is on lines not visible in this
/// excerpt.
15161 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15164 const X86Subtarget &Subtarget) {
15165 // Requires SSE2 but AVX512 has fast truncate.
// NOTE(review): the statements guarded by this and the following three checks
// are not visible in this excerpt.
15166 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15169 EVT SrcVT = In.getValueType();
15171 // No truncation required, we might get here due to recursive calls.
15172 if (SrcVT == DstVT)
15175 // We only support vector truncation to 128bits or greater from a
15176 // 256bits or greater source.
15177 if ((DstVT.getSizeInBits() % 128) != 0)
15179 if ((SrcVT.getSizeInBits() % 256) != 0)
15182 unsigned NumElems = SrcVT.getVectorNumElements();
15183 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15184 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
// Integer type with half the source element width, i.e. the element type
// after one packing step (used as PackedSVT below).
15187 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15189 // Extract lower/upper subvectors.
15190 unsigned NumSubElts = NumElems / 2;
15191 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15192 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15193 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15195 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
// Both halves are viewed as v8i16 regardless of the real element type; see
// the header comment for why this is acceptable for comparison results.
15196 if (SrcVT.is256BitVector()) {
15197 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15198 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15199 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15200 return DAG.getBitcast(DstVT, Res);
15203 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15204 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15205 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15206 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15207 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15208 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15210 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15211 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15212 Res = DAG.getBitcast(MVT::v4i64, Res);
15213 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15215 if (DstVT.is256BitVector())
15216 return DAG.getBitcast(DstVT, Res);
15218 // If 512bit -> 128bit truncate another stage.
15219 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15220 Res = DAG.getBitcast(PackedVT, Res);
15221 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15224 // Recursively pack lower/upper subvectors, concat result and pack again.
15225 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
// Each half is truncated to half-width elements, the halves are concatenated
// into a vector with the original element count, and the recursion continues
// towards DstVT.
15226 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15227 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15228 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15230 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15231 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15232 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
/// Lowers a vector TRUNCATE whose result element type is i1 (asserted below).
/// The least-significant bit of every source element is shifted into the
/// most-significant position and a mask-producing node is built from it:
/// X86ISD::CVT2MASK in the BWI path, X86ISD::TESTM at the end of the function.
15235 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15236 const X86Subtarget &Subtarget) {
15239 MVT VT = Op.getSimpleValueType();
15240 SDValue In = Op.getOperand(0);
15241 MVT InVT = In.getSimpleValueType();
15243 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15245 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
15246 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
// Elements of at most 16 bits need special handling.
15247 if (InVT.getScalarSizeInBits() <= 16) {
15248 if (Subtarget.hasBWI()) {
15249 // legal, will go to VPMOVB2M, VPMOVW2M
15250 // Shift packed bytes not supported natively, bitcast to word
// The shift is performed on an i16-element view of the input and the result
// is bitcast back to the original type before the mask conversion.
15251 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15252 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15253 DAG.getBitcast(ExtVT, In),
15254 DAG.getConstant(ShiftInx, DL, ExtVT));
15255 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15256 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15258 // Use TESTD/Q, extended vector to packed dword/qword.
15259 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15260 "Unexpected vector type.");
// Sign-extend to a 512-bit vector with the same element count.
15261 unsigned NumElts = InVT.getVectorNumElements();
15262 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15263 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
// NOTE(review): the shift amount is recomputed from InVT here; a line between
// the extend and this statement is not visible in this excerpt (presumably it
// updates InVT to the extended type -- confirm).
15265 ShiftInx = InVT.getScalarSizeInBits() - 1;
// Shift the LSB into the MSB and test the shifted value against itself.
15268 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15269 DAG.getConstant(ShiftInx, DL, InVT));
15270 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
/// Custom lowering for ISD::TRUNCATE. Handles, in order: scalar truncation to
/// i1, vector truncation to i1 elements, the AVX-512 VTRUNC path, PACKSS-based
/// truncation of comparison results, the v4i64->v4i32 and v8i32->v8i16 special
/// cases, and finally a generic 256-bit to 128-bit shuffle-based truncation.
15273 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15275 MVT VT = Op.getSimpleValueType();
15276 SDValue In = Op.getOperand(0);
15277 MVT InVT = In.getSimpleValueType();
// Scalar i1 result: the TRUNCATE built below is fed an i32 ANY_EXTEND of the
// input.
// NOTE(review): the statement guarded by the ">= 32" check is not visible in
// this excerpt.
15279 if (VT == MVT::i1) {
15280 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15281 "Invalid scalar TRUNCATE operation");
15282 if (InVT.getSizeInBits() >= 32)
15284 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15285 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15287 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15288 "Invalid TRUNCATE operation");
15290 if (VT.getVectorElementType() == MVT::i1)
15291 return LowerTruncateVecI1(Op, DAG, Subtarget);
15293 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15294 if (Subtarget.hasAVX512()) {
15295 // word to byte only under BWI
// Without BWI, a v16i16 source is first sign-extended to v16i32 and the
// truncation is done from there.
15296 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15297 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15298 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
15299 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15302 // Truncate with PACKSS if we are truncating a vector comparison result.
15303 // TODO: We should be able to support other operations as long as we
15304 // are saturating+packing zero/all bits only.
15305 auto IsPackableComparison = [](SDValue V) {
15306 unsigned Opcode = V.getOpcode();
15307 return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
15308 Opcode == X86ISD::CMPP);
// Accept either a single comparison node or a CONCAT_VECTORS whose operands
// are all comparison nodes.
// NOTE(review): the statement that consumes V is not visible in this excerpt.
15311 if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
15312 all_of(In->ops(), IsPackableComparison))) {
15313 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15317 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15318 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
// View the input as v8i32, gather the even (low-half) dwords into the low
// four lanes, and extract the low 128-bit subvector.
15319 if (Subtarget.hasInt256()) {
15320 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15321 In = DAG.getBitcast(MVT::v8i32, In);
15322 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
15324 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15325 DAG.getIntPtrConstant(0, DL));
// Without Int256: split into two v2i64 halves, view each as v4i32, and pick
// the even dwords of both with a two-input shuffle.
15328 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15329 DAG.getIntPtrConstant(0, DL));
15330 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15331 DAG.getIntPtrConstant(2, DL));
15332 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15333 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15334 static const int ShufMask[] = {0, 2, 4, 6};
15335 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15338 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15339 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
15340 if (Subtarget.hasInt256()) {
15341 In = DAG.getBitcast(MVT::v32i8, In);
// Build a 32-entry byte mask as two identical 16-entry groups: byte indices
// 0,1,4,5,8,9,12,13 (the low two bytes of each dword) followed by eight 0x80
// entries (presumably "write zero" per PSHUFB semantics -- confirm).
15343 SmallVector<SDValue,32> pshufbMask;
15344 for (unsigned i = 0; i < 2; ++i) {
15345 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
15346 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
15347 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
15348 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
15349 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
15350 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
15351 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
15352 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
15353 for (unsigned j = 0; j < 8; ++j)
15354 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
15356 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
15357 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
15358 In = DAG.getBitcast(MVT::v4i64, In);
// Bring qwords 0 and 2 together in the low 128 bits, then extract them.
15360 static const int ShufMask[] = {0, 2, -1, -1};
15361 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
15363 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15364 DAG.getIntPtrConstant(0, DL));
15365 return DAG.getBitcast(VT, In);
// Without Int256: process the two v4i32 halves separately as v16i8 byte
// shuffles and join their low 64 bits.
15368 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15369 DAG.getIntPtrConstant(0, DL));
15371 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15372 DAG.getIntPtrConstant(4, DL));
15374 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15375 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15377 // The PSHUFB mask:
15378 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15379 -1, -1, -1, -1, -1, -1, -1, -1};
15381 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
15382 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
15383 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
15385 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15386 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15388 // The MOVLHPS Mask:
15389 static const int ShufMask2[] = {0, 1, 4, 5};
15390 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15391 return DAG.getBitcast(MVT::v8i16, res);
15394 // Handle truncation of V256 to V128 using shuffles.
// NOTE(review): the statement guarded by this check is not visible in this
// excerpt.
15395 if (!VT.is128BitVector() || !InVT.is256BitVector())
15398 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
// View the input as a vector of destination-sized elements (twice as many),
// keep every even-indexed element in the low half, and extract that half.
15400 unsigned NumElems = VT.getVectorNumElements();
15401 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15403 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15404 // Prepare truncation shuffle mask
15405 for (unsigned i = 0; i != NumElems; ++i)
15406 MaskVec[i] = i * 2;
15407 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
15408 DAG.getUNDEF(NVT), MaskVec);
15409 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15410 DAG.getIntPtrConstant(0, DL));
/// Custom lowering for FP_TO_SINT / FP_TO_UINT. Signedness is taken from the
/// opcode of Op. Vector results are handled inline; scalar results are
/// delegated to FP_TO_INTHelper, followed by a load from the stack slot when
/// the helper returns one.
15413 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
15414 const X86Subtarget &Subtarget,
15415 SelectionDAG &DAG) const {
15416 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15418 MVT VT = Op.getSimpleValueType();
15420 if (VT.isVector()) {
15421 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15422 SDValue Src = Op.getOperand(0);
// v2f32 -> v2i64: widen the source to v4f32 by concatenating an undef v2f32
// and use the truncating packed conversion node matching the signedness.
// NOTE(review): one argument line of the getNode call, and whatever follows
// this `if` inside the vector branch, are not visible in this excerpt.
15424 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15425 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
15427 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15428 DAG.getUNDEF(MVT::v2f32)));
15434 assert(!VT.isVector());
// Vals.first is the lowered value or FIST node; Vals.second is the stack
// slot to load from, if any.
15436 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15437 IsSigned, /*IsReplace=*/ false);
15438 SDValue FIST = Vals.first, StackSlot = Vals.second;
15439 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15440 if (!FIST.getNode())
15443 if (StackSlot.getNode())
15444 // Load the result.
15445 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15447 // The node is the result.
/// Custom lowering for FP_EXTEND from a v2f32 source (asserted below): the
/// source is widened to v4f32 by concatenating an undef v2f32, and an
/// X86ISD::VFPEXT node producing the requested result type is built on it.
15451 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15453 MVT VT = Op.getSimpleValueType();
15454 SDValue In = Op.getOperand(0);
15455 MVT SVT = In.getSimpleValueType();
15457 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15459 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15460 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15461 In, DAG.getUNDEF(SVT)));
15464 /// The only differences between FABS and FNEG are the mask and the logic op.
15465 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
///
/// The operation is implemented as a bitwise FP logic node against a constant
/// mask: FAND with 0x7f... for FABS, FXOR with 0x80... for FNEG, and FOR with
/// 0x80... when an FNEG is applied directly to an FABS.
15466 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15467 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15468 "Wrong opcode for lowering FABS or FNEG.");
15470 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15472 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15473 // into an FNABS. We'll lower the FABS after that if it is still in use.
// NOTE(review): the enclosing condition and the statement executed when an
// FNEG user is found are not visible in this excerpt.
15475 for (SDNode *User : Op->uses())
15476 if (User->getOpcode() == ISD::FNEG)
15480 MVT VT = Op.getSimpleValueType();
15482 bool IsF128 = (VT == MVT::f128);
15484 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15485 // decide if we should generate a 16-byte constant mask when we only need 4 or
15486 // 8 bytes for the scalar case.
// Choose the type the logic op is performed in (LogicVT) and the element type
// the mask is built for (EltVT).
// NOTE(review): some of the assignments in the three arms below are on lines
// not visible in this excerpt.
15491 if (VT.isVector()) {
15493 EltVT = VT.getVectorElementType();
15494 } else if (IsF128) {
15495 // SSE instructions are used for optimized f128 logical operations.
15496 LogicVT = MVT::f128;
15499 // There are no scalar bitwise logical SSE/AVX instructions, so we
15500 // generate a 16-byte vector constant and logic op even for the scalar case.
15501 // Using a 16-byte mask allows folding the load of the mask with
15502 // the logic op, so it can save (~4 bytes) on code size.
15503 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15507 unsigned EltBits = EltVT.getSizeInBits();
15508 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
15510 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
// Pick the float semantics matching the element width so the integer mask
// can be wrapped in an FP constant of LogicVT.
15511 const fltSemantics &Sem =
15512 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15513 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15514 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
// FNEG(FABS(x)) is folded: OR the sign bit directly into x instead of
// clearing it and then flipping it.
15516 SDValue Op0 = Op.getOperand(0);
15517 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15519 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15520 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
// Vector and f128 operands are already in LogicVT; apply the op directly.
15522 if (VT.isVector() || IsF128)
15523 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15525 // For the scalar case extend to a 128-bit vector, perform the logic op,
15526 // and extract the scalar result back out.
15527 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
15528 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15529 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
15530 DAG.getIntPtrConstant(0, dl));
/// Lower FCOPYSIGN(Mag, Sign) with FP logic nodes:
///   (X86ISD::FOR (magnitude bits of Mag), (X86ISD::FAND Sign, SignMask)).
/// The sign operand is first converted to the result type if its width
/// differs. f32/f64 scalars are processed as v4f32/v2f64 ("fake vectors") and
/// element 0 is extracted at the end; vectors and f128 are processed in their
/// own type.
15533 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15534 SDValue Mag = Op.getOperand(0);
15535 SDValue Sign = Op.getOperand(1);
15538 // If the sign operand is smaller, extend it first.
15539 MVT VT = Op.getSimpleValueType();
15540 if (Sign.getSimpleValueType().bitsLT(VT))
15541 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
15543 // And if it is bigger, shrink it first.
15544 if (Sign.getSimpleValueType().bitsGT(VT))
15545 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
15547 // At this point the operands and the result should have the same
15548 // type, and that won't be f80 since that is not custom lowered.
15549 bool IsF128 = (VT == MVT::f128);
15550 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
15551 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
15552 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
15553 "Unexpected type in LowerFCOPYSIGN");
// Floating-point semantics used to materialise the bit masks as FP constants.
15555 MVT EltVT = VT.getScalarType();
15556 const fltSemantics &Sem =
15557 EltVT == MVT::f64 ? APFloat::IEEEdouble()
15558 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15560 // Perform all scalar logic operations as 16-byte vectors because there are no
15561 // scalar FP logic instructions in SSE.
15562 // TODO: This isn't necessary. If we used scalar types, we might avoid some
15563 // unnecessary splats, but we might miss load folding opportunities. Should
15564 // this decision be based on OptimizeForSize?
15565 bool IsFakeVector = !VT.isVector() && !IsF128;
15568 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15570 // The mask constants are automatically splatted for vector types.
// SignMask has only the sign bit of each element set; MagMask is its
// complement.
15571 unsigned EltSizeInBits = VT.getScalarSizeInBits();
15572 SDValue SignMask = DAG.getConstantFP(
15573 APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15574 SDValue MagMask = DAG.getConstantFP(
15575 APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15577 // First, clear all bits but the sign bit from the second operand (sign).
15579 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
15580 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
15582 // Next, clear the sign bit from the first operand (magnitude).
15583 // TODO: If we had general constant folding for FP logic ops, this check
15584 // wouldn't be necessary.
// A constant magnitude is turned into a constant directly instead of being
// masked with an FAND node.
// NOTE(review): the statement between copying the APFloat and building the
// constant is not part of this listing; presumably it clears the sign of APF
// - confirm against the full file.
15586 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
15587 APFloat APF = Op0CN->getValueAPF();
15589 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
15591 // If the magnitude operand wasn't a constant, we need to AND out the sign.
15593 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
15594 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
15597 // OR the magnitude value with the sign bit.
15598 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
15599 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
15600 DAG.getIntPtrConstant(0, dl));
/// Lower ISD::FGETSIGN of an f32/f64 operand. The scalar is placed into a
/// v4f32/v2f64 vector, X86ISD::MOVMSK produces an i32 from it, that value is
/// zero-extended or truncated to the result type and finally masked with 1.
15603 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15604 SDValue N0 = Op.getOperand(0);
15606 MVT VT = Op.getSimpleValueType();
15608 MVT OpVT = N0.getSimpleValueType();
15609 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
15610 "Unexpected type for FGETSIGN");
15612 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
15613 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
15614 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
15615 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
// MOVMSK always yields i32 here; resize it to the requested result type
// before masking.
15616 Res = DAG.getZExtOrTrunc(Res, dl, VT);
15617 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
// NOTE(review): the statement returning Res is not part of this listing.
15621 // Check whether an OR'd tree is PTEST-able.
15622 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
15623 SelectionDAG &DAG) {
15624 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15626 if (!Subtarget.hasSSE41())
15629 if (!Op->hasOneUse())
15632 SDNode *N = Op.getNode();
15635 SmallVector<SDValue, 8> Opnds;
15636 DenseMap<SDValue, unsigned> VecInMap;
15637 SmallVector<SDValue, 8> VecIns;
15638 EVT VT = MVT::Other;
15640 // Recognize a special case where a vector is casted into wide integer to
15642 Opnds.push_back(N->getOperand(0));
15643 Opnds.push_back(N->getOperand(1));
15645 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15646 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15647 // BFS traverse all OR'd operands.
15648 if (I->getOpcode() == ISD::OR) {
15649 Opnds.push_back(I->getOperand(0));
15650 Opnds.push_back(I->getOperand(1));
15651 // Re-evaluate the number of nodes to be traversed.
15652 e += 2; // 2 more nodes (LHS and RHS) are pushed.
15656 // Quit if a non-EXTRACT_VECTOR_ELT
15657 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15660 // Quit if without a constant index.
15661 SDValue Idx = I->getOperand(1);
15662 if (!isa<ConstantSDNode>(Idx))
15665 SDValue ExtractedFromVec = I->getOperand(0);
15666 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15667 if (M == VecInMap.end()) {
15668 VT = ExtractedFromVec.getValueType();
15669 // Quit if not 128/256-bit vector.
15670 if (!VT.is128BitVector() && !VT.is256BitVector())
15672 // Quit if not the same type.
15673 if (VecInMap.begin() != VecInMap.end() &&
15674 VT != VecInMap.begin()->first.getValueType())
15676 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15677 VecIns.push_back(ExtractedFromVec);
15679 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15682 assert((VT.is128BitVector() || VT.is256BitVector()) &&
15683 "Not extracted from 128-/256-bit vector.");
15685 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15687 for (DenseMap<SDValue, unsigned>::const_iterator
15688 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15689 // Quit if not all elements are used.
15690 if (I->second != FullMask)
15694 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15696 // Cast all vectors into TestVT for PTEST.
15697 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15698 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
15700 // If more than one full vectors are evaluated, OR them first before PTEST.
15701 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15702 // Each iteration will OR 2 nodes and append the result until there is only
15703 // 1 node left, i.e. the final OR'd value of all vectors.
15704 SDValue LHS = VecIns[Slot];
15705 SDValue RHS = VecIns[Slot + 1];
15706 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15709 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15710 VecIns.back(), VecIns.back());
15713 /// \brief return true if \c Op has a use that doesn't just read flags.
///
/// A use is treated as flag-only when the user is a BRCOND, a SETCC, or a
/// SELECT that takes the value as operand 0. A TRUNCATE user that itself has
/// exactly one use is looked through to that use.
15714 static bool hasNonFlagsUse(SDValue Op) {
15715 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15717 SDNode *User = *UI;
15718 unsigned UOpNo = UI.getOperandNo();
15719 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15720 // Look past the truncate: examine its single user, and the operand slot the
// truncate occupies in that user, instead.
15721 UOpNo = User->use_begin().getOperandNo();
15722 User = *User->use_begin();
// Any user other than BRCOND, SETCC, or SELECT-with-this-value-as-operand-0
// satisfies the condition below.
15725 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15726 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15732 // Emit KTEST instruction for bit vectors on AVX-512
//
// Only a BITCAST whose source is a vector of i1 is considered. For such a
// value an X86ISD::KTEST node with the i1 vector as both operands is
// returned.
// NOTE(review): the remainder of the condition on the i1 vector and the
// fall-through return are not part of this listing; presumably the condition
// calls hasKTEST(Op0VT) - confirm against the full file.
15733 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
15734 const X86Subtarget &Subtarget) {
15735 if (Op.getOpcode() == ISD::BITCAST) {
// hasKTEST: true for 8-/16-bit types when the subtarget has DQI and for
// 32-/64-bit types when it has BWI.
15736 auto hasKTEST = [&](MVT VT) {
15737 unsigned SizeInBits = VT.getSizeInBits();
15738 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
15739 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
15741 SDValue Op0 = Op.getOperand(0);
15742 MVT Op0VT = Op0.getValueType().getSimpleVT();
15743 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
15745 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
15750 /// Emit nodes that will be selected as "test Op0,Op0", or something equivalent.
///
/// \param Op     The value to be compared against zero.
/// \param X86CC  The X86 condition code; the code below uses it to decide
///               whether the carry/overflow flags are needed and to gate the
///               shift-to-AND rewrite and the vector all-zero test on
///               COND_E/COND_NE.
/// \param dl     Debug location for the created nodes.
/// \param DAG    DAG in which nodes are created.
/// \returns a flags-producing value: either an explicit compare with zero
///          (X86ISD::CMP, or the result of EmitKTEST), or result number 1 of
///          an X86ISD arithmetic node that replaces \p Op.
// NOTE(review): several short lines (case labels, break/return statements,
// NumOperands assignments and closing braces) are not part of this listing, so
// the switch structure below must be read together with the full file.
15752 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
15753 SelectionDAG &DAG) const {
// i1 values are zero-extended to i8 and compared with 0.
15754 if (Op.getValueType() == MVT::i1) {
15755 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15756 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15757 DAG.getConstant(0, dl, MVT::i8));
15759 // CF and OF aren't always set the way we want. Determine which
15760 // of these we need.
15761 bool NeedCF = false;
15762 bool NeedOF = false;
15765 case X86::COND_A: case X86::COND_AE:
15766 case X86::COND_B: case X86::COND_BE:
15769 case X86::COND_G: case X86::COND_GE:
15770 case X86::COND_L: case X86::COND_LE:
15771 case X86::COND_O: case X86::COND_NO: {
15772 // Check if we really need to set the
15773 // Overflow flag. If NoSignedWrap is present
15774 // that is not actually needed.
15775 switch (Op->getOpcode()) {
15780 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
15781 if (BinNode->Flags.hasNoSignedWrap())
15791 // See if we can use the EFLAGS value from the operand instead of
15792 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15793 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15794 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15795 // Emit KTEST for bit vectors
15796 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15798 // Emit a CMP with 0, which is the TEST pattern.
15799 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15800 DAG.getConstant(0, dl, Op.getValueType()));
// Opcode stays 0 unless one of the cases below selects a flag-setting X86ISD
// opcode; NumOperands is the number of operands of Op copied into the
// replacement node built at the end of the function.
15802 unsigned Opcode = 0;
15803 unsigned NumOperands = 0;
15805 // Truncate operations may prevent the merge of the SETCC instruction
15806 // and the arithmetic instruction before it. Attempt to truncate the operands
15807 // of the arithmetic instruction and use a reduced bit-width instruction.
15808 bool NeedTruncation = false;
15809 SDValue ArithOp = Op;
15810 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
15811 SDValue Arith = Op->getOperand(0);
15812 // Both the trunc and the arithmetic op need to have one user each.
15813 if (Arith->hasOneUse())
15814 switch (Arith.getOpcode()) {
15821 NeedTruncation = true;
15827 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
15828 // which may be the result of a CAST. We use the variable 'Op', which is the
15829 // non-casted variable when we check for possible users.
15830 switch (ArithOp.getOpcode()) {
15832 // Due to an isel shortcoming, be conservative if this add is likely to be
15833 // selected as part of a load-modify-store instruction. When the root node
15834 // in a match is a store, isel doesn't know how to remap non-chain non-flag
15835 // uses of other nodes in the match, such as the ADD in this case. This
15836 // leads to the ADD being left around and reselected, with the result being
15837 // two adds in the output. Alas, even if none of our users are stores, that
15838 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
15839 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
15840 // climbing the DAG back to the root, and it doesn't seem to be worth the
15842 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15843 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15844 if (UI->getOpcode() != ISD::CopyToReg &&
15845 UI->getOpcode() != ISD::SETCC &&
15846 UI->getOpcode() != ISD::STORE)
15849 if (ConstantSDNode *C =
15850 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
15851 // An add of one will be selected as an INC.
15852 if (C->isOne() && !Subtarget.slowIncDec()) {
15853 Opcode = X86ISD::INC;
15858 // An add of negative one (subtract of one) will be selected as a DEC.
15859 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
15860 Opcode = X86ISD::DEC;
15866 // Otherwise use a regular EFLAGS-setting add.
15867 Opcode = X86ISD::ADD;
15872 // If we have a constant logical shift that's only used in a comparison
15873 // against zero turn it into an equivalent AND. This allows turning it into
15874 // a TEST instruction later.
15875 if ((X86CC == X86::COND_E || X86CC == X86::COND_NE) && Op->hasOneUse() &&
15876 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
15877 EVT VT = Op.getValueType();
15878 unsigned BitWidth = VT.getSizeInBits();
15879 unsigned ShAmt = Op->getConstantOperandVal(1);
15880 if (ShAmt >= BitWidth) // Avoid undefined shifts.
// The mask keeps exactly the bits of the shift input that survive the shift:
// the high (BitWidth - ShAmt) bits for SRL, otherwise the low
// (BitWidth - ShAmt) bits.
15882 APInt Mask = ArithOp.getOpcode() == ISD::SRL
15883 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
15884 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
15885 if (!Mask.isSignedIntN(32)) // Avoid large immediates.
15887 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
15888 DAG.getConstant(Mask, dl, VT));
15893 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
15894 // because a TEST instruction will be better.
15895 if (!hasNonFlagsUse(Op)) {
15896 SDValue Op0 = ArithOp->getOperand(0);
15897 SDValue Op1 = ArithOp->getOperand(1);
15898 EVT VT = ArithOp.getValueType();
15899 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
15900 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
15902 // But if we can combine this into an ANDN operation, then create an AND
15903 // now and allow it to be pattern matched into an ANDN.
15904 if (!Subtarget.hasBMI() || !isAndn || !isLegalAndnType)
15911 // Due to the ISEL shortcoming noted above, be conservative if this op is
15912 // likely to be selected as part of a load-modify-store instruction.
15913 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
15914 UE = Op.getNode()->use_end(); UI != UE; ++UI)
15915 if (UI->getOpcode() == ISD::STORE)
15918 // Otherwise use a regular EFLAGS-setting instruction.
15919 switch (ArithOp.getOpcode()) {
15920 default: llvm_unreachable("unexpected operator!");
15921 case ISD::SUB: Opcode = X86ISD::SUB; break;
15922 case ISD::XOR: Opcode = X86ISD::XOR; break;
15923 case ISD::AND: Opcode = X86ISD::AND; break;
// For the remaining opcode, an equality test of an untruncated value first
// tries the vector all-zero (PTEST) lowering before settling on X86ISD::OR.
15925 if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
15926 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
15929 Opcode = X86ISD::OR;
// Result number 1 of the existing node is returned here.
15943 return SDValue(Op.getNode(), 1);
15949 // If we found that truncation is beneficial, perform the truncation on the
// operands and rebuild the arithmetic op at the narrow type.
15951 if (NeedTruncation) {
15952 EVT VT = Op.getValueType();
15953 SDValue WideVal = Op->getOperand(0);
15954 EVT WideVT = WideVal.getValueType();
15955 unsigned ConvertedOp = 0;
15956 // Use a target machine opcode to prevent further DAGCombine
15957 // optimizations that may separate the arithmetic operations
15958 // from the setcc node.
15959 switch (WideVal.getOpcode()) {
15961 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
15962 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
15963 case ISD::AND: ConvertedOp = X86ISD::AND; break;
15964 case ISD::OR: ConvertedOp = X86ISD::OR; break;
15965 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
// The rewrite is only done when the wide operation is legal for the target.
15969 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15970 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
15971 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
15972 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
15973 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
15979 // Emit KTEST for bit vectors
15980 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15983 // Emit a CMP with 0, which is the TEST pattern.
15984 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15985 DAG.getConstant(0, dl, Op.getValueType()));
// Build the flag-setting replacement: it produces the original value type
// plus an i32 flags result, takes the first NumOperands operands of Op, and
// all uses of Op are redirected to it. The flags result (number 1) is
// returned.
15987 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
15988 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
15990 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
15991 DAG.ReplaceAllUsesWith(Op, New);
15992 return SDValue(New.getNode(), 1);
15995 /// Emit nodes that will be selected as "cmp Op0,Op1", or something equivalent.
///
/// A comparison against a null constant is forwarded to EmitTest. For
/// i8/i16/i32/i64 operands an X86ISD::SUB producing (value, i32 flags) is
/// created and its flags result is returned; every other type gets a plain
/// X86ISD::CMP.
15997 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
15998 const SDLoc &dl, SelectionDAG &DAG) const {
15999 if (isNullConstant(Op1))
16000 return EmitTest(Op0, X86CC, dl, DAG);
16002 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16003 "Unexpected comparison operation for MVT::i1 operands");
16005 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16006 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16007 // Only promote the compare up to I32 if it is a 16 bit operation
16008 // with an immediate. 16 bit immediates are to be avoided.
// The promotion is skipped when the function is optimized for minimum size
// or the subtarget is Atom.
16009 if ((Op0.getValueType() == MVT::i16 &&
16010 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16011 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16012 !Subtarget.isAtom()) {
// Unsigned condition codes zero-extend the operands, all others sign-extend.
16013 unsigned ExtendOp =
16014 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16015 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16016 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16018 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16019 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16020 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
// Result 1 of the SUB node is the i32 flags value.
16022 return SDValue(Sub.getNode(), 1);
16024 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16027 /// Convert a comparison if required by the subtarget.
///
/// Conversion only applies to an X86ISD::CMP whose two operands are
/// floating-point values on a subtarget without CMOV; in that case the node
/// sequence documented below is built and the resulting X86ISD::SAHF node is
/// returned.
// NOTE(review): the early-exit statement for the non-converting case and the
// declaration of `dl` are not part of this listing; confirm against the full
// file.
16028 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16029 SelectionDAG &DAG) const {
16030 // If the subtarget does not support the FUCOMI instruction, floating-point
16031 // comparisons have to be converted.
16032 if (Subtarget.hasCMov() ||
16033 Cmp.getOpcode() != X86ISD::CMP ||
16034 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16035 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16038 // The instruction selector will select an FUCOM instruction instead of
16039 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16040 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16041 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16043 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16044 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
// Shift the upper byte of the 16-bit status word down and narrow it to i8.
16045 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16046 DAG.getConstant(8, dl, MVT::i8));
16047 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16049 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16050 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16051 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16054 /// Check if replacement of SQRT with RSQRT should be disabled.
///
/// The answer depends on whether an X86ISD::FRSQRT node of the same type
/// already exists for \p Op, and otherwise on the subtarget's fast vector /
/// fast scalar FSQRT properties.
// NOTE(review): the statement executed when an FRSQRT node already exists and
// the condition selecting the vector query are not part of this listing;
// confirm against the full file.
16055 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16056 EVT VT = Op.getValueType();
16058 // We never want to use both SQRT and RSQRT instructions for the same input.
16059 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16063 return Subtarget.hasFastVectorFSQRT();
16064 return Subtarget.hasFastScalarFSQRT();
16067 /// The minimum architected relative accuracy is 2^-12. We need one
16068 /// Newton-Raphson step to have a good float result (24 bits of precision).
///
/// For f32 and v4f32 with SSE1, and for v8f32 with AVX, this returns an
/// X86ISD::FRSQRT node for \p Op. On that path \p RefinementSteps is set to 1
/// if the caller left it unspecified and \p UseOneConstNR is set to false.
// NOTE(review): the code for unsupported types is not part of this listing,
// and the Enabled and Reciprocal parameters are not referenced in the visible
// lines; confirm against the full file.
16069 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16070 SelectionDAG &DAG, int Enabled,
16071 int &RefinementSteps,
16072 bool &UseOneConstNR,
16073 bool Reciprocal) const {
16074 EVT VT = Op.getValueType();
16076 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16077 // TODO: Add support for AVX512 (v16f32).
16078 // It is likely not profitable to do this for f64 because a double-precision
16079 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16080 // instructions: convert to single, rsqrtss, convert back to double, refine
16081 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16082 // along with FMA, this could be a throughput win.
16083 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16084 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16085 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
// Default to a single refinement step when the caller did not choose one.
16086 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16087 RefinementSteps = 1;
16089 UseOneConstNR = false;
16090 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16095 /// The minimum architected relative accuracy is 2^-12. We need one
16096 /// Newton-Raphson step to have a good float result (24 bits of precision).
///
/// For f32 and v4f32 with SSE1, and for v8f32 with AVX, this returns an
/// X86ISD::FRCP node for \p Op, setting \p RefinementSteps to 1 if the caller
/// left it unspecified. Scalar f32 is special-cased when the estimate was not
/// explicitly enabled (see the comment in the body).
// NOTE(review): the remaining parameter of the signature, the statement taken
// for the scalar special case and the code for unsupported types are not part
// of this listing; confirm against the full file.
16097 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16099 int &RefinementSteps) const {
16100 EVT VT = Op.getValueType();
16102 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16103 // TODO: Add support for AVX512 (v16f32).
16104 // It is likely not profitable to do this for f64 because a double-precision
16105 // reciprocal estimate with refinement on x86 prior to FMA requires
16106 // 15 instructions: convert to single, rcpss, convert back to double, refine
16107 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16108 // along with FMA, this could be a throughput win.
16110 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16111 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16112 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16113 // Enable estimate codegen with 1 refinement step for vector division.
16114 // Scalar division estimates are disabled because they break too much
16115 // real-world code. These defaults are intended to match GCC behavior.
16116 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
// Default to a single refinement step when the caller did not choose one.
16119 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16120 RefinementSteps = 1;
16122 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16127 /// If we have at least two divisions that use the same divisor, convert to
16128 /// multiplication by a reciprocal. This may need to be adjusted for a given
16129 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16130 /// This is because we still need one division to calculate the reciprocal and
16131 /// then we need two multiplies by that reciprocal as replacements for the
16132 /// original divisions.
// NOTE(review): the body of this function (the returned threshold) is not
// part of this listing; confirm the value against the full file.
16133 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16137 /// Helper for creating a X86ISD::SETCC node.
///
/// \param Cond    X86 condition code; it is emitted as an i8 constant operand.
/// \param EFLAGS  The flags value the condition is evaluated on.
/// \param dl      Debug location for the created nodes.
/// \param DAG     DAG in which the node is created.
/// \returns an i8 X86ISD::SETCC node with operands (Cond, EFLAGS).
16138 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16139 SelectionDAG &DAG) {
16140 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16141 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16144 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16145 /// according to equal/not-equal condition code \p CC.
///
/// \returns an X86ISD::SETCC (built via getSETCC) on the BT result, using
///          X86::COND_AE when \p CC is ISD::SETEQ and X86::COND_B otherwise.
16146 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16147 const SDLoc &dl, SelectionDAG &DAG) {
16148 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16149 // instruction. Since the shift amount is in-range-or-undefined, we know
16150 // that doing a bittest on the i32 value is ok. We extend to i32 because
16151 // the encoding for the i16 version is larger than the i32 version.
16152 // Also promote i16 to i32 for performance / code size reason.
16153 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16154 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16156 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16157 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16158 // latter takes the modulo 64, this is only valid if bit 5 of BitNo (the bit
16159 // with value 32, which is the mask passed to MaskedValueIsZero) is known to be zero.
16160 if (Src.getValueType() == MVT::i64 &&
16161 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16162 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16164 // If the operand types disagree, extend the shift amount to match. Since
16165 // BT ignores high bits (like shifts) we can use anyextend.
16166 if (Src.getValueType() != BitNo.getValueType())
16167 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
// SETEQ (tested bit is zero) maps to COND_AE; any other CC maps to COND_B.
16169 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16170 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16171 return getSETCC(Cond, BT, dl , DAG);
16174 /// Result of 'and' is compared against zero. Change to a BT node if possible.
///
/// Patterns visible in the code below (truncates on the AND operands are
/// looked through first):
///   and (shl 1, N), X        -> bit N is taken as the bit number
///   and (srl X, N), 1        -> bit test of bit N in X
///   and X, C  where C is a power of two that does not fit in 32 bits
///                            -> bit test of bit log2(C)
/// The selected (LHS, RHS) pair is handed to getBitTestCondition.
// NOTE(review): the declarations of LHS/RHS, some of their assignments and
// the fall-through return are not part of this listing; confirm against the
// full file.
16175 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16176 const SDLoc &dl, SelectionDAG &DAG) {
16177 SDValue Op0 = And.getOperand(0);
16178 SDValue Op1 = And.getOperand(1);
16179 if (Op0.getOpcode() == ISD::TRUNCATE)
16180 Op0 = Op0.getOperand(0);
16181 if (Op1.getOpcode() == ISD::TRUNCATE)
16182 Op1 = Op1.getOperand(0);
// Canonicalize so that a SHL operand, if any, ends up in Op0.
16185 if (Op1.getOpcode() == ISD::SHL)
16186 std::swap(Op0, Op1);
16187 if (Op0.getOpcode() == ISD::SHL) {
16188 if (isOneConstant(Op0.getOperand(0))) {
16189 // If we looked past a truncate, check that it's only truncating away
// known-zero bits.
16191 unsigned BitWidth = Op0.getValueSizeInBits();
16192 unsigned AndBitWidth = And.getValueSizeInBits();
16193 if (BitWidth > AndBitWidth) {
16195 DAG.computeKnownBits(Op0, Zeros, Ones);
16196 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16200 RHS = Op0.getOperand(1);
16202 } else if (Op1.getOpcode() == ISD::Constant) {
16203 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16204 uint64_t AndRHSVal = AndRHS->getZExtValue();
16205 SDValue AndLHS = Op0;
16207 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16208 LHS = AndLHS.getOperand(0);
16209 RHS = AndLHS.getOperand(1);
16212 // Use BT if the immediate can't be encoded in a TEST instruction.
// AndRHSVal is a power of two here, so Log2_64_Ceil yields its exact bit
// index.
16213 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16215 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16220 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16225 // Convert (truncate (srl X, N) to i1) to (bt X, N)
//
// Op must be a TRUNCATE producing i1. When its operand is an SRL, the shifted
// value and the shift amount are passed to getBitTestCondition as the source
// and the bit number.
// NOTE(review): the statement taken when the operand is not an SRL and the
// tail of the getBitTestCondition call are not part of this listing.
16226 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16227 const SDLoc &dl, SelectionDAG &DAG) {
16229 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16230 "Expected TRUNCATE to i1 node");
16232 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16235 SDValue ShiftRight = Op.getOperand(0);
16236 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16240 /// Result of 'and' or 'trunc to i1' is compared against zero.
16241 /// Change to a BT node if possible.
///
/// Dispatches on the opcode of \p Op: ISD::AND goes to LowerAndToBT, and an
/// ISD::TRUNCATE producing i1 goes to LowerTruncateToBT.
// NOTE(review): the fall-through return for other opcodes is not part of this
// listing.
16242 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16243 const SDLoc &dl, SelectionDAG &DAG) const {
16244 if (Op.getOpcode() == ISD::AND)
16245 return LowerAndToBT(Op, CC, dl, DAG);
16246 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16247 return LowerTruncateToBT(Op, CC, dl, DAG);
16251 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// compares.
///
/// \p Op0 and \p Op1 are taken by reference and are exchanged (std::swap)
/// when the chosen encoding is only available with the operands reversed.
// NOTE(review): the rest of the signature, the declarations of SSECC/Swap,
// several case labels, the guard around the swap and the return statement are
// not part of this listing; confirm against the full file.
16253 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16258 // SSE Condition code mapping:
16267 switch (SetCCOpcode) {
16268 default: llvm_unreachable("Unexpected SETCC condition");
16270 case ISD::SETEQ: SSECC = 0; break;
// Each "Swap = true" case falls through to the case for the mirrored
// predicate and reuses its encoding.
16272 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16274 case ISD::SETOLT: SSECC = 1; break;
16276 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16278 case ISD::SETOLE: SSECC = 2; break;
16279 case ISD::SETUO: SSECC = 3; break;
16281 case ISD::SETNE: SSECC = 4; break;
16282 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16283 case ISD::SETUGE: SSECC = 5; break;
16284 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16285 case ISD::SETUGT: SSECC = 6; break;
16286 case ISD::SETO: SSECC = 7; break;
16288 case ISD::SETONE: SSECC = 8; break;
16291 std::swap(Op0, Op1);
16296 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16297 /// concatenate the result back.
///
/// Both operands are split into their lower and upper 128-bit halves, the same
/// opcode and condition code are applied to each pair of halves at the
/// half-width type, and the two results are joined with CONCAT_VECTORS.
// NOTE(review): the declaration of `dl` is not part of this listing.
16298 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16299 MVT VT = Op.getSimpleValueType();
16301 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16302 "Unsupported value type for operation");
16304 unsigned NumElems = VT.getVectorNumElements();
16306 SDValue CC = Op.getOperand(2);
16308 // Extract the LHS vectors
16309 SDValue LHS = Op.getOperand(0);
16310 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16311 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16313 // Extract the RHS vectors
16314 SDValue RHS = Op.getOperand(1);
16315 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16316 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16318 // Issue the operation on the smaller types and concatenate the result back
16319 MVT EltVT = VT.getVectorElementType();
16320 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16321 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16322 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16323 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
/// Lower a SETCC whose operands are vectors of i1 into plain bitwise logic
/// (XOR/AND/OR, with "not" expressed as XOR with all-ones) on the mask type,
/// following the identities noted in front of each return below.
// NOTE(review): the case labels selecting each identity and the declaration
// of `dl` are not part of this listing; confirm against the full file.
16326 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16327 SDValue Op0 = Op.getOperand(0);
16328 SDValue Op1 = Op.getOperand(1);
16329 SDValue CC = Op.getOperand(2);
16330 MVT VT = Op.getSimpleValueType();
16333 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16334 "Unexpected type for boolean compare operation");
16335 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
// Complements of both operands, used by the ordering predicates.
16336 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16337 DAG.getConstant(-1, dl, VT));
16338 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16339 DAG.getConstant(-1, dl, VT));
16340 switch (SetCCOpcode) {
16341 default: llvm_unreachable("Unexpected SETCC condition");
16343 // (x == y) -> ~(x ^ y)
16344 return DAG.getNode(ISD::XOR, dl, VT,
16345 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16346 DAG.getConstant(-1, dl, VT));
16348 // (x != y) -> (x ^ y)
16349 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16352 // (x > y) -> (x & ~y)
16353 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16356 // (x < y) -> (~x & y)
16357 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16360 // (x <= y) -> (~x | y)
16361 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16364 // (x >= y) -> (x | ~y)
16365 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
/// Lower an integer vector SETCC whose result is a vector of i1. SETEQ and
/// SETGT/SETLT select the dedicated X86ISD::PCMPEQM / X86ISD::PCMPGTM nodes
/// (SETLT with swapped operands); the remaining predicates use
/// X86ISD::CMPM, or X86ISD::CMPMU for unsigned predicates, with an i8
/// immediate condition code.
// NOTE(review): the declarations of `dl`, SSECC, Swap and Opc and the guards
// around the swap and the first return are not part of this listing; confirm
// against the full file.
16369 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16371 SDValue Op0 = Op.getOperand(0);
16372 SDValue Op1 = Op.getOperand(1);
16373 SDValue CC = Op.getOperand(2);
16374 MVT VT = Op.getSimpleValueType();
16377 assert(VT.getVectorElementType() == MVT::i1 &&
16378 "Cannot set masked compare for this operation");
16380 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16382 bool Unsigned = false;
16385 switch (SetCCOpcode) {
16386 default: llvm_unreachable("Unexpected SETCC condition");
16387 case ISD::SETNE: SSECC = 4; break;
16388 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16389 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16390 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16391 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16392 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16393 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16394 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16395 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16396 case ISD::SETLE: SSECC = 2; break;
16400 std::swap(Op0, Op1);
// Two-operand form for the dedicated compare opcodes chosen above.
16402 return DAG.getNode(Opc, dl, VT, Op0, Op1);
// Generic form: signed or unsigned masked compare with the condition code as
// an i8 immediate.
16403 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16404 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16405 DAG.getConstant(SSECC, dl, MVT::i8));
16408 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16409 /// operand \p Op1. If non-trivial (for example because it's not constant)
16410 /// return an empty value.
///
/// On success the result is a build vector of the same type as \p Op1 in
/// which every constant element has been decremented by one.
/// NOTE(review): the early-exit statements (after the dyn_cast, after the
/// per-element check, and the underflow check) are on lines not shown in
/// this listing.
16411 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16412 SelectionDAG &DAG) {
16413 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16417 MVT VT = Op1.getSimpleValueType();
16418 MVT EVT = VT.getVectorElementType();
16419 unsigned n = VT.getVectorNumElements();
// Collects the adjusted (value - 1) constants, one per vector element.
16420 SmallVector<SDValue, 8> ULTOp1;
16422 for (unsigned i = 0; i < n; ++i) {
// Each element must be a non-opaque constant of exactly the element type.
16423 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16424 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16427 // Avoid underflow.
16428 APInt Val = Elt->getAPIntValue();
16432 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16435 return DAG.getBuildVector(VT, dl, ULTOp1);
// Lower a vector SETCC node for X86. Visible strategies, in order:
//  * FP operands: X86ISD::CMPM (AVX-512 with an i1-element result) or
//    X86ISD::CMPP, with SETUEQ/SETONE built from two compares plus a logic op.
//  * 128-bit result with 256-bit operands: redo the setcc in the operand
//    type and zero-extend/truncate to VT.
//  * 256-bit integer vectors without Int256: Lower256IntVSETCC.
//  * i1 operands: LowerBoolVSETCC_AVX512; i1 result: LowerIntVSETCC_AVX512 or
//    a TRUNCATE of a SETCC performed in the operand type.
//  * XOP targets: X86ISD::VPCOM / X86ISD::VPCOMU.
//  * Otherwise SSE PCMPEQ/PCMPGT (with swap / invert / sign-flip fixups),
//    UMIN/UMAX, or SUBUS based sequences.
// NOTE(review): this listing omits many source lines (declarations such as
// `dl`, `Opc`, `Cmp`, `CC0`/`CC1`, several guards, `case` labels, returns and
// closing braces); comments below describe only what is visible.
16438 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16439 SelectionDAG &DAG) {
16440 SDValue Op0 = Op.getOperand(0);
16441 SDValue Op1 = Op.getOperand(1);
16442 SDValue CC = Op.getOperand(2);
16443 MVT VT = Op.getSimpleValueType();
16444 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16445 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
// ---- Floating-point compares (presumably guarded by isFP on an omitted
// line - confirm) ----
16450 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16451 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16455 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16456 assert(VT.getVectorNumElements() <= 16);
16457 Opc = X86ISD::CMPM;
16459 Opc = X86ISD::CMPP;
16460 // The SSE/AVX packed FP comparison nodes are defined with a
16461 // floating-point vector result that matches the operand type. This allows
16462 // them to work with an SSE1 target (integer vector types are not legal).
16463 VT = Op0.getSimpleValueType();
16466 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16467 // emit two comparisons and a logic op to tie them together.
16468 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
16471 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16473 // LLVM predicate is SETUEQ or SETONE.
// The combining op is the FP logic node for CMPP results and the integer
// logic node otherwise: OR for SETUEQ, AND for SETONE.
16475 unsigned CombineOpc;
16476 if (SetCCOpcode == ISD::SETUEQ) {
16479 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
16480 static_cast<unsigned>(ISD::OR);
16482 assert(SetCCOpcode == ISD::SETONE);
16485 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
16486 static_cast<unsigned>(ISD::AND);
16489 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16490 DAG.getConstant(CC0, dl, MVT::i8));
16491 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16492 DAG.getConstant(CC1, dl, MVT::i8));
16493 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
16495 // Handle all other FP comparisons here.
16496 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
16497 DAG.getConstant(SSECC, dl, MVT::i8));
16500 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
16501 // result type of SETCC. The bitcast is expected to be optimized away
16502 // during combining/isel.
16503 if (Opc == X86ISD::CMPP)
16504 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
// ---- Integer compares ----
16509 MVT VTOp0 = Op0.getSimpleValueType();
16510 assert(VTOp0 == Op1.getSimpleValueType() &&
16511 "Expected operands with same type!");
16512 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
16513 "Invalid number of packed elements for source and destination!");
16515 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
16516 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
16517 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
16518 // legalizer firstly checks if the first operand in input to the setcc has
16519 // a legal type. If so, then it promotes the return type to that same type.
16520 // Otherwise, the return type is promoted to the 'next legal type' which,
16521 // for a vector of MVT::i1 is always a 128-bit integer vector type.
16523 // We reach this code only if the following two conditions are met:
16524 // 1. Both return type and operand type have been promoted to wider types
16525 // by the type legalizer.
16526 // 2. The original operand type has been promoted to a 256-bit vector.
16528 // Note that condition 2. only applies for AVX targets.
16529 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
16530 return DAG.getZExtOrTrunc(NewOp, dl, VT);
16533 // The non-AVX512 code below works under the assumption that source and
16534 // destination types are the same.
16535 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
16536 "Value types for source and destination must be the same!");
16538 // Break 256-bit integer vector compare into smaller ones.
16539 if (VT.is256BitVector() && !Subtarget.hasInt256())
16540 return Lower256IntVSETCC(Op, DAG);
16542 // Operands are boolean (vectors of i1)
16543 MVT OpVT = Op1.getSimpleValueType();
16544 if (OpVT.getVectorElementType() == MVT::i1)
16545 return LowerBoolVSETCC_AVX512(Op, DAG);
16547 // The result is boolean, but operands are int/float
16548 if (VT.getVectorElementType() == MVT::i1) {
16549 // In AVX-512 architecture setcc returns mask with i1 elements,
16550 // But there is no compare instruction for i8 and i16 elements in KNL.
16551 // In this case use SSE compare
16552 bool UseAVX512Inst =
16553 (OpVT.is512BitVector() ||
16554 OpVT.getScalarSizeInBits() >= 32 ||
16555 (Subtarget.hasBWI() && Subtarget.hasVLX()));
// NOTE(review): the test of UseAVX512Inst is on an omitted line; presumably
// the first return is taken when it is true and the TRUNCATE-of-SETCC
// fallback otherwise.
16558 return LowerIntVSETCC_AVX512(Op, DAG);
16560 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16561 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
16564 // Lower using XOP integer comparisons.
16565 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
16566 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
16567 // Translate compare code to XOP PCOM compare mode.
16568 unsigned CmpMode = 0;
16569 switch (SetCCOpcode) {
16570 default: llvm_unreachable("Unexpected SETCC condition");
16572 case ISD::SETLT: CmpMode = 0x00; break;
16574 case ISD::SETLE: CmpMode = 0x01; break;
16576 case ISD::SETGT: CmpMode = 0x02; break;
16578 case ISD::SETGE: CmpMode = 0x03; break;
16579 case ISD::SETEQ: CmpMode = 0x04; break;
16580 case ISD::SETNE: CmpMode = 0x05; break;
16583 // Are we comparing unsigned or signed integers?
16584 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
16585 ? X86ISD::VPCOMU : X86ISD::VPCOM;
16587 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16588 DAG.getConstant(CmpMode, dl, MVT::i8));
16591 // We are handling one of the integer comparisons here. Since SSE only has
16592 // GT and EQ comparisons for integer, swapping operands and multiple
16593 // operations may be required for some comparisons.
16595 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
16596 bool Subus = false;
// Each pair of cases below relies on fall-through: the first case only sets
// an extra flag (Invert or Swap) and then shares the opcode selection of
// the case that follows it.
16598 switch (SetCCOpcode) {
16599 default: llvm_unreachable("Unexpected SETCC condition");
16600 case ISD::SETNE: Invert = true;
16601 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
16602 case ISD::SETLT: Swap = true;
16603 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
16604 case ISD::SETGE: Swap = true;
16605 case ISD::SETLE: Opc = X86ISD::PCMPGT;
16606 Invert = true; break;
16607 case ISD::SETULT: Swap = true;
16608 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
16609 FlipSigns = true; break;
16610 case ISD::SETUGE: Swap = true;
16611 case ISD::SETULE: Opc = X86ISD::PCMPGT;
16612 FlipSigns = true; Invert = true; break;
16615 // Special case: Use min/max operations for SETULE/SETUGE
16616 MVT VET = VT.getVectorElementType();
16618 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32))
16619 || (Subtarget.hasSSE2() && (VET == MVT::i8));
16622 switch (SetCCOpcode) {
16624 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
16625 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
// Choosing min/max cancels the fixups selected by the generic switch above.
16628 if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
16631 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
16632 if (!MinMax && hasSubus) {
16633 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
16635 // t = psubus Op0, Op1
16636 // pcmpeq t, <0..0>
16637 switch (SetCCOpcode) {
16639 case ISD::SETULT: {
16640 // If the comparison is against a constant we can turn this into a
16641 // setule. With psubus, setule does not require a swap. This is
16642 // beneficial because the constant in the register is no longer
16643 // destructed as the destination so it can be hoisted out of a loop.
16644 // Only do this pre-AVX since vpcmp* is no longer destructive.
16645 if (Subtarget.hasAVX())
16647 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
16649 Subus = true; Invert = false; Swap = false;
16653 // Psubus is better than flip-sign because it requires no inversion.
16654 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
16655 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
16659 Opc = X86ISD::SUBUS;
16665 std::swap(Op0, Op1);
16667 // Check that the operation in question is available (most are plain SSE2,
16668 // but PCMPGTQ and PCMPEQQ have different requirements).
16669 if (VT == MVT::v2i64) {
16670 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
16671 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
16673 // First cast everything to the right type.
16674 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16675 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16677 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16678 // bits of the inputs before performing those operations. The lower
16679 // compare is always unsigned.
// Two alternative masks are built below: sign bit in every 32-bit lane, or
// {Sign, Zero, Sign, Zero}. NOTE(review): the condition choosing between
// them is on an omitted line.
16682 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
16684 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
16685 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
16686 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
16688 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
16689 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
16691 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
16692 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
16693 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
16695 // Create masks for only the low parts/high parts of the 64 bit integers.
16696 static const int MaskHi[] = { 1, 1, 3, 3 };
16697 static const int MaskLo[] = { 0, 0, 2, 2 };
16698 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
16699 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
16700 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
16702 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
16703 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
16706 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16708 return DAG.getBitcast(VT, Result);
16711 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
16712 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
16713 // pcmpeqd + pshufd + pand.
16714 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
16716 // First cast everything to the right type.
16717 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16718 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16721 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
16723 // Make sure the lower and upper halves are both all-ones.
16724 static const int Mask[] = { 1, 0, 3, 2 };
16725 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
16726 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
16729 Result = DAG.getNOT(dl, Result, MVT::v4i32);
16731 return DAG.getBitcast(VT, Result);
16735 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16736 // bits of the inputs before performing those operations.
16738 MVT EltVT = VT.getVectorElementType();
16739 SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
16741 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
16742 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
// Emit the selected compare / min-max / subus node on the (possibly
// swapped and sign-flipped) operands.
16745 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
16747 // If the logical-not of the result is required, perform that now.
16749 Result = DAG.getNOT(dl, Result, VT);
// NOTE(review): the guards for the two PCMPEQ fixups below are on omitted
// lines; presumably the first (compare against Op0) applies to the MinMax
// path and the second (compare against zero) to the Subus path - confirm.
16752 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
16755 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
16756 getZeroVector(VT, Subtarget, DAG, dl));
// Lower a SETCC node. Vector results are delegated to LowerVSETCC; the
// scalar path (result type i8 or i1, see the assert) tries, in order:
// a BT-based form for bit tests against zero, reuse/inversion of an existing
// X86ISD::SETCC compared with 0 or 1, simplification of i1 equality
// compares, and finally an explicit compare (EmitCmp) feeding getSETCC.
// The visible returns on the scalar path wrap the X86 setcc in a TRUNCATE
// to i1. NOTE(review): `dl`, the guards selecting those returns, and the
// non-truncating returns are on lines not shown in this listing.
16761 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
16763 MVT VT = Op.getSimpleValueType();
16765 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
16767 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
16768 && "SetCC type must be 8-bit or 1-bit integer");
16769 SDValue Op0 = Op.getOperand(0);
16770 SDValue Op1 = Op.getOperand(1);
16772 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
16774 // Optimize to BT if possible.
16775 // Lower (X & (1 << N)) == 0 to BT(X, N).
16776 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
16777 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
16778 // Lower (trunc (X >> N) to i1) to BT(X, N).
16779 if (Op0.hasOneUse() && isNullConstant(Op1) &&
16780 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16781 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
16783 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
16788 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
16790 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
16791 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16793 // If the input is a setcc, then reuse the input setcc or use a new one with
16794 // the inverted condition.
16795 if (Op0.getOpcode() == X86ISD::SETCC) {
16796 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
// Invert is true for (setcc == 0) and (setcc != 1), i.e. exactly when the
// outer compare negates the inner setcc.
16797 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
16801 CCode = X86::GetOppositeBranchCondition(CCode);
16802 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
16804 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
// i1 operands: rewrite a compare against 1 as the inverse compare against
// 0, and a compare against a non-zero, non-one operand as (Op0 ^ Op1)
// compared against 0.
16808 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
16809 if (isOneConstant(Op1)) {
16810 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
16811 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
16813 if (!isNullConstant(Op1)) {
16814 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
16815 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
// General case: translate the condition code, emit the compare that
// produces EFLAGS, and materialize the result with getSETCC.
16819 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
16820 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
16821 if (X86CC == X86::COND_INVALID)
16824 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
16825 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
16826 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
16828 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
// Lower a SETCCE node (operands: LHS, RHS, incoming carry, condition code).
// The comparison is performed as an X86ISD::SBB of LHS, RHS and Carry; the
// second result of that node is fed to getSETCC with the translated integer
// condition. The result is truncated to i1 when the node's type is i1.
// NOTE(review): the declaration of `DL` and the final (non-truncating)
// return are on lines not shown in this listing.
16832 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
16833 SDValue LHS = Op.getOperand(0);
16834 SDValue RHS = Op.getOperand(1);
16835 SDValue Carry = Op.getOperand(2);
16836 SDValue Cond = Op.getOperand(3);
16839 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
16840 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
16842 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
// SBB yields two values: the difference (LHS's type) and an i32 result
// that is used as the flags input of the setcc.
16843 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
16844 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
16845 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
16846 if (Op.getSimpleValueType() == MVT::i1)
16847 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
16851 /// Return true if opcode is a X86 logical comparison.
///
/// Three groups of values are recognized: CMP/COMI/UCOMI/SAHF nodes; result
/// number 1 of the listed arithmetic/logic nodes (ADD, SUB, ADC, SBB, SMUL,
/// UMUL, INC, DEC, OR, XOR, AND); and result number 2 of UMUL.
/// NOTE(review): the `return` statements are on lines not shown in this
/// listing.
16852 static bool isX86LogicalCmp(SDValue Op) {
16853 unsigned Opc = Op.getOpcode();
16854 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
16855 Opc == X86ISD::SAHF)
16857 if (Op.getResNo() == 1 &&
16858 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
16859 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
16860 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
16861 Opc == X86ISD::XOR || Opc == X86ISD::AND))
16864 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
// Return true if V is an ISD::TRUNCATE whose input is known (via
// MaskedValueIsZero) to have all of the bits that the truncate discards
// equal to zero, i.e. the top (InBits - Bits) bits of the input.
// NOTE(review): the early return for non-TRUNCATE nodes is on a line not
// shown in this listing.
16870 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
16871 if (V.getOpcode() != ISD::TRUNCATE)
16874 SDValue VOp0 = V.getOperand(0);
16875 unsigned InBits = VOp0.getValueSizeInBits();
16876 unsigned Bits = V.getValueSizeInBits();
16877 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
// Lower a SELECT node (operands: condition, true value Op1, false value
// Op2). Visible strategies, in order:
//  * scalar FP select fed by a single-use SETCC: FSETCCM + SELECT/SELECTS on
//    AVX-512, otherwise FSETCC followed by either a VSELECT on widened
//    vectors (AVX, non-constant operands) or an FANDN/FAND/FOR sequence;
//  * AVX-512 fallback for scalar f32/f64 with an i1 condition: SELECTS;
//  * vectors of i1: select on the scalar integer form, or widen v2i1/v4i1
//    to v8i1;
//  * integer selects of -1/0 patterns via SETCC_CARRY;
//  * otherwise an X86ISD::CMOV on the flags-producing node and condition
//    code derived from the condition.
// NOTE(review): this listing omits many source lines (declarations such as
// `DL`, `CC`, `Op1Scalar`/`Op2Scalar`, `X86Cond`, `VTs`, several guards,
// assignments, returns and closing braces); comments below describe only
// what is visible.
16880 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// AddTest is initialized to true here; its later uses are not visible in
// this listing.
16881 bool AddTest = true;
16882 SDValue Cond = Op.getOperand(0);
16883 SDValue Op1 = Op.getOperand(1);
16884 SDValue Op2 = Op.getOperand(2);
16886 MVT VT = Op1.getSimpleValueType();
16889 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
16890 // are available or VBLENDV if AVX is available.
16891 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
16892 if (Cond.getOpcode() == ISD::SETCC &&
16893 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
16894 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
16895 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
16896 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
16897 int SSECC = translateX86FSETCC(
16898 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
16901 if (Subtarget.hasAVX512()) {
16902 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
16903 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
16904 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
16905 DL, VT, Cmp, Op1, Op2);
16908 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
16909 DAG.getConstant(SSECC, DL, MVT::i8));
16911 // If we have AVX, we can use a variable vector select (VBLENDV) instead
16912 // of 3 logic instructions for size savings and potentially speed.
16913 // Unfortunately, there is no scalar form of VBLENDV.
16915 // If either operand is a constant, don't try this. We can expect to
16916 // optimize away at least one of the logic instructions later in that
16917 // case, so that sequence would be faster than a variable blend.
16919 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
16920 // uses XMM0 as the selection register. That may need just as many
16921 // instructions as the AND/ANDN/OR sequence due to register moves, so
16924 if (Subtarget.hasAVX() &&
16925 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
16927 // Convert to vectors, do a VSELECT, and convert back to scalar.
16928 // All of the conversions should be optimized away.
16930 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
16931 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
16932 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
16933 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
16935 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
16936 VCmp = DAG.getBitcast(VCmpVT, VCmp);
16938 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
16940 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
16941 VSel, DAG.getIntPtrConstant(0, DL));
// Logic-op form: (Cmp ANDN Op2) FOR (Cmp AND Op1).
16943 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
16944 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
16945 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
16949 // AVX512 fallback is to lower selects of scalar floats to masked moves.
16950 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
16951 Subtarget.hasAVX512())
16952 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
// Vectors of i1: if both operands have a scalar integer form (a constant
// build vector converted by ConvertI1VectorToInteger, or the source of a
// bitcast), select on the scalars and bitcast the result back.
16954 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
16956 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
16957 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
16958 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
16959 Op1Scalar = Op1.getOperand(0);
16961 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
16962 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
16963 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
16964 Op2Scalar = Op2.getOperand(0);
16965 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
16966 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
16967 Op1Scalar.getValueType(),
16968 Cond, Op1Scalar, Op2Scalar);
16969 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
16970 return DAG.getBitcast(VT, newSelect);
// Sizes differ: reinterpret the scalar as v8i1 and take the low subvector.
16971 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
16972 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
16973 DAG.getIntPtrConstant(0, DL));
// v2i1/v4i1: widen both operands into v8i1 (upper elements undef), select,
// then extract the original-width subvector at index 0.
16977 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
16978 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
16979 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16980 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
16981 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
16982 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
16983 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
16985 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
// A generic SETCC condition is first lowered through LowerSETCC.
// NOTE(review): what is done with NewCond is on an omitted line.
16988 if (Cond.getOpcode() == ISD::SETCC) {
16989 if (SDValue NewCond = LowerSETCC(Cond, DAG))
16993 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
16994 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
16995 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
16996 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
16997 if (Cond.getOpcode() == X86ISD::SETCC &&
16998 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
16999 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17000 SDValue Cmp = Cond.getOperand(1);
17002 unsigned CondCode =cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17004 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17005 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
// Y is whichever select operand is not the all-ones constant.
17006 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17008 SDValue CmpOp0 = Cmp.getOperand(0);
17009 // Apply further optimizations for special cases
17010 // (select (x != 0), -1, 0) -> neg & sbb
17011 // (select (x == 0), 0, -1) -> neg & sbb
17012 if (isNullConstant(Y) &&
17013 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17014 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17015 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17016 DAG.getConstant(0, DL,
17017 CmpOp0.getValueType()),
17019 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17020 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17021 SDValue(Neg.getNode(), 1));
// General form: compare x against 1 and read the result through
// SETCC_CARRY with COND_B.
17025 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17026 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17027 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17029 SDValue Res = // Res = 0 or -1.
17030 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17031 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17033 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17034 Res = DAG.getNOT(DL, Res, Res.getValueType());
17036 if (!isNullConstant(Op2))
17037 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17042 // Look past (and (setcc_carry (cmp ...)), 1).
17043 if (Cond.getOpcode() == ISD::AND &&
17044 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17045 isOneConstant(Cond.getOperand(1)))
17046 Cond = Cond.getOperand(0);
17048 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17049 // setting operand in place of the X86ISD::SETCC.
17050 unsigned CondOpcode = Cond.getOpcode();
17051 if (CondOpcode == X86ISD::SETCC ||
17052 CondOpcode == X86ISD::SETCC_CARRY) {
17053 CC = Cond.getOperand(0);
17055 SDValue Cmp = Cond.getOperand(1);
17056 unsigned Opc = Cmp.getOpcode();
17057 MVT VT = Op.getSimpleValueType();
// For scalar FP types not held in SSE registers, the condition code must be
// one that hasFPCMov accepts.
17059 bool IllegalFPCMov = false;
17060 if (VT.isFloatingPoint() && !VT.isVector() &&
17061 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17062 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17064 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17065 Opc == X86ISD::BT) { // FIXME
// Overflow-intrinsic conditions: re-emit the arithmetic as the matching X86
// node and use its flags result with COND_B (unsigned add/sub) or COND_O.
17069 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17070 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17071 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17072 Cond.getOperand(0).getValueType() != MVT::i8)) {
17073 SDValue LHS = Cond.getOperand(0);
17074 SDValue RHS = Cond.getOperand(1);
17075 unsigned X86Opcode;
17078 switch (CondOpcode) {
17079 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17080 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17081 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17082 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17083 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17084 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17085 default: llvm_unreachable("unexpected overflowing operator");
// UMUL produces two values of LHS's type plus a further result (its type is
// on an omitted line), so the flags are result 2 for UMULO and result 1 for
// every other operator.
17087 if (CondOpcode == ISD::UMULO)
17088 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17091 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17093 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17095 if (CondOpcode == ISD::UMULO)
17096 Cond = X86Op.getValue(2);
17098 Cond = X86Op.getValue(1);
17100 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17105 // Look past the truncate if the high bits are known zero.
17106 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17107 Cond = Cond.getOperand(0);
17109 // We know the result of AND is compared against zero. Try to match
17111 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17112 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17113 CC = NewSetCC.getOperand(0);
17114 Cond = NewSetCC.getOperand(1);
// Fallback: test the condition value itself against zero (COND_NE).
// NOTE(review): the guard for this fallback is on an omitted line.
17121 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17122 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17125 // a < b ? -1 : 0 -> RES = ~setcc_carry
17126 // a < b ? 0 : -1 -> RES = setcc_carry
17127 // a >= b ? -1 : 0 -> RES = setcc_carry
17128 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17129 if (Cond.getOpcode() == X86ISD::SUB) {
17130 Cond = ConvertCmpIfNecessary(Cond, DAG);
17131 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17133 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17134 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17135 (isNullConstant(Op1) || isNullConstant(Op2))) {
17136 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17137 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17139 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17140 return DAG.getNOT(DL, Res, Res.getValueType());
17145 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17146 // widen the cmov and push the truncate through. This avoids introducing a new
17147 // branch during isel and doesn't add any extensions.
17148 if (Op.getValueType() == MVT::i8 &&
17149 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17150 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17151 if (T1.getValueType() == T2.getValueType() &&
17152 // Blacklist CopyFromReg to avoid partial register stalls.
17153 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17154 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17155 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17156 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17160 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17161 // condition is true.
// Note the operand order: the false value (Op2) comes first, then the true
// value (Op1).
17162 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17163 SDValue Ops[] = { Op2, Op1, CC, Cond };
17164 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
// Lower a vector sign-extension on AVX-512 targets.
//  * i1 inputs are emitted directly as X86ISD::VSEXT when the BWI/DQI/VLX
//    feature and size checks below pass.
//  * 512-bit results from non-i1 inputs become X86ISD::VSEXT (folding an
//    existing VSEXT/VZEXT input by re-extending its operand).
//  * Remaining i1 inputs are expanded with a VSELECT between all-ones and
//    zero in a vector type with 512/NumElts bits per element, followed by
//    X86ISD::VTRUNC to VT.
// NOTE(review): `dl` and several `return` statements are on lines not shown
// in this listing.
17167 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17168 const X86Subtarget &Subtarget,
17169 SelectionDAG &DAG) {
17170 MVT VT = Op->getSimpleValueType(0);
17171 SDValue In = Op->getOperand(0);
17172 MVT InVT = In.getSimpleValueType();
17173 MVT VTElt = VT.getVectorElementType();
17174 MVT InVTElt = InVT.getVectorElementType();
// Result elements of at most 16 bits require BWI and elements of at least
// 32 bits require DQI; results of at most 256 bits additionally require VLX.
17178 if ((InVTElt == MVT::i1) &&
17179 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
17180 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
17182 ((Subtarget.hasBWI() && VT.is512BitVector() &&
17183 VTElt.getSizeInBits() <= 16)) ||
17185 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
17186 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
17188 ((Subtarget.hasDQI() && VT.is512BitVector() &&
17189 VTElt.getSizeInBits() >= 32))))
17190 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17192 unsigned NumElts = VT.getVectorNumElements();
// NOTE(review): the statement controlled by this check is on an omitted
// line.
17194 if (NumElts != 8 && NumElts != 16 && !Subtarget.hasBWI())
17197 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1) {
17198 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17199 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
17200 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17203 assert (InVT.getVectorElementType() == MVT::i1 && "Unexpected vector type");
// ExtVT has NumElts elements of 512/NumElts bits each, i.e. 512 bits total.
17204 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
17205 SDValue NegOne = DAG.getConstant(
17206 APInt::getAllOnesValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
17207 SDValue Zero = DAG.getConstant(
17208 APInt::getNullValue(ExtVT.getScalarSizeInBits()), dl, ExtVT);
// Materialize the extension: each set mask bit selects all-ones, else zero.
17210 SDValue V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
// NOTE(review): the statement taken when VT is 512 bits is on an omitted
// line; presumably V is returned as-is in that case.
17211 if (VT.is512BitVector())
17213 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17216 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17217 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17218 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17219 // MVT::v64i8 when BWI is not supported, but AVX512 is.
//
// Input and result vectors have the same total size (asserted) with the
// result elements wider than the input elements (asserted). With SSE4.1 the
// node maps directly to X86ISD::VSEXT / X86ISD::VZEXT; otherwise the sign
// extension is built from UNPCKL steps and an arithmetic right shift (VSRAI),
// with v2i64 results assembled by a final shuffle.
// NOTE(review): `dl`, `Curr`, `CurrVT` and several `return` statements are
// on lines not shown in this listing.
17220 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17221 const X86Subtarget &Subtarget,
17222 SelectionDAG &DAG) {
17223 SDValue In = Op->getOperand(0);
17224 MVT VT = Op->getSimpleValueType(0);
17225 MVT InVT = In.getSimpleValueType();
17226 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17228 MVT SVT = VT.getVectorElementType();
17229 MVT InSVT = InVT.getVectorElementType();
17230 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
// Only i16/i32/i64 result elements, i8/i16/i32 input elements, and vector
// widths supported by the subtarget are handled. NOTE(review): the
// statements controlled by these three checks are on omitted lines.
17232 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
17234 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
17236 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
17237 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
17238 !(VT.is512BitVector() && Subtarget.hasAVX512()))
17243 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17244 // For 512-bit vectors, we need 128-bits or 256-bits.
17245 if (VT.getSizeInBits() > 128) {
17246 // Input needs to be at least the same number of elements as output, and
17247 // at least 128-bits.
17248 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17249 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17252 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17253 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17255 // SSE41 targets can use the pmovsx* instructions directly.
17256 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17257 X86ISD::VSEXT : X86ISD::VZEXT;
17258 if (Subtarget.hasSSE41())
17259 return DAG.getNode(ExtOpc, dl, VT, In);
17261 // We should only get here for sign extend.
17262 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17263 "Unexpected opcode!");
17265 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
17269 // As SRAI is only available on i16/i32 types, we expand only up to i32
17270 // and handle i64 separately.
// Each iteration unpacks with undef as the first operand, then
// reinterprets the result as a vector with elements twice as wide and half
// as many of them.
17271 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17272 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17273 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17274 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17275 Curr = DAG.getBitcast(CurrVT, Curr);
// Arithmetic-shift right by the number of bits the element width grew,
// which sign-fills the widened element.
17278 SDValue SignExt = Curr;
17279 if (CurrVT != InVT) {
17280 unsigned SignExtShift =
17281 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17282 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17283 DAG.getConstant(SignExtShift, dl, MVT::i8));
// v2i64 result from v4i32: Sign holds each lane shifted right by 31 (all
// copies of the sign bit); the shuffle {0, 4, 1, 5} interleaves the first
// two value lanes with their sign lanes.
17289 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17290 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17291 DAG.getConstant(31, dl, MVT::i8));
17292 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
17293 return DAG.getBitcast(VT, Ext);
// Custom lowering for a vector sign extend (Op's operand 0 is the input).
// 512-bit results and inputs with i1 elements are forwarded to
// LowerSIGN_EXTEND_AVX512. With Int256 a single X86ISD::VSEXT is emitted.
// Otherwise the input is split into two halves with shuffles, each half is
// extended with VSEXT to half of the result type, and the two halves are
// concatenated.
17299 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17300 SelectionDAG &DAG) {
17301 MVT VT = Op->getSimpleValueType(0);
17302 SDValue In = Op->getOperand(0);
17303 MVT InVT = In.getSimpleValueType();
17306 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17307 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
// The condition below is true for every (VT, InVT) pair other than
// v4i32->v4i64, v8i16->v8i32 and v16i8->v16i16.
17309 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17310 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
17311 (VT != MVT::v16i16 || InVT != MVT::v16i8))
17314 if (Subtarget.hasInt256())
17315 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17317 // Optimize vectors in AVX mode
17318 // Sign extend v8i16 to v8i32 and
17321 // Divide input vector into two parts
17322 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
17323 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
17324 // concat the vectors to original VT
17326 unsigned NumElems = InVT.getVectorNumElements();
17327 SDValue Undef = DAG.getUNDEF(InVT);
// Both masks start as all -1 (undef); only the first NumElems/2 entries are
// written by the loops.
17329 SmallVector<int,8> ShufMask1(NumElems, -1);
17330 for (unsigned i = 0; i != NumElems/2; ++i)
17333 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
// The second mask moves the upper half of the input into the low positions.
17335 SmallVector<int,8> ShufMask2(NumElems, -1);
17336 for (unsigned i = 0; i != NumElems/2; ++i)
17337 ShufMask2[i] = i + NumElems/2;
17339 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
// HalfVT has the result element type and half the result element count.
17341 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17342 VT.getVectorNumElements() / 2);
17344 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
17345 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
17347 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17350 // Lower a truncating store. We need a special lowering for vXi1 vectors.
//
// StOp must be a truncating StoreSDNode whose memory type is a vector of i1
// (both are asserted). The returned value is the replacement store, or a
// TokenFactor of two stores when the value is split in half.
17351 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17352 SelectionDAG &DAG) {
17353 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
17355 EVT MemVT = St->getMemoryVT();
17356 assert(St->isTruncatingStore() && "We only custom truncating store.");
17357 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17358 "Expected truncstore of i1 vector");
17360 SDValue Op = St->getValue();
17361 MVT OpVT = Op.getValueType().getSimpleVT();
17362 unsigned NumElts = OpVT.getVectorNumElements();
17363 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17365 // Truncate and store - everything is legal
17366 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
// Mask types narrower than 8 bits are widened to v8i1 by inserting them at
// index 0 of an undef v8i1 before the store.
17367 if (MemVT.getSizeInBits() < 8)
17368 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17369 DAG.getUNDEF(MVT::v8i1), Op,
17370 DAG.getIntPtrConstant(0, dl));
17371 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17372 St->getMemOperand());
17375 // A subset, assume that we have only AVX-512F
17376 if (NumElts <= 8) {
17378 // Extend to 8-elts vector
17379 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17380 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17381 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17383 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17384 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17385 St->getMemOperand());
17388 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17389 // Divide the vector into 2 parts and store each part separately
17390 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17391 DAG.getIntPtrConstant(0, dl));
17392 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17393 SDValue BasePtr = St->getBasePtr();
17394 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17395 St->getMemOperand());
17396 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17397 DAG.getIntPtrConstant(16, dl));
17398 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
// The high v16i1 half is stored 2 bytes (16 bits) past the base pointer.
17400 SDValue BasePtrHi =
17401 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17402 DAG.getConstant(2, dl, BasePtr.getValueType()));
// NOTE(review): both half-stores reuse St->getMemOperand() unchanged, although
// this one writes at BasePtr + 2 -- confirm whether the memory operand should
// be adjusted for the offset and size.
17404 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17405 BasePtrHi, St->getMemOperand());
// Both stores hang off the original chain; merge their output chains.
17406 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
// Lower an extending load whose memory type is a vector of i1 (asserted).
// The mask is loaded first and then extended with ExtOpcode, which is
// ISD::ZERO_EXTEND for a ZEXTLOAD and ISD::SIGN_EXTEND for every other
// extension type. Users of the original load's chain result are redirected to
// the chain of the new load(s).
17409 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17410 const X86Subtarget &Subtarget,
17411 SelectionDAG &DAG) {
17413 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17415 EVT MemVT = Ld->getMemoryVT();
17416 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17417 "Expected i1 vector load");
17418 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17419 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17420 MVT VT = Op.getValueType().getSimpleVT();
17421 unsigned NumElts = VT.getVectorNumElements();
17423 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17425 // Load and extend - everything is legal
// This path loads a full v8i1, extends it to an 8-element vector of the
// result's scalar type and extracts the low VT-sized subvector.
17427 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
17429 Ld->getMemOperand());
17430 // Replace chain users with the new chain.
17431 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17432 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17433 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17434 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
17436 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17437 DAG.getIntPtrConstant(0, dl));
17439 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
17441 Ld->getMemOperand());
17442 // Replace chain users with the new chain.
17443 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17444 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17446 // Finally, do a normal extend (zero or sign, per ExtOpcode) to the desired
// register type.
17447 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
17450 if (NumElts <= 8) {
17451 // A subset, assume that we have only AVX-512F
// The mask is loaded as a scalar integer of at least 8 bits and then bitcast
// to a vector of i1 with the same number of bits.
17452 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
17453 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
17454 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
17456 Ld->getMemOperand());
17457 // Replace chain users with the new chain.
17458 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17459 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17461 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
17462 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
17465 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
17467 // We must take care of v4i1 and v2i1: extend all 8 mask bits, then extract
// the low VT-sized subvector.
17469 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17470 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
17471 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17472 DAG.getIntPtrConstant(0, dl));
17475 assert(VT == MVT::v32i8 && "Unexpected extload type");
// v32i8 result: load two v16i1 halves, extend each to v16i8 and concatenate.
17477 SmallVector<SDValue, 2> Chains;
17479 SDValue BasePtr = Ld->getBasePtr();
17480 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17482 Ld->getMemOperand());
17483 Chains.push_back(LoadLo.getValue(1));
// Address 2 bytes (16 mask bits) past the base pointer.
17485 SDValue BasePtrHi =
17486 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17487 DAG.getConstant(2, dl, BasePtr.getValueType()));
17489 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
17491 Ld->getMemOperand());
17492 Chains.push_back(LoadHi.getValue(1));
// Merge both load chains and redirect users of the original load's chain.
17493 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17494 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
17496 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
17497 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
17498 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
17501 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
17502 // may emit an illegal shuffle but the expansion is still better than scalar
17503 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
17504 // we'll emit a shuffle and an arithmetic shift.
17505 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
17506 // TODO: It is possible to support ZExt by zeroing the undef values during
17507 // the shuffle phase or after the shuffle.
//
// Op must be a LoadSDNode producing an integer vector (asserted). Loads whose
// memory element type is i1 are forwarded to LowerExtended1BitVectorLoad.
// Only EXTLOAD and SEXTLOAD are accepted for the remaining cases (asserted).
17508 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
17509 SelectionDAG &DAG) {
17510 MVT RegVT = Op.getSimpleValueType();
17511 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
17512 assert(RegVT.isInteger() &&
17513 "We only custom lower integer vector sext loads.");
17515 // Nothing useful we can do without SSE2 shuffles.
17516 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
17518 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
17520 EVT MemVT = Ld->getMemoryVT();
17521 if (MemVT.getScalarType() == MVT::i1)
17522 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
17524 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17525 unsigned RegSz = RegVT.getSizeInBits();
17527 ISD::LoadExtType Ext = Ld->getExtensionType();
17529 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
17530 && "Only anyext and sext are currently implemented.");
17531 assert(MemVT != RegVT && "Cannot extend to the same type");
17532 assert(MemVT.isVector() && "Must load a vector from memory");
17534 unsigned NumElems = RegVT.getVectorNumElements();
17535 unsigned MemSz = MemVT.getSizeInBits();
17536 assert(RegSz > MemSz && "Register size must be greater than the mem size");
17538 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
17539 // The only way in which we have a legal 256-bit vector result but not the
17540 // integer 256-bit operations needed to directly lower a sextload is if we
17541 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
17542 // a 128-bit vector and a normal sign_extend to 256-bits that should get
17543 // correctly legalized. We do this late to allow the canonical form of
17544 // sextload to persist throughout the rest of the DAG combiner -- it wants
17545 // to fold together any extensions it can, and so will fuse a sign_extend
17546 // of an sextload into a sextload targeting a wider value.
17548 if (MemSz == 128) {
17549 // Just switch this to a normal load.
17550 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
17551 "it must be a legal 128-bit vector "
17553 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
17554 Ld->getPointerInfo(), Ld->getAlignment(),
17555 Ld->getMemOperand()->getFlags());
17557 assert(MemSz < 128 &&
17558 "Can't extend a type wider than 128 bits to a 256 bit vector!");
17559 // Do an sext load to a 128-bit vector type. We want to use the same
17560 // number of elements, but elements half as wide. This will end up being
17561 // recursively lowered by this routine, but will succeed as we definitely
17562 // have all the necessary features if we're using AVX1.
17564 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
17565 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
17567 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
17568 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
17569 Ld->getMemOperand()->getFlags());
17572 // Replace chain users with the new chain.
17573 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17574 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17576 // Finally, do a normal sign-extend to the desired register.
17577 return DAG.getSExtOrTrunc(Load, dl, RegVT);
17580 // All sizes must be a power of two.
17581 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
17582 "Non-power-of-two elements are not custom lowered!");
17584 // Attempt to load the original value using scalar loads.
17585 // Find the largest scalar type that divides the total loaded size.
17586 MVT SclrLoadTy = MVT::i8;
17587 for (MVT Tp : MVT::integer_valuetypes()) {
17588 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
17593 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
17594 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
17596 SclrLoadTy = MVT::f64;
17598 // Calculate the number of scalar loads that we need to perform
17599 // in order to load our vector from memory.
17600 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
17602 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
17603 "Can only lower sext loads with a single scalar load!");
// loadRegZize is the bit width used for the intermediate vector types below.
// It starts as the result register size.
17605 unsigned loadRegZize = RegSz;
17606 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
17609 // Represent our vector as a sequence of elements which are the
17610 // largest scalar that we can load.
17611 EVT LoadUnitVecVT = EVT::getVectorVT(
17612 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
17614 // Represent the data using the same element type that is stored in
17615 // memory. In practice, we ''widen'' MemVT.
17617 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
17618 loadRegZize / MemVT.getScalarSizeInBits());
17620 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
17621 "Invalid vector type");
17623 // We can't shuffle using an illegal type.
17624 assert(TLI.isTypeLegal(WideVecVT) &&
17625 "We only lower types that form legal widened vector types");
// Issue NumLoads scalar loads from consecutive addresses, each one advancing
// Ptr by the scalar size in bytes, and collect their chains for a TokenFactor.
17627 SmallVector<SDValue, 8> Chains;
17628 SDValue Ptr = Ld->getBasePtr();
17629 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
17630 TLI.getPointerTy(DAG.getDataLayout()));
17631 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
17633 for (unsigned i = 0; i < NumLoads; ++i) {
17634 // Perform a single load.
17635 SDValue ScalarLoad =
17636 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
17637 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
17638 Chains.push_back(ScalarLoad.getValue(1));
17639 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17640 // another round of DAGCombining.
17642 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
17644 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
17645 ScalarLoad, DAG.getIntPtrConstant(i, dl));
17647 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17650 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17652 // Bitcast the loaded value to a vector of the original element type, in
17653 // the size of the target vector type.
17654 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
// SizeRatio is the number of memory-sized elements per register-sized element.
17655 unsigned SizeRatio = RegSz / MemSz;
17657 if (Ext == ISD::SEXTLOAD) {
17658 // If we have SSE4.1, we can directly emit a VSEXT node.
17659 if (Subtarget.hasSSE41()) {
17660 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
17661 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17665 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
17667 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
17668 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
17670 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
17671 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17675 // Redistribute the loaded elements into the different locations.
// Loaded element i is placed at index i * SizeRatio; every other lane stays
// -1 (undef).
17676 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
17677 for (unsigned i = 0; i != NumElems; ++i)
17678 ShuffleVec[i * SizeRatio] = i;
17680 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17681 DAG.getUNDEF(WideVecVT), ShuffleVec);
17683 // Bitcast to the requested type.
17684 Shuff = DAG.getBitcast(RegVT, Shuff);
17685 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
17689 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
17690 /// each of which has no other use apart from the AND / OR.
/// \p Opc is an output parameter: it is always overwritten with the opcode of
/// \p Op, whether or not the pattern matches.
17691 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
17692 Opc = Op.getOpcode();
17693 if (Opc != ISD::OR && Opc != ISD::AND)
// Both operands must be single-use X86ISD::SETCC nodes.
17695 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17696 Op.getOperand(0).hasOneUse() &&
17697 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
17698 Op.getOperand(1).hasOneUse());
17701 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
17702 /// SETCC node has a single use.
/// Only operand 1 of the XOR is tested for the constant one, and only
/// operand 0 is tested for being the SETCC.
17703 static bool isXor1OfSetCC(SDValue Op) {
17704 if (Op.getOpcode() != ISD::XOR)
17706 if (isOneConstant(Op.getOperand(1)))
17707 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17708 Op.getOperand(0).hasOneUse();
// Lower a conditional branch to X86ISD::BRCOND. Operand 0 of Op is the chain,
// operand 1 the condition and operand 2 the destination. The function tries
// to reuse flags produced by an existing compare, overflow arithmetic, or an
// AND/OR/XOR of SETCC nodes. Some floating-point cases emit an extra
// X86ISD::BRCOND on the chain before the final one.
17712 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
17713 bool addTest = true;
17714 SDValue Chain = Op.getOperand(0);
17715 SDValue Cond = Op.getOperand(1);
17716 SDValue Dest = Op.getOperand(2);
17719 bool Inverted = false;
17721 if (Cond.getOpcode() == ISD::SETCC) {
17722 // Check for setcc([su]{add,sub,mul}o == 0).
// The compared value must be result number 1 of the overflow node.
17723 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
17724 isNullConstant(Cond.getOperand(1)) &&
17725 Cond.getOperand(0).getResNo() == 1 &&
17726 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
17727 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
17728 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
17729 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
17730 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
17731 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
17733 Cond = Cond.getOperand(0);
17735 if (SDValue NewCond = LowerSETCC(Cond, DAG))
17740 // FIXME: LowerXALUO doesn't handle these!!
17741 else if (Cond.getOpcode() == X86ISD::ADD ||
17742 Cond.getOpcode() == X86ISD::SUB ||
17743 Cond.getOpcode() == X86ISD::SMUL ||
17744 Cond.getOpcode() == X86ISD::UMUL)
17745 Cond = LowerXALUO(Cond, DAG);
17748 // Look past (and (setcc_carry (cmp ...)), 1).
17749 if (Cond.getOpcode() == ISD::AND &&
17750 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17751 isOneConstant(Cond.getOperand(1)))
17752 Cond = Cond.getOperand(0);
17754 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17755 // setting operand in place of the X86ISD::SETCC.
17756 unsigned CondOpcode = Cond.getOpcode();
17757 if (CondOpcode == X86ISD::SETCC ||
17758 CondOpcode == X86ISD::SETCC_CARRY) {
17759 CC = Cond.getOperand(0);
17761 SDValue Cmp = Cond.getOperand(1);
17762 unsigned Opc = Cmp.getOpcode();
17763 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
17764 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
17768 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
17772 // These can only come from an arithmetic instruction with overflow,
17773 // e.g. SADDO, UADDO.
17774 Cond = Cond.getOperand(1);
// Overflow intrinsics used directly as the condition. Multiplies are only
// handled here when the operand type is not i8.
17780 CondOpcode = Cond.getOpcode();
17781 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17782 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17783 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17784 Cond.getOperand(0).getValueType() != MVT::i8)) {
17785 SDValue LHS = Cond.getOperand(0);
17786 SDValue RHS = Cond.getOperand(1);
17787 unsigned X86Opcode;
17790 // Keep this in sync with LowerXALUO, otherwise we might create redundant
17791 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
// Unsigned add/sub test COND_B; the signed variants and both multiplies test
// COND_O. A signed add/sub whose RHS is the constant one uses INC/DEC.
17793 switch (CondOpcode) {
17794 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17796 if (isOneConstant(RHS)) {
17797 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
17800 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17801 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17803 if (isOneConstant(RHS)) {
17804 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
17807 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17808 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17809 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17810 default: llvm_unreachable("unexpected overflowing operator");
17813 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
// UMUL produces two value results before the flags, so its flags are result
// number 2. For every other opcode the flags are result number 1.
17814 if (CondOpcode == ISD::UMULO)
17815 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17818 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17820 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
17822 if (CondOpcode == ISD::UMULO)
17823 Cond = X86Op.getValue(2);
17825 Cond = X86Op.getValue(1);
17827 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17831 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
17832 SDValue Cmp = Cond.getOperand(0).getOperand(1);
17833 if (CondOpc == ISD::OR) {
17834 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
17835 // two branches instead of an explicit OR instruction with a
// Both SETCCs must read the same logical compare. A first branch on the
// first SETCC's condition is emitted here; CC then holds the second one.
17837 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17838 isX86LogicalCmp(Cmp)) {
17839 CC = Cond.getOperand(0).getOperand(0);
17840 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17841 Chain, Dest, CC, Cmp);
17842 CC = Cond.getOperand(1).getOperand(0);
17846 } else { // ISD::AND
17847 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
17848 // two branches instead of an explicit AND instruction with a
17849 // separate test. However, we only do this if this block doesn't
17850 // have a fall-through edge, because this requires an explicit
17851 // jmp when the condition is false.
17852 if (Cmp == Cond.getOperand(1).getOperand(1) &&
17853 isX86LogicalCmp(Cmp) &&
17854 Op.getNode()->hasOneUse()) {
17855 X86::CondCode CCode =
17856 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17857 CCode = X86::GetOppositeBranchCondition(CCode);
17858 CC = DAG.getConstant(CCode, dl, MVT::i8);
17859 SDNode *User = *Op.getNode()->use_begin();
17860 // Look for an unconditional branch following this conditional branch.
17861 // We need this because we need to reverse the successors in order
17862 // to implement FCMP_OEQ.
17863 if (User->getOpcode() == ISD::BR) {
17864 SDValue FalseBB = User->getOperand(1);
// Retarget the unconditional branch to Dest, updating the node in place.
17866 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17867 assert(NewBR == User);
17871 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17872 Chain, Dest, CC, Cmp);
17873 X86::CondCode CCode =
17874 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
17875 CCode = X86::GetOppositeBranchCondition(CCode);
17876 CC = DAG.getConstant(CCode, dl, MVT::i8);
17882 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
17883 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
17884 // It should be transformed during dag combiner except when the condition
17885 // is set by an arithmetic-with-overflow node.
17886 X86::CondCode CCode =
17887 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
17888 CCode = X86::GetOppositeBranchCondition(CCode);
17889 CC = DAG.getConstant(CCode, dl, MVT::i8);
17890 Cond = Cond.getOperand(0).getOperand(1);
17892 } else if (Cond.getOpcode() == ISD::SETCC &&
17893 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
17894 // For FCMP_OEQ, we can emit
17895 // two branches instead of an explicit AND instruction with a
17896 // separate test. However, we only do this if this block doesn't
17897 // have a fall-through edge, because this requires an explicit
17898 // jmp when the condition is false.
17899 if (Op.getNode()->hasOneUse()) {
17900 SDNode *User = *Op.getNode()->use_begin();
17901 // Look for an unconditional branch following this conditional branch.
17902 // We need this because we need to reverse the successors in order
17903 // to implement FCMP_OEQ.
17904 if (User->getOpcode() == ISD::BR) {
17905 SDValue FalseBB = User->getOperand(1);
17907 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17908 assert(NewBR == User);
// Emit a first branch on COND_NE here; CC is then set to COND_P.
17912 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17913 Cond.getOperand(0), Cond.getOperand(1));
17914 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17915 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17916 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17917 Chain, Dest, CC, Cmp);
17918 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
17923 } else if (Cond.getOpcode() == ISD::SETCC &&
17924 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
17925 // For FCMP_UNE, we can emit
17926 // two branches instead of an explicit OR instruction with a
17927 // separate test. However, we only do this if this block doesn't
17928 // have a fall-through edge, because this requires an explicit
17929 // jmp when the condition is false.
17930 if (Op.getNode()->hasOneUse()) {
17931 SDNode *User = *Op.getNode()->use_begin();
17932 // Look for an unconditional branch following this conditional branch.
17933 // We need this because we need to reverse the successors in order
17934 // to implement FCMP_UNE.
17935 if (User->getOpcode() == ISD::BR) {
17936 SDValue FalseBB = User->getOperand(1);
17938 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
17939 assert(NewBR == User);
// Emit a first branch on COND_NE here; CC is then set to COND_NP.
17942 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
17943 Cond.getOperand(0), Cond.getOperand(1));
17944 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17945 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
17946 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17947 Chain, Dest, CC, Cmp);
17948 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
17958 // Look past the truncate if the high bits are known zero.
17959 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17960 Cond = Cond.getOperand(0);
17962 // We know the result is compared against zero. Try to match it to BT.
17963 if (Cond.hasOneUse()) {
17964 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
17965 CC = NewSetCC.getOperand(0);
17966 Cond = NewSetCC.getOperand(1);
// Fallback: test Cond explicitly and branch on COND_NE, or on COND_E when
// Inverted is set.
17973 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
17974 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
17975 Cond = EmitTest(Cond, X86Cond, dl, DAG);
17977 Cond = ConvertCmpIfNecessary(Cond, DAG);
17978 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
17979 Chain, Dest, CC, Cond);
17982 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
17983 // Calls to _alloca are needed to probe the stack when allocating more than 4k
17984 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
17985 // that the guard pages used by the OS virtual memory manager are allocated in
17986 // correct sequence.
//
// Operand 0 of Op is the chain, operand 1 the allocation size and operand 2
// the requested alignment (a constant). The lowering is bracketed by
// CALLSEQ_START/CALLSEQ_END and returns the merged pair {Result, Chain}.
// Three strategies are visible below: a direct stack-pointer adjustment,
// X86ISD::SEG_ALLOCA when the function uses split stacks, and
// X86ISD::WIN_ALLOCA otherwise.
17988 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
17989 SelectionDAG &DAG) const {
17990 MachineFunction &MF = DAG.getMachineFunction();
17991 bool SplitStack = MF.shouldSplitStack();
17992 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
17997 SDNode *Node = Op.getNode();
17998 SDValue Chain = Op.getOperand(0);
17999 SDValue Size = Op.getOperand(1);
18000 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18001 EVT VT = Node->getValueType(0);
18003 // Chain the dynamic stack allocation so that it doesn't modify the stack
18004 // pointer when other instructions are using the stack.
18005 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18007 bool Is64Bit = Subtarget.is64Bit();
18008 MVT SPTy = getPointerTy(DAG.getDataLayout());
18012 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18013 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18014 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18015 " not tell us which reg is the stack pointer!");
// Direct adjustment: read SP, subtract Size, and if the requested alignment
// exceeds the target stack alignment mask the result with -Align (assumes
// Align is a power of two -- TODO confirm). The result is written back to SP.
18017 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18018 Chain = SP.getValue(1);
18019 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18020 unsigned StackAlign = TFI.getStackAlignment();
18021 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18022 if (Align > StackAlign)
18023 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18024 DAG.getConstant(-(uint64_t)Align, dl, VT));
18025 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18026 } else if (SplitStack) {
18027 MachineRegisterInfo &MRI = MF.getRegInfo();
18030 // The 64 bit implementation of segmented stacks needs to clobber both r10
18031 // r11. This makes it impossible to use it along with nested parameters.
18032 const Function *F = MF.getFunction();
18033 for (const auto &A : F->args()) {
18034 if (A.hasNestAttr())
18035 report_fatal_error("Cannot use segmented stacks with functions that "
18036 "have nested arguments.");
// Copy Size into a fresh virtual register of the pointer register class and
// pass that register to SEG_ALLOCA, whose result is the allocated address.
18040 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18041 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18042 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18043 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18044 DAG.getRegister(Vreg, SPTy));
// WIN_ALLOCA path: emit the node, record on the function info that it is
// used, then read the stack register back afterwards.
18046 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18047 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18048 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18050 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18051 unsigned SPReg = RegInfo->getStackRegister();
18052 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18053 Chain = SP.getValue(1);
18056 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18057 DAG.getConstant(-(uint64_t)Align, dl, VT));
18058 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18064 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18065 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18067 SDValue Ops[2] = {Result, Chain};
18068 return DAG.getMergeValues(Ops, dl);
// Lower va_start. Operand 0 of Op is the chain, operand 1 the address of the
// va_list object and operand 2 its source value. On non-64-bit targets and
// for the Win64 calling convention a single pointer is stored. Otherwise four
// fields are stored, each on the incoming chain, and the stores are joined
// with a TokenFactor.
18071 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18072 MachineFunction &MF = DAG.getMachineFunction();
18073 auto PtrVT = getPointerTy(MF.getDataLayout());
18074 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18076 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18079 if (!Subtarget.is64Bit() ||
18080 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18081 // vastart just stores the address of the VarArgsFrameIndex slot into the
18082 // memory location argument.
18083 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18084 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18085 MachinePointerInfo(SV));
18089 // gp_offset (0 - 6 * 8)
18090 // fp_offset (48 - 48 + 8 * 16)
18091 // overflow_arg_area (point to parameters coming in memory).
18093 SmallVector<SDValue, 8> MemOps;
18094 SDValue FIN = Op.getOperand(1);
// gp_offset: i32 stored at offset 0 of the va_list.
18096 SDValue Store = DAG.getStore(
18097 Op.getOperand(0), DL,
18098 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18099 MachinePointerInfo(SV));
18100 MemOps.push_back(Store);
// fp_offset: i32 stored at offset 4.
18103 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18104 Store = DAG.getStore(
18105 Op.getOperand(0), DL,
18106 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18107 MachinePointerInfo(SV, 4));
18108 MemOps.push_back(Store);
18110 // Store ptr to overflow_arg_area (offset 8).
18111 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18112 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18114 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18115 MemOps.push_back(Store);
18117 // Store ptr to reg_save_area. The previous field is 8 bytes wide on LP64
// targets and 4 bytes otherwise, so this field sits at offset 16 or 12.
18118 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18119 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18120 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18121 Store = DAG.getStore(
18122 Op.getOperand(0), DL, RSFIN, FIN,
18123 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18124 MemOps.push_back(Store);
18125 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
// Lowers va_arg for 64-bit subtargets (asserted below).
//   Op.getOperand(0) - incoming chain.
//   Op.getOperand(1) - pointer to the va_list (SrcPtr).
//   Op.getOperand(2) - SrcValueSDNode used for the MachinePointerInfo.
//   Op.getOperand(3) - constant alignment operand.
// Win64 calling conventions are handed to DAG.expandVAArg. Otherwise an
// X86ISD::VAARG_64 node is built that yields the argument's address, and the
// function returns a load of the requested type from that address.
// NOTE(review): `dl` and `ArgMode` are used below but their declarations are
// not among the lines visible in this listing -- confirm they exist.
18128 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18129 assert(Subtarget.is64Bit() &&
18130 "LowerVAARG only handles 64-bit va_arg!");
18131 assert(Op.getNumOperands() == 4);
18133 MachineFunction &MF = DAG.getMachineFunction();
18134 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18135 // The Win64 ABI uses char* instead of a structure.
18136 return DAG.expandVAArg(Op.getNode());
18138 SDValue Chain = Op.getOperand(0);
18139 SDValue SrcPtr = Op.getOperand(1);
18140 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18141 unsigned Align = Op.getConstantOperandVal(3);
// The argument's type is the node's first result type; its size is the
// DataLayout alloc size of the corresponding IR type.
18144 EVT ArgVT = Op.getNode()->getValueType(0);
18145 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18146 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18149 // Decide which area this value should be read from.
18150 // TODO: Implement the AMD64 ABI in its entirety. This simple
18151 // selection mechanism works only for the basic types.
// ArgMode encoding used here: 1 = integer (gp_offset), 2 = floating point
// (fp_offset). f80 and any type that matches neither test are unreachable.
18152 if (ArgVT == MVT::f80) {
18153 llvm_unreachable("va_arg for f80 not yet implemented");
18154 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18155 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18156 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18157 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18159 llvm_unreachable("Unhandled argument type in LowerVAARG");
18162 if (ArgMode == 2) {
18163 // Sanity Check: Make sure using fp_offset makes sense.
18164 assert(!Subtarget.useSoftFloat() &&
18165 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18166 Subtarget.hasSSE1());
18169 // Insert VAARG_64 node into the DAG
18170 // VAARG_64 returns two values: Variable Argument Address, Chain
// Operands handed to the node: chain, va_list pointer, argument size (i32),
// argument mode (i8) and alignment (i32).
18171 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18172 DAG.getConstant(ArgMode, dl, MVT::i8),
18173 DAG.getConstant(Align, dl, MVT::i32)};
18174 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18175 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18176 VTs, InstOps, MVT::i64,
18177 MachinePointerInfo(SV),
18179 /*Volatile=*/false,
18181 /*WriteMem=*/true);
// Result 1 of the node is the updated chain; result 0 is the address to load.
18182 Chain = VAARG.getValue(1);
18184 // Load the next argument and return it
18185 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18188 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18189 SelectionDAG &DAG) {
18190 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18191 // where a va_list is still an i8*.
18192 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18193 if (Subtarget.isCallingConvWin64(
18194 DAG.getMachineFunction().getFunction()->getCallingConv()))
18195 // Probably a Win64 va_copy.
18196 return DAG.expandVACopy(Op.getNode());
18198 SDValue Chain = Op.getOperand(0);
18199 SDValue DstPtr = Op.getOperand(1);
18200 SDValue SrcPtr = Op.getOperand(2);
18201 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18202 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
18205 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18206 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18208 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18211 /// Handle vector element shifts where the shift amount is a constant.
18212 /// Takes immediate version of shift as input.
/// \p Opc is asserted to be one of X86ISD::VSHLI, VSRLI or VSRAI. When
/// \p SrcOp is a build vector of constants of type \p VT, the shift is folded
/// element by element into a new build vector; otherwise a node with opcode
/// \p Opc is created whose second operand is \p ShiftAmt as an i8 constant.
18213 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18214 SDValue SrcOp, uint64_t ShiftAmt,
18215 SelectionDAG &DAG) {
18216 MVT ElementType = VT.getVectorElementType();
18218 // Fold this packed shift into its first operand if ShiftAmt is 0.
// NOTE(review): the statement implementing the fold above is not among the
// lines visible in this listing.
18222 // Check for ShiftAmt >= element width
// An arithmetic right shift is clamped to (element width - 1). The zero
// constant returned below presumably applies to the other opcodes only (the
// `else` is not visible in this listing) -- confirm.
18223 if (ShiftAmt >= ElementType.getSizeInBits()) {
18224 if (Opc == X86ISD::VSRAI)
18225 ShiftAmt = ElementType.getSizeInBits() - 1;
18227 return DAG.getConstant(0, dl, VT);
18230 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18231 && "Unknown target vector shift-by-constant node");
18233 // Fold this packed vector shift into a build vector if SrcOp is a
18234 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
18235 if (VT == SrcOp.getSimpleValueType() &&
18236 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18237 SmallVector<SDValue, 8> Elts;
18238 unsigned NumElts = SrcOp->getNumOperands();
18239 ConstantSDNode *ND;
// The three cases below are identical except for the APInt operation applied
// to each constant element: shl, lshr or ashr. Undef elements are copied to
// the result unchanged.
18242 default: llvm_unreachable("Unknown opcode!");
18243 case X86ISD::VSHLI:
18244 for (unsigned i=0; i!=NumElts; ++i) {
18245 SDValue CurrentOp = SrcOp->getOperand(i);
18246 if (CurrentOp->isUndef()) {
18247 Elts.push_back(CurrentOp);
18250 ND = cast<ConstantSDNode>(CurrentOp);
18251 const APInt &C = ND->getAPIntValue();
18252 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18255 case X86ISD::VSRLI:
18256 for (unsigned i=0; i!=NumElts; ++i) {
18257 SDValue CurrentOp = SrcOp->getOperand(i);
18258 if (CurrentOp->isUndef()) {
18259 Elts.push_back(CurrentOp);
18262 ND = cast<ConstantSDNode>(CurrentOp);
18263 const APInt &C = ND->getAPIntValue();
18264 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18267 case X86ISD::VSRAI:
18268 for (unsigned i=0; i!=NumElts; ++i) {
18269 SDValue CurrentOp = SrcOp->getOperand(i);
18270 if (CurrentOp->isUndef()) {
18271 Elts.push_back(CurrentOp);
18274 ND = cast<ConstantSDNode>(CurrentOp);
18275 const APInt &C = ND->getAPIntValue();
18276 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18281 return DAG.getBuildVector(VT, dl, Elts);
// No folding possible: emit the shift node with an i8 immediate amount.
18284 return DAG.getNode(Opc, dl, VT, SrcOp,
18285 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18288 /// Handle vector element shifts where the shift amount may or may not be a
18289 /// constant. Takes immediate version of shift as input.
/// \p ShAmt must be an i32 or i64 scalar (asserted). A constant amount is
/// delegated to getTargetVShiftByConstNode. Otherwise \p Opc is mapped from
/// its immediate form (VSHLI/VSRLI/VSRAI) to the corresponding
/// VSHL/VSRL/VSRA opcode, the amount is placed in a 128-bit vector, and the
/// shift node is created with that vector as its second operand.
18290 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18291 SDValue SrcOp, SDValue ShAmt,
18292 SelectionDAG &DAG) {
18293 MVT SVT = ShAmt.getSimpleValueType();
18294 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18296 // Catch shift-by-constant.
18297 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18298 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18299 CShAmt->getZExtValue(), DAG);
18301 // Change opcode to non-immediate version
18303 default: llvm_unreachable("Unknown target vector shift node");
18304 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18305 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18306 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
// The subtarget is recovered from the DAG because it is not a parameter here.
18309 const X86Subtarget &Subtarget =
18310 static_cast<const X86Subtarget &>(DAG.getSubtarget());
// With SSE4.1, an amount that is a zero-extension of an i16 is moved into lane
// 0 of a v8i16 and passed to getShuffleVectorZeroOrUndef instead of being
// extended as a scalar first.
18311 if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18312 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18313 // Let the shuffle legalizer expand this shift amount node.
18314 SDValue Op0 = ShAmt.getOperand(0);
18315 Op0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(Op0), MVT::v8i16, Op0);
18316 ShAmt = getShuffleVectorZeroOrUndef(Op0, 0, true, Subtarget, DAG);
18318 // Need to build a vector containing shift amount.
18319 // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
// Element 0 is the amount. For an i32 amount a zero constant follows it, which
// keeps the low 64 bits of the vector well defined; the other elements pushed
// here are undef. The result is a v4i32 or a v2i64 build vector.
18320 SmallVector<SDValue, 4> ShOps;
18321 ShOps.push_back(ShAmt);
18322 if (SVT == MVT::i32) {
18323 ShOps.push_back(DAG.getConstant(0, dl, SVT));
18324 ShOps.push_back(DAG.getUNDEF(SVT));
18326 ShOps.push_back(DAG.getUNDEF(SVT));
18328 MVT BVT = SVT == MVT::i32 ? MVT::v4i32 : MVT::v2i64;
18329 ShAmt = DAG.getBuildVector(BVT, dl, ShOps);
18332 // The return type has to be a 128-bit type with the same element
18333 // type as the input type.
// "Return type" above refers to the type the shift-amount vector is bitcast
// to (ShVT); the node itself is created with result type VT.
18334 MVT EltVT = VT.getVectorElementType();
18335 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18337 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18338 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18341 /// \brief Return Mask with the necessary casting or extending
18342 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
/// Summary of the cases handled below:
///  - all-ones or zero \p Mask: a target constant 1 or 0 of type \p MaskVT;
///  - \p MaskVT wider than \p Mask: \p Mask is any-extended first;
///  - i64 \p Mask on a 32-bit subtarget: split into two halves (v64i1) or
///    truncated and bitcast (narrower \p MaskVT);
///  - otherwise: bitcast to a vector of i1 and extract the low \p MaskVT part.
/// NOTE(review): `dl` is used throughout; the end of the parameter list is not
/// among the lines visible in this listing -- presumably it is a trailing
/// `const SDLoc &` parameter, confirm.
18343 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18344 const X86Subtarget &Subtarget, SelectionDAG &DAG,
// Constant masks need no conversion at all.
18347 if (isAllOnesConstant(Mask))
18348 return DAG.getTargetConstant(1, dl, MaskVT);
18349 if (X86::isZeroNode(Mask))
18350 return DAG.getTargetConstant(0, dl, MaskVT);
18352 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18353 // Mask should be extended
// The extension target is the integer type with as many bits as MaskVT.
18354 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18355 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18358 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18359 if (MaskVT == MVT::v64i1) {
18360 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18361 // In case 32bit mode, bitcast i64 is illegal, extend/split it.
// Element 0 and element 1 of the i64 are extracted as i32 values, each is
// bitcast to v32i1, and the two halves are concatenated into a v64i1.
18363 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18364 DAG.getConstant(0, dl, MVT::i32));
18365 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18366 DAG.getConstant(1, dl, MVT::i32));
18368 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18369 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18371 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18373 // MaskVT require < 64bit. Truncate mask (should succeed in any case),
// then bitcast the truncated integer to MaskVT.
18375 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18376 return DAG.getBitcast(MaskVT,
18377 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
// General case: view the integer mask as a vector with one i1 per bit.
18381 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18382 Mask.getSimpleValueType().getSizeInBits());
18383 // In case when MaskVT equals v2i1 or v4i1, low 2 or 4 elements
18384 // are extracted by EXTRACT_SUBVECTOR.
18385 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18386 DAG.getBitcast(BitcastVT, Mask),
18387 DAG.getIntPtrConstant(0, dl));
18391 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18392 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18393 /// necessary casting or extending for \p Mask when lowering masking intrinsics
/// The mask is converted with getMaskNode to a vector of i1 that has one
/// element per element of \p Op. In addition to the two forms named above,
/// VFPCLASS / VFPCLASSS results are combined with the mask using OR, and the
/// truncate / CVTPS2PH opcodes are selected with X86ISD::SELECT rather than
/// ISD::VSELECT. An undef \p PreservedSrc is replaced by a zero vector.
/// NOTE(review): `dl` is used below but its declaration is not among the
/// lines visible in this listing -- confirm.
18394 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18395 SDValue PreservedSrc,
18396 const X86Subtarget &Subtarget,
18397 SelectionDAG &DAG) {
18398 MVT VT = Op.getSimpleValueType();
18399 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18400 unsigned OpcodeSelect = ISD::VSELECT;
// NOTE(review): the statement guarded by the all-ones test below is not
// visible in this listing; presumably Op is returned unmasked -- confirm.
18403 if (isAllOnesConstant(Mask))
18406 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18408 switch (Op.getOpcode()) {
// Compare opcodes: AND the result with the converted mask.
18410 case X86ISD::PCMPEQM:
18411 case X86ISD::PCMPGTM:
18413 case X86ISD::CMPMU:
18414 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
// FP-class opcodes: OR the result with the converted mask.
18415 case X86ISD::VFPCLASS:
18416 case X86ISD::VFPCLASSS:
18417 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18418 case X86ISD::VTRUNC:
18419 case X86ISD::VTRUNCS:
18420 case X86ISD::VTRUNCUS:
18421 case X86ISD::CVTPS2PH:
18422 // We can't use ISD::VSELECT here because it is not always "Legal"
18423 // for the destination type. For example vpmovqb require only AVX512
18424 // and vselect that can operate on byte element type require BWI
18425 OpcodeSelect = X86ISD::SELECT;
// Every opcode not returned above ends in a select between Op and the
// preserved source; an undef preserved source becomes a zero vector.
18428 if (PreservedSrc.isUndef())
18429 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18430 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
18433 /// \brief Creates an SDNode for a predicated scalar operation.
18434 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
18435 /// The mask is coming as MVT::i8 and it should be truncated
18436 /// to MVT::i1 while lowering masking intrinsics.
18437 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
18438 /// "X86select" instead of "vselect". We just can't create the "vselect" node
18439 /// for a scalar instruction.
/// Besides the select form described above (emitted as X86ISD::SELECTS), two
/// opcode groups are special-cased in the body: FSETCCM / FSETCCM_RND results
/// are ANDed with the truncated mask, and VFPCLASS / VFPCLASSS results are
/// ORed with it. An undef \p PreservedSrc is replaced by a zero vector.
/// NOTE(review): `dl` is used below but its declaration is not among the
/// lines visible in this listing -- confirm.
18440 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
18441 SDValue PreservedSrc,
18442 const X86Subtarget &Subtarget,
18443 SelectionDAG &DAG) {
// NOTE(review): the statement guarded by the all-ones test below is not
// visible in this listing; presumably Op is returned unmasked -- confirm.
18444 if (isAllOnesConstant(Mask))
18447 MVT VT = Op.getSimpleValueType();
18449 // The mask should be of type MVT::i1
18450 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
// Scalar compares: AND the result with the i1 mask.
18452 if (Op.getOpcode() == X86ISD::FSETCCM ||
18453 Op.getOpcode() == X86ISD::FSETCCM_RND)
18454 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
// FP-class tests: OR the result with the i1 mask.
18455 if (Op.getOpcode() == X86ISD::VFPCLASS ||
18456 Op.getOpcode() == X86ISD::VFPCLASSS)
18457 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
// Everything else: select between Op and the preserved source.
18459 if (PreservedSrc.isUndef())
18460 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18461 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
// Returns the size in bytes of the EH registration node for \p Fn, chosen by
// classifying its personality function: 24 for MSVC_X86SEH, 16 for MSVC_CXX.
// Reports a fatal error when \p Fn has no personality function, and again
// (with a different message) for a personality that matches neither case.
18464 static int getSEHRegistrationNodeSize(const Function *Fn) {
18465 if (!Fn->hasPersonalityFn())
18466 report_fatal_error(
18467 "querying registration node size for function without personality");
18468 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
18469 // WinEHStatePass for the full struct definition.
18470 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
18471 case EHPersonality::MSVC_X86SEH: return 24;
18472 case EHPersonality::MSVC_CXX: return 16;
// Reached only when neither case above returned.
18475 report_fatal_error(
18476 "can only recover FP for 32-bit MSVC EH personality functions");
18479 /// When the MSVC runtime transfers control to us, either to an outlined
18480 /// function or when returning to a parent frame after catching an exception, we
18481 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
18482 /// Here's the math:
18483 /// RegNodeBase = EntryEBP - RegNodeSize
18484 /// ParentFP = RegNodeBase - ParentFrameOffset
18485 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
18486 /// subtracting the offset (negative on x86) takes us back to the parent FP.
/// The math above is the non-64-bit path. On 64-bit subtargets the body
/// instead returns EntryEBP + ParentFrameOffset.
/// \p Fn is the function whose frame is being recovered; its name selects the
/// parent-frame-offset symbol and its personality selects RegNodeSize.
/// NOTE(review): `dl` is used below but its declaration is not among the
/// lines visible in this listing -- confirm.
18487 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
18488 SDValue EntryEBP) {
18489 MachineFunction &MF = DAG.getMachineFunction();
18492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18493 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18495 // It's possible that the parent function no longer has a personality function
18496 // if the exceptional code was optimized away, in which case we just return
18497 // the incoming EBP.
// NOTE(review): the statement guarded by the test below is not visible in
// this listing; per the comment above it presumably returns EntryEBP.
18498 if (!Fn->hasPersonalityFn())
18501 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
18502 // registration, or the .set_setframe offset.
18503 MCSymbol *OffsetSym =
18504 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
18505 GlobalValue::getRealLinkageName(Fn->getName()));
// The symbol is wrapped in a LOCAL_RECOVER node so that the offset can take
// part in the pointer arithmetic below.
18506 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
18507 SDValue ParentFrameOffset =
18508 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
18510 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
18511 // prologue to RBP in the parent function.
18512 const X86Subtarget &Subtarget =
18513 static_cast<const X86Subtarget &>(DAG.getSubtarget());
18514 if (Subtarget.is64Bit())
18515 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
// Non-64-bit path: the registration node size depends on the personality.
18517 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
18518 // RegNodeBase = EntryEBP - RegNodeSize
18519 // ParentFP = RegNodeBase - ParentFrameOffset
18520 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
18521 DAG.getConstant(RegNodeSize, dl, PtrVT));
18522 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
18525 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18526 SelectionDAG &DAG) {
18527 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
18528 auto isRoundModeCurDirection = [](SDValue Rnd) {
18529 if (!isa<ConstantSDNode>(Rnd))
18532 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
18533 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
18537 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18538 MVT VT = Op.getSimpleValueType();
18539 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
18541 switch(IntrData->Type) {
18542 case INTR_TYPE_1OP:
18543 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
18544 case INTR_TYPE_2OP:
18545 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18547 case INTR_TYPE_3OP:
18548 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18549 Op.getOperand(2), Op.getOperand(3));
18550 case INTR_TYPE_4OP:
18551 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18552 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
18553 case INTR_TYPE_1OP_MASK_RM: {
18554 SDValue Src = Op.getOperand(1);
18555 SDValue PassThru = Op.getOperand(2);
18556 SDValue Mask = Op.getOperand(3);
18557 SDValue RoundingMode;
18558 // We always add rounding mode to the Node.
18559 // If the rounding mode is not specified, we add the
18560 // "current direction" mode.
18561 if (Op.getNumOperands() == 4)
18563 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18565 RoundingMode = Op.getOperand(4);
18566 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
18567 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18569 Mask, PassThru, Subtarget, DAG);
18571 case INTR_TYPE_1OP_MASK: {
18572 SDValue Src = Op.getOperand(1);
18573 SDValue PassThru = Op.getOperand(2);
18574 SDValue Mask = Op.getOperand(3);
18575 // We add rounding mode to the Node when
18576 // - RM Opcode is specified and
18577 // - RM is not "current direction".
18578 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18579 if (IntrWithRoundingModeOpcode != 0) {
18580 SDValue Rnd = Op.getOperand(4);
18581 if (!isRoundModeCurDirection(Rnd)) {
18582 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18583 dl, Op.getValueType(),
18585 Mask, PassThru, Subtarget, DAG);
18588 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
18589 Mask, PassThru, Subtarget, DAG);
18591 case INTR_TYPE_SCALAR_MASK: {
18592 SDValue Src1 = Op.getOperand(1);
18593 SDValue Src2 = Op.getOperand(2);
18594 SDValue passThru = Op.getOperand(3);
18595 SDValue Mask = Op.getOperand(4);
18596 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
18597 Mask, passThru, Subtarget, DAG);
18599 case INTR_TYPE_SCALAR_MASK_RM: {
18600 SDValue Src1 = Op.getOperand(1);
18601 SDValue Src2 = Op.getOperand(2);
18602 SDValue Src0 = Op.getOperand(3);
18603 SDValue Mask = Op.getOperand(4);
18604 // There are 2 kinds of intrinsics in this group:
18605 // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands
18606 // (2) With rounding mode and sae - 7 operands.
18607 if (Op.getNumOperands() == 6) {
18608 SDValue Sae = Op.getOperand(5);
18609 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18611 Mask, Src0, Subtarget, DAG);
18613 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
18614 SDValue RoundingMode = Op.getOperand(5);
18615 SDValue Sae = Op.getOperand(6);
18616 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18617 RoundingMode, Sae),
18618 Mask, Src0, Subtarget, DAG);
18620 case INTR_TYPE_2OP_MASK:
18621 case INTR_TYPE_2OP_IMM8_MASK: {
18622 SDValue Src1 = Op.getOperand(1);
18623 SDValue Src2 = Op.getOperand(2);
18624 SDValue PassThru = Op.getOperand(3);
18625 SDValue Mask = Op.getOperand(4);
18627 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
18628 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
18630 // We specify 2 possible opcodes for intrinsics with rounding modes.
18631 // First, we check if the intrinsic may have non-default rounding mode,
18632 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18633 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18634 if (IntrWithRoundingModeOpcode != 0) {
18635 SDValue Rnd = Op.getOperand(5);
18636 if (!isRoundModeCurDirection(Rnd)) {
18637 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18638 dl, Op.getValueType(),
18640 Mask, PassThru, Subtarget, DAG);
18643 // TODO: Intrinsics should have fast-math-flags to propagate.
18644 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
18645 Mask, PassThru, Subtarget, DAG);
18647 case INTR_TYPE_2OP_MASK_RM: {
18648 SDValue Src1 = Op.getOperand(1);
18649 SDValue Src2 = Op.getOperand(2);
18650 SDValue PassThru = Op.getOperand(3);
18651 SDValue Mask = Op.getOperand(4);
18652 // We specify 2 possible modes for intrinsics, with/without rounding
18654 // First, we check if the intrinsic have rounding mode (6 operands),
18655 // if not, we set rounding mode to "current".
18657 if (Op.getNumOperands() == 6)
18658 Rnd = Op.getOperand(5);
18660 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18661 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18663 Mask, PassThru, Subtarget, DAG);
18665 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
18666 SDValue Src1 = Op.getOperand(1);
18667 SDValue Src2 = Op.getOperand(2);
18668 SDValue Src3 = Op.getOperand(3);
18669 SDValue PassThru = Op.getOperand(4);
18670 SDValue Mask = Op.getOperand(5);
18671 SDValue Sae = Op.getOperand(6);
18673 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
18675 Mask, PassThru, Subtarget, DAG);
18677 case INTR_TYPE_3OP_MASK_RM: {
18678 SDValue Src1 = Op.getOperand(1);
18679 SDValue Src2 = Op.getOperand(2);
18680 SDValue Imm = Op.getOperand(3);
18681 SDValue PassThru = Op.getOperand(4);
18682 SDValue Mask = Op.getOperand(5);
18683 // We specify 2 possible modes for intrinsics, with/without rounding
18685 // First, we check if the intrinsic have rounding mode (7 operands),
18686 // if not, we set rounding mode to "current".
18688 if (Op.getNumOperands() == 7)
18689 Rnd = Op.getOperand(6);
18691 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18692 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18693 Src1, Src2, Imm, Rnd),
18694 Mask, PassThru, Subtarget, DAG);
18696 case INTR_TYPE_3OP_IMM8_MASK:
18697 case INTR_TYPE_3OP_MASK: {
18698 SDValue Src1 = Op.getOperand(1);
18699 SDValue Src2 = Op.getOperand(2);
18700 SDValue Src3 = Op.getOperand(3);
18701 SDValue PassThru = Op.getOperand(4);
18702 SDValue Mask = Op.getOperand(5);
18704 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
18705 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
18707 // We specify 2 possible opcodes for intrinsics with rounding modes.
18708 // First, we check if the intrinsic may have non-default rounding mode,
18709 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18710 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18711 if (IntrWithRoundingModeOpcode != 0) {
18712 SDValue Rnd = Op.getOperand(6);
18713 if (!isRoundModeCurDirection(Rnd)) {
18714 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18715 dl, Op.getValueType(),
18716 Src1, Src2, Src3, Rnd),
18717 Mask, PassThru, Subtarget, DAG);
18720 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18722 Mask, PassThru, Subtarget, DAG);
18724 case VPERM_2OP_MASK : {
18725 SDValue Src1 = Op.getOperand(1);
18726 SDValue Src2 = Op.getOperand(2);
18727 SDValue PassThru = Op.getOperand(3);
18728 SDValue Mask = Op.getOperand(4);
18730 // Swap Src1 and Src2 in the node creation
18731 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
18732 Mask, PassThru, Subtarget, DAG);
18734 case VPERM_3OP_MASKZ:
18735 case VPERM_3OP_MASK:{
18736 MVT VT = Op.getSimpleValueType();
18737 // Src2 is the PassThru
18738 SDValue Src1 = Op.getOperand(1);
18739 // PassThru needs to be the same type as the destination in order
18740 // to pattern match correctly.
18741 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
18742 SDValue Src3 = Op.getOperand(3);
18743 SDValue Mask = Op.getOperand(4);
18744 SDValue PassThru = SDValue();
18746 // set PassThru element
18747 if (IntrData->Type == VPERM_3OP_MASKZ)
18748 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18752 // Swap Src1 and Src2 in the node creation
18753 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18754 dl, Op.getValueType(),
18756 Mask, PassThru, Subtarget, DAG);
18760 case FMA_OP_MASK: {
18761 SDValue Src1 = Op.getOperand(1);
18762 SDValue Src2 = Op.getOperand(2);
18763 SDValue Src3 = Op.getOperand(3);
18764 SDValue Mask = Op.getOperand(4);
18765 MVT VT = Op.getSimpleValueType();
18766 SDValue PassThru = SDValue();
18768 // set PassThru element
18769 if (IntrData->Type == FMA_OP_MASKZ)
18770 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18771 else if (IntrData->Type == FMA_OP_MASK3)
18776 // We specify 2 possible opcodes for intrinsics with rounding modes.
18777 // First, we check if the intrinsic may have non-default rounding mode,
18778 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18779 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18780 if (IntrWithRoundingModeOpcode != 0) {
18781 SDValue Rnd = Op.getOperand(5);
18782 if (!isRoundModeCurDirection(Rnd))
18783 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18784 dl, Op.getValueType(),
18785 Src1, Src2, Src3, Rnd),
18786 Mask, PassThru, Subtarget, DAG);
18788 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
18789 dl, Op.getValueType(),
18791 Mask, PassThru, Subtarget, DAG);
18793 case FMA_OP_SCALAR_MASK:
18794 case FMA_OP_SCALAR_MASK3:
18795 case FMA_OP_SCALAR_MASKZ: {
18796 SDValue Src1 = Op.getOperand(1);
18797 SDValue Src2 = Op.getOperand(2);
18798 SDValue Src3 = Op.getOperand(3);
18799 SDValue Mask = Op.getOperand(4);
18800 MVT VT = Op.getSimpleValueType();
18801 SDValue PassThru = SDValue();
18803 // set PassThru element
18804 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
18805 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18806 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
18811 SDValue Rnd = Op.getOperand(5);
18812 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
18813 Op.getValueType(), Src1, Src2,
18815 Mask, PassThru, Subtarget, DAG);
18817 case TERLOG_OP_MASK:
18818 case TERLOG_OP_MASKZ: {
18819 SDValue Src1 = Op.getOperand(1);
18820 SDValue Src2 = Op.getOperand(2);
18821 SDValue Src3 = Op.getOperand(3);
18822 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
18823 SDValue Mask = Op.getOperand(5);
18824 MVT VT = Op.getSimpleValueType();
18825 SDValue PassThru = Src1;
18826 // Set PassThru element.
18827 if (IntrData->Type == TERLOG_OP_MASKZ)
18828 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
18830 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18831 Src1, Src2, Src3, Src4),
18832 Mask, PassThru, Subtarget, DAG);
18835 // ISD::FP_ROUND has a second argument that indicates if the truncation
18836 // does not change the value. Set it to 0 since it can change.
18837 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
18838 DAG.getIntPtrConstant(0, dl));
18839 case CVTPD2PS_MASK: {
18840 SDValue Src = Op.getOperand(1);
18841 SDValue PassThru = Op.getOperand(2);
18842 SDValue Mask = Op.getOperand(3);
18843 // We add rounding mode to the Node when
18844 // - RM Opcode is specified and
18845 // - RM is not "current direction".
18846 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18847 if (IntrWithRoundingModeOpcode != 0) {
18848 SDValue Rnd = Op.getOperand(4);
18849 if (!isRoundModeCurDirection(Rnd)) {
18850 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18851 dl, Op.getValueType(),
18853 Mask, PassThru, Subtarget, DAG);
18856 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
18857 // ISD::FP_ROUND has a second argument that indicates if the truncation
18858 // does not change the value. Set it to 0 since it can change.
18859 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18860 DAG.getIntPtrConstant(0, dl)),
18861 Mask, PassThru, Subtarget, DAG);
18864 // FPclass intrinsics with mask
18865 SDValue Src1 = Op.getOperand(1);
18866 MVT VT = Src1.getSimpleValueType();
18867 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18868 SDValue Imm = Op.getOperand(2);
18869 SDValue Mask = Op.getOperand(3);
18870 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18871 Mask.getSimpleValueType().getSizeInBits());
18872 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
18873 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
18874 DAG.getTargetConstant(0, dl, MaskVT),
18876 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
18877 DAG.getUNDEF(BitcastVT), FPclassMask,
18878 DAG.getIntPtrConstant(0, dl));
18879 return DAG.getBitcast(Op.getValueType(), Res);
18882 SDValue Src1 = Op.getOperand(1);
18883 SDValue Imm = Op.getOperand(2);
18884 SDValue Mask = Op.getOperand(3);
18885 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
18886 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
18887 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
18888 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
18891 case CMP_MASK_CC: {
18892 // Comparison intrinsics with masks.
18893 // Example of transformation:
18894 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
18895 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
18897 // (v8i1 (insert_subvector undef,
18898 // (v2i1 (and (PCMPEQM %a, %b),
18899 // (extract_subvector
18900 // (v8i1 (bitcast %mask)), 0))), 0))))
18901 MVT VT = Op.getOperand(1).getSimpleValueType();
18902 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18903 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
18904 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18905 Mask.getSimpleValueType().getSizeInBits());
18907 if (IntrData->Type == CMP_MASK_CC) {
18908 SDValue CC = Op.getOperand(3);
18909 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
18910 // We specify 2 possible opcodes for intrinsics with rounding modes.
18911 // First, we check if the intrinsic may have non-default rounding mode,
18912 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18913 if (IntrData->Opc1 != 0) {
18914 SDValue Rnd = Op.getOperand(5);
18915 if (!isRoundModeCurDirection(Rnd))
18916 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
18917 Op.getOperand(2), CC, Rnd);
18919 //default rounding mode
18921 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
18922 Op.getOperand(2), CC);
18925 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
18926 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
18929 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
18930 DAG.getTargetConstant(0, dl,
18933 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
18934 DAG.getUNDEF(BitcastVT), CmpMask,
18935 DAG.getIntPtrConstant(0, dl));
18936 return DAG.getBitcast(Op.getValueType(), Res);
18938 case CMP_MASK_SCALAR_CC: {
18939 SDValue Src1 = Op.getOperand(1);
18940 SDValue Src2 = Op.getOperand(2);
18941 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
18942 SDValue Mask = Op.getOperand(4);
18945 if (IntrData->Opc1 != 0) {
18946 SDValue Rnd = Op.getOperand(5);
18947 if (!isRoundModeCurDirection(Rnd))
18948 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
18950 //default rounding mode
18952 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
18954 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
18955 DAG.getTargetConstant(0, dl,
18959 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
18961 case COMI: { // Comparison intrinsics
18962 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
18963 SDValue LHS = Op.getOperand(1);
18964 SDValue RHS = Op.getOperand(2);
18965 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
18966 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
18969 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
18970 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
18971 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
18972 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
18975 case ISD::SETNE: { // (ZF = 1 or PF = 1)
18976 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
18977 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
18978 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
18981 case ISD::SETGT: // (CF = 0 and ZF = 0)
18982 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
18984 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
18985 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
18988 case ISD::SETGE: // CF = 0
18989 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
18991 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
18992 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
18995 llvm_unreachable("Unexpected illegal condition!");
18997 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
18999 case COMI_RM: { // Comparison intrinsics with Sae
19000 SDValue LHS = Op.getOperand(1);
19001 SDValue RHS = Op.getOperand(2);
19002 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19003 SDValue Sae = Op.getOperand(4);
19006 if (isRoundModeCurDirection(Sae))
19007 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19008 DAG.getConstant(CondVal, dl, MVT::i8));
19010 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19011 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19012 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19013 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19016 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19017 Op.getOperand(1), Op.getOperand(2), DAG);
19018 case COMPRESS_EXPAND_IN_REG: {
19019 SDValue Mask = Op.getOperand(3);
19020 SDValue DataToCompress = Op.getOperand(1);
19021 SDValue PassThru = Op.getOperand(2);
19022 if (isAllOnesConstant(Mask)) // return data as is
19023 return Op.getOperand(1);
19025 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19027 Mask, PassThru, Subtarget, DAG);
19030 SDValue Mask = Op.getOperand(1);
19031 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19032 Mask.getSimpleValueType().getSizeInBits());
19033 Mask = DAG.getBitcast(MaskVT, Mask);
19034 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19037 MVT VT = Op.getSimpleValueType();
19038 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19040 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19041 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19042 // Arguments should be swapped.
19043 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19044 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19046 return DAG.getBitcast(VT, Res);
19049 case FIXUPIMMS_MASKZ:
19051 case FIXUPIMM_MASKZ:{
19052 SDValue Src1 = Op.getOperand(1);
19053 SDValue Src2 = Op.getOperand(2);
19054 SDValue Src3 = Op.getOperand(3);
19055 SDValue Imm = Op.getOperand(4);
19056 SDValue Mask = Op.getOperand(5);
19057 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19058 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19059 // We specify 2 possible modes for intrinsics, with/without rounding
19061 // First, we check if the intrinsic has a rounding mode operand (7 operands);
19062 // if not, we set rounding mode to "current".
19064 if (Op.getNumOperands() == 7)
19065 Rnd = Op.getOperand(6);
19067 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19068 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19069 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19070 Src1, Src2, Src3, Imm, Rnd),
19071 Mask, Passthru, Subtarget, DAG);
19072 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19073 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19074 Src1, Src2, Src3, Imm, Rnd),
19075 Mask, Passthru, Subtarget, DAG);
19077 case CONVERT_TO_MASK: {
19078 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19079 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19080 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19082 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19084 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19085 DAG.getUNDEF(BitcastVT), CvtMask,
19086 DAG.getIntPtrConstant(0, dl));
19087 return DAG.getBitcast(Op.getValueType(), Res);
19089 case CONVERT_MASK_TO_VEC: {
19090 SDValue Mask = Op.getOperand(1);
19091 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19092 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19093 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19095 case BRCST_SUBVEC_TO_VEC: {
19096 SDValue Src = Op.getOperand(1);
19097 SDValue Passthru = Op.getOperand(2);
19098 SDValue Mask = Op.getOperand(3);
19099 EVT resVT = Passthru.getValueType();
19100 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19101 DAG.getUNDEF(resVT), Src,
19102 DAG.getIntPtrConstant(0, dl));
19104 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19105 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19107 immVal = DAG.getConstant(0, dl, MVT::i8);
19108 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19109 subVec, subVec, immVal),
19110 Mask, Passthru, Subtarget, DAG);
19112 case BRCST32x2_TO_VEC: {
19113 SDValue Src = Op.getOperand(1);
19114 SDValue PassThru = Op.getOperand(2);
19115 SDValue Mask = Op.getOperand(3);
19117 assert((VT.getScalarType() == MVT::i32 ||
19118 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19119 //bitcast Src to packed 64
19120 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19121 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19122 Src = DAG.getBitcast(BitcastVT, Src);
19124 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19125 Mask, PassThru, Subtarget, DAG);
19133 default: return SDValue(); // Don't custom lower most intrinsics.
19135 case Intrinsic::x86_avx2_permd:
19136 case Intrinsic::x86_avx2_permps:
19137 // Operands intentionally swapped. Mask is last operand to intrinsic,
19138 // but second operand for node/instruction.
19139 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19140 Op.getOperand(2), Op.getOperand(1));
19142 // ptest and testp intrinsics. The intrinsics these come from are designed to
19143 // return an integer value, not just an instruction, so lower them to the ptest
19144 // or testp pattern and a setcc for the result.
19145 case Intrinsic::x86_sse41_ptestz:
19146 case Intrinsic::x86_sse41_ptestc:
19147 case Intrinsic::x86_sse41_ptestnzc:
19148 case Intrinsic::x86_avx_ptestz_256:
19149 case Intrinsic::x86_avx_ptestc_256:
19150 case Intrinsic::x86_avx_ptestnzc_256:
19151 case Intrinsic::x86_avx_vtestz_ps:
19152 case Intrinsic::x86_avx_vtestc_ps:
19153 case Intrinsic::x86_avx_vtestnzc_ps:
19154 case Intrinsic::x86_avx_vtestz_pd:
19155 case Intrinsic::x86_avx_vtestc_pd:
19156 case Intrinsic::x86_avx_vtestnzc_pd:
19157 case Intrinsic::x86_avx_vtestz_ps_256:
19158 case Intrinsic::x86_avx_vtestc_ps_256:
19159 case Intrinsic::x86_avx_vtestnzc_ps_256:
19160 case Intrinsic::x86_avx_vtestz_pd_256:
19161 case Intrinsic::x86_avx_vtestc_pd_256:
19162 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19163 bool IsTestPacked = false;
19164 X86::CondCode X86CC;
19166 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19167 case Intrinsic::x86_avx_vtestz_ps:
19168 case Intrinsic::x86_avx_vtestz_pd:
19169 case Intrinsic::x86_avx_vtestz_ps_256:
19170 case Intrinsic::x86_avx_vtestz_pd_256:
19171 IsTestPacked = true;
19173 case Intrinsic::x86_sse41_ptestz:
19174 case Intrinsic::x86_avx_ptestz_256:
19176 X86CC = X86::COND_E;
19178 case Intrinsic::x86_avx_vtestc_ps:
19179 case Intrinsic::x86_avx_vtestc_pd:
19180 case Intrinsic::x86_avx_vtestc_ps_256:
19181 case Intrinsic::x86_avx_vtestc_pd_256:
19182 IsTestPacked = true;
19184 case Intrinsic::x86_sse41_ptestc:
19185 case Intrinsic::x86_avx_ptestc_256:
19187 X86CC = X86::COND_B;
19189 case Intrinsic::x86_avx_vtestnzc_ps:
19190 case Intrinsic::x86_avx_vtestnzc_pd:
19191 case Intrinsic::x86_avx_vtestnzc_ps_256:
19192 case Intrinsic::x86_avx_vtestnzc_pd_256:
19193 IsTestPacked = true;
19195 case Intrinsic::x86_sse41_ptestnzc:
19196 case Intrinsic::x86_avx_ptestnzc_256:
19198 X86CC = X86::COND_A;
19202 SDValue LHS = Op.getOperand(1);
19203 SDValue RHS = Op.getOperand(2);
19204 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19205 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19206 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19207 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19209 case Intrinsic::x86_avx512_kortestz_w:
19210 case Intrinsic::x86_avx512_kortestc_w: {
19211 X86::CondCode X86CC =
19212 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19213 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19214 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19215 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19216 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19217 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19220 case Intrinsic::x86_sse42_pcmpistria128:
19221 case Intrinsic::x86_sse42_pcmpestria128:
19222 case Intrinsic::x86_sse42_pcmpistric128:
19223 case Intrinsic::x86_sse42_pcmpestric128:
19224 case Intrinsic::x86_sse42_pcmpistrio128:
19225 case Intrinsic::x86_sse42_pcmpestrio128:
19226 case Intrinsic::x86_sse42_pcmpistris128:
19227 case Intrinsic::x86_sse42_pcmpestris128:
19228 case Intrinsic::x86_sse42_pcmpistriz128:
19229 case Intrinsic::x86_sse42_pcmpestriz128: {
19231 X86::CondCode X86CC;
19233 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19234 case Intrinsic::x86_sse42_pcmpistria128:
19235 Opcode = X86ISD::PCMPISTRI;
19236 X86CC = X86::COND_A;
19238 case Intrinsic::x86_sse42_pcmpestria128:
19239 Opcode = X86ISD::PCMPESTRI;
19240 X86CC = X86::COND_A;
19242 case Intrinsic::x86_sse42_pcmpistric128:
19243 Opcode = X86ISD::PCMPISTRI;
19244 X86CC = X86::COND_B;
19246 case Intrinsic::x86_sse42_pcmpestric128:
19247 Opcode = X86ISD::PCMPESTRI;
19248 X86CC = X86::COND_B;
19250 case Intrinsic::x86_sse42_pcmpistrio128:
19251 Opcode = X86ISD::PCMPISTRI;
19252 X86CC = X86::COND_O;
19254 case Intrinsic::x86_sse42_pcmpestrio128:
19255 Opcode = X86ISD::PCMPESTRI;
19256 X86CC = X86::COND_O;
19258 case Intrinsic::x86_sse42_pcmpistris128:
19259 Opcode = X86ISD::PCMPISTRI;
19260 X86CC = X86::COND_S;
19262 case Intrinsic::x86_sse42_pcmpestris128:
19263 Opcode = X86ISD::PCMPESTRI;
19264 X86CC = X86::COND_S;
19266 case Intrinsic::x86_sse42_pcmpistriz128:
19267 Opcode = X86ISD::PCMPISTRI;
19268 X86CC = X86::COND_E;
19270 case Intrinsic::x86_sse42_pcmpestriz128:
19271 Opcode = X86ISD::PCMPESTRI;
19272 X86CC = X86::COND_E;
19275 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19276 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19277 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19278 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19279 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19282 case Intrinsic::x86_sse42_pcmpistri128:
19283 case Intrinsic::x86_sse42_pcmpestri128: {
19285 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19286 Opcode = X86ISD::PCMPISTRI;
19288 Opcode = X86ISD::PCMPESTRI;
19290 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19291 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19292 return DAG.getNode(Opcode, dl, VTs, NewOps);
19295 case Intrinsic::eh_sjlj_lsda: {
19296 MachineFunction &MF = DAG.getMachineFunction();
19297 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19298 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19299 auto &Context = MF.getMMI().getContext();
19300 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19301 Twine(MF.getFunctionNumber()));
19302 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19305 case Intrinsic::x86_seh_lsda: {
19306 // Compute the symbol for the LSDA. We know it'll get emitted later.
19307 MachineFunction &MF = DAG.getMachineFunction();
19308 SDValue Op1 = Op.getOperand(1);
19309 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19310 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19311 GlobalValue::getRealLinkageName(Fn->getName()));
19313 // Generate a simple absolute symbol reference. This intrinsic is only
19314 // supported on 32-bit Windows, which isn't PIC.
19315 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19316 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19319 case Intrinsic::x86_seh_recoverfp: {
19320 SDValue FnOp = Op.getOperand(1);
19321 SDValue IncomingFPOp = Op.getOperand(2);
19322 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19323 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19325 report_fatal_error(
19326 "llvm.x86.seh.recoverfp must take a function as the first argument");
19327 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19330 case Intrinsic::localaddress: {
19331 // Returns one of the stack, base, or frame pointer registers, depending on
19332 // which is used to reference local variables.
19333 MachineFunction &MF = DAG.getMachineFunction();
19334 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19336 if (RegInfo->hasBasePointer(MF))
19337 Reg = RegInfo->getBaseRegister();
19338 else // This function handles the SP or FP case.
19339 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19340 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
/// Builds the machine node for a gather intrinsic.
///
/// \param Opc       Machine opcode handed to SelectionDAG::getMachineNode.
/// \param Op        The intrinsic node being lowered; supplies the result type.
/// \param Src       First entry of the machine node's operand list.
/// \param Mask      Mask operand; converted to a vector-of-i1 via getMaskNode.
/// \param Base      Base operand of the memory reference.
/// \param Index     Index vector; its element count sizes the mask type.
/// \param ScaleOp   Scale operand; must be a ConstantSDNode (cast<> enforces it).
/// \param Chain     Incoming chain, placed last in the operand list.
/// \param Subtarget Forwarded to getMaskNode and getZeroVector.
/// \returns A merge of result 0 (typed Op.getValueType()) and result 2
///          (MVT::Other) of the created machine node.
///
/// NOTE(review): `dl` is used throughout but is not declared in the lines
/// below - confirm where it comes from.
19345 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19346 SDValue Src, SDValue Mask, SDValue Base,
19347 SDValue Index, SDValue ScaleOp, SDValue Chain,
19348 const X86Subtarget &Subtarget) {
// Re-emit the constant scale as an i8 target constant.
19350 auto *C = cast<ConstantSDNode>(ScaleOp);
19351 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// One i1 mask element per element of the index vector.
19352 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19353 Index.getSimpleValueType().getVectorNumElements());
19355 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// The node yields three results: the gathered value, a mask-typed value and
// an MVT::Other value.
19356 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
// Displacement is the constant 0 and the segment is register 0.
19357 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19358 SDValue Segment = DAG.getRegister(0, MVT::i32);
// NOTE(review): no condition precedes this assignment here, so Src is always
// replaced by a zero vector - confirm whether a guard is intended.
19360 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
19361 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
19362 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
// Result 1 (the mask-typed value) is intentionally not returned.
19363 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
19364 return DAG.getMergeValues(RetOps, dl);
/// Builds the machine node for a scatter intrinsic.
///
/// \param Opc       Machine opcode handed to SelectionDAG::getMachineNode.
/// \param Op        The intrinsic node being lowered.
/// \param Src       Value operand, placed after the mask in the operand list.
/// \param Mask      Mask operand; converted to a vector-of-i1 via getMaskNode.
/// \param Base      Base operand of the memory reference.
/// \param Index     Index vector; its element count sizes the mask type.
/// \param ScaleOp   Scale operand; must be a ConstantSDNode (cast<> enforces it).
/// \param Chain     Incoming chain, placed last in the operand list.
/// \param Subtarget Forwarded to getMaskNode.
/// \returns Result 1 of the created machine node, which has type MVT::Other.
///
/// NOTE(review): `dl` is used throughout but is not declared in the lines
/// below - confirm where it comes from.
19367 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19368 SDValue Src, SDValue Mask, SDValue Base,
19369 SDValue Index, SDValue ScaleOp, SDValue Chain,
19370 const X86Subtarget &Subtarget) {
// Re-emit the constant scale as an i8 target constant.
19372 auto *C = cast<ConstantSDNode>(ScaleOp);
19373 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// Displacement is the constant 0 and the segment is register 0.
19374 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19375 SDValue Segment = DAG.getRegister(0, MVT::i32);
// One i1 mask element per element of the index vector.
19376 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19377 Index.getSimpleValueType().getVectorNumElements());
19379 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Two results: a mask-typed value (result 0) and an MVT::Other value (result 1).
19380 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
// Unlike the gather helper, the memory operands come first, then mask and source.
19381 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
19382 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19383 return SDValue(Res, 1);
/// Builds the machine node for a masked gather/scatter prefetch intrinsic.
///
/// \param Opc       Machine opcode handed to SelectionDAG::getMachineNode.
/// \param Op        The intrinsic node being lowered.
/// \param Mask      Mask operand; converted to a vector-of-i1 via getMaskNode.
/// \param Base      Base operand of the memory reference.
/// \param Index     Index vector; its element count sizes the mask type.
/// \param ScaleOp   Scale operand; must be a ConstantSDNode (cast<> enforces it).
/// \param Chain     Incoming chain, placed last in the operand list.
/// \param Subtarget Forwarded to getMaskNode.
/// \returns Result 0 of the created machine node, whose only result type is
///          MVT::Other.
///
/// NOTE(review): `dl` and `MaskVT` are used below without a visible
/// declaration - confirm against the full definition.
19386 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19387 SDValue Mask, SDValue Base, SDValue Index,
19388 SDValue ScaleOp, SDValue Chain,
19389 const X86Subtarget &Subtarget) {
// Re-emit the constant scale as an i8 target constant.
19391 auto *C = cast<ConstantSDNode>(ScaleOp);
19392 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
// Displacement is the constant 0 and the segment is register 0.
19393 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19394 SDValue Segment = DAG.getRegister(0, MVT::i32);
// A vector-of-i1 type with one element per element of the index vector.
19396 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
19397 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
// Leftover from an earlier form; the single result type is passed directly below.
19398 //SDVTList VTs = DAG.getVTList(MVT::Other);
19399 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
19400 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
19401 return SDValue(Res, 0);
19404 /// Handles the lowering of the builtin intrinsic that returns the value
19405 /// of the extended control register.
///
/// \param N         The intrinsic node; must have exactly 3 operands. Operand 0
///                  is used as the incoming chain and operand 2 is copied
///                  into ECX.
/// \param DL        Debug location attached to every node created here.
/// \param Subtarget Selects between the 64-bit and 32-bit register sequences.
/// \param Results   Output: receives the 64-bit value followed by the chain.
19406 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
19408 const X86Subtarget &Subtarget,
19409 SmallVectorImpl<SDValue> &Results) {
19410 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
// XGETBV produces a chain (result 0) and a glue (result 1).
19411 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19414 // The ECX register is used to select the index of the XCR register to return.
19417 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
19418 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
19419 Chain = SDValue(N1, 0);
19421 // Reads the content of XCR and returns it in registers EDX:EAX.
// In 64-bit mode the halves are copied out of RAX/RDX as i64; otherwise out
// of EAX/EDX as i32. The first copy is glued to XGETBV via its result 1.
19422 if (Subtarget.is64Bit()) {
19423 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
19424 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19427 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
19428 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
// Continue the chain from the second register copy.
19431 Chain = HI.getValue(1);
19433 if (Subtarget.is64Bit()) {
19434 // Merge the two 32-bit values into a 64-bit one: (HI << 32) | LO.
19435 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19436 DAG.getConstant(32, DL, MVT::i8));
19437 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19438 Results.push_back(Chain);
19442 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19443 SDValue Ops[] = { LO, HI };
19444 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19445 Results.push_back(Pair);
19446 Results.push_back(Chain);
19449 /// Handles the lowering of builtin intrinsics that read performance monitor
19450 /// counters (x86_rdpmc).
///
/// \param N         The intrinsic node; must have exactly 3 operands. Operand 0
///                  is used as the incoming chain.
/// \param DL        Debug location attached to every node created here.
/// \param Subtarget Selects between the 64-bit and 32-bit register sequences.
/// \param Results   Output: receives the 64-bit counter value followed by the
///                  chain.
19451 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
19453 const X86Subtarget &Subtarget,
19454 SmallVectorImpl<SDValue> &Results) {
19455 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
// The RDPMC node produces a chain (result 0) and a glue (result 1).
19456 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19459 // The ECX register is used to select the index of the performance counter
// to read.
19461 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
19463 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
19465 // Reads the content of a 64-bit performance counter and returns it in the
19466 // registers EDX:EAX.
// In 64-bit mode the halves are copied out of RAX/RDX as i64; otherwise out
// of EAX/EDX as i32. The first copy is glued to the RDPMC node.
19467 if (Subtarget.is64Bit()) {
19468 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19469 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19472 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19473 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
// Continue the chain from the second register copy.
19476 Chain = HI.getValue(1);
19478 if (Subtarget.is64Bit()) {
19479 // The EAX register is loaded with the low-order 32 bits. The EDX register
19480 // is loaded with the supported high-order bits of the counter.
// Combine them as (HI << 32) | LO.
19481 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19482 DAG.getConstant(32, DL, MVT::i8));
19483 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19484 Results.push_back(Chain);
19488 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19489 SDValue Ops[] = { LO, HI };
19490 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19491 Results.push_back(Pair);
19492 Results.push_back(Chain);
19495 /// Handles the lowering of builtin intrinsics that read the time stamp counter
19496 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
19497 /// READCYCLECOUNTER nodes.
///
/// \param N         The node being lowered. Operand 0 is used as the incoming
///                  chain; for RDTSCP, operand 2 is the address that receives
///                  the ECX value.
/// \param DL        Debug location attached to every node created here.
/// \param Opcode    Opcode of the node to create; X86ISD::RDTSCP_DAG enables the
///                  extra ECX store.
/// \param Subtarget Selects between the 64-bit and 32-bit register sequences.
/// \param Results   Output: receives the 64-bit counter value followed by the
///                  chain.
19498 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
19500 const X86Subtarget &Subtarget,
19501 SmallVectorImpl<SDValue> &Results) {
// The counter node produces a chain (result 0) and a glue (result 1).
19502 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19503 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
19506 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
19507 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
19508 // and the EAX register is loaded with the low-order 32 bits.
19509 if (Subtarget.is64Bit()) {
19510 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19511 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19514 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19515 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
// Continue the chain from the second register copy.
19518 SDValue Chain = HI.getValue(1);
19520 if (Opcode == X86ISD::RDTSCP_DAG) {
19521 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19523 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
19524 // the ECX register. Add 'ecx' explicitly to the chain.
19525 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
19527 // Explicitly store the content of ECX at the location passed in input
19528 // to the 'rdtscp' intrinsic.
19529 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
19530 MachinePointerInfo());
19533 if (Subtarget.is64Bit()) {
19534 // The EDX register is loaded with the high-order 32 bits of the MSR, and
19535 // the EAX register is loaded with the low-order 32 bits.
// Combine them as (HI << 32) | LO.
19536 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19537 DAG.getConstant(32, DL, MVT::i8));
19538 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19539 Results.push_back(Chain);
19543 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19544 SDValue Ops[] = { LO, HI };
19545 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19546 Results.push_back(Pair);
19547 Results.push_back(Chain);
/// Custom-lowers a READCYCLECOUNTER node by delegating to
/// getReadTimeStampCounter with the X86ISD::RDTSC_DAG opcode.
///
/// \returns A merge of the values that getReadTimeStampCounter appended to
///          Results.
///
/// NOTE(review): `DL` is used below without a visible declaration - confirm
/// against the full definition.
19550 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
19551 SelectionDAG &DAG) {
19552 SmallVector<SDValue, 2> Results;
19554 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
19556 return DAG.getMergeValues(Results, DL);
/// Records, in the function's WinEHFuncInfo, the frame index named by
/// operand 2 of \p Op as the EH registration node (EHRegNodeFrameIndex).
///
/// Two fatal-error messages are present: one for functions without WinEH
/// info and one for an operand that is not a frame index.
///
/// NOTE(review): the conditions guarding the two report_fatal_error calls and
/// the final return statement are not present in the lines below - confirm
/// against the full definition.
19559 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
19560 MachineFunction &MF = DAG.getMachineFunction();
19561 SDValue Chain = Op.getOperand(0);
19562 SDValue RegNode = Op.getOperand(2);
19563 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19565 report_fatal_error("EH registrations only live in functions using WinEH");
19567 // Cast the operand to an alloca, and remember the frame index.
// dyn_cast yields null when RegNode is not a FrameIndexSDNode.
19568 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
19570 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
19571 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
19573 // Return the chain operand without making any DAG nodes.
/// Records, in the function's WinEHFuncInfo, the frame index named by
/// operand 2 of \p Op as the EH guard slot (EHGuardFrameIndex).
///
/// Two fatal-error messages are present: one for functions without WinEH
/// info and one for an operand that is not a frame index.
///
/// NOTE(review): the conditions guarding the two report_fatal_error calls and
/// the final return statement are not present in the lines below - confirm
/// against the full definition.
19577 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
19578 MachineFunction &MF = DAG.getMachineFunction();
19579 SDValue Chain = Op.getOperand(0);
19580 SDValue EHGuard = Op.getOperand(2);
19581 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19583 report_fatal_error("EHGuard only live in functions using WinEH");
19585 // Cast the operand to an alloca, and remember the frame index.
// dyn_cast yields null when EHGuard is not a FrameIndexSDNode.
19586 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
19588 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
19589 EHInfo->EHGuardFrameIndex = FINode->getIndex();
19591 // Return the chain operand without making any DAG nodes.
19595 /// Emit Truncating Store with signed or unsigned saturation.
///
/// \param SignedSat Presumably selects the signed node (TruncSStoreSDNode)
///                  over the unsigned one (TruncUSStoreSDNode); the selecting
///                  expression is not present in the lines below - confirm.
/// \param Chain     Incoming chain; first operand of the store node.
/// \param Dl        Debug location for the new node.
/// \param Val       Value to store.
/// \param Ptr       Destination address; also supplies the type of the undef
///                  fourth operand.
/// \param MemVT     Memory value type passed to getTargetMemSDNode.
/// \param MMO       Memory operand passed to getTargetMemSDNode.
19597 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
19598 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
19599 SelectionDAG &DAG) {
// The store node has a single result of type MVT::Other.
19601 SDVTList VTs = DAG.getVTList(MVT::Other);
// The fourth operand is an undef of the pointer's type.
19602 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
19603 SDValue Ops[] = { Chain, Val, Ptr, Undef };
19605 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19606 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19609 /// Emit Masked Truncating Store with signed or unsigned saturation.
///
/// \param SignedSat Presumably selects the signed node
///                  (MaskedTruncSStoreSDNode) over the unsigned one
///                  (MaskedTruncUSStoreSDNode); the selecting expression is
///                  not present in the lines below - confirm.
/// \param Chain     Incoming chain; first operand of the store node.
/// \param Dl        Debug location for the new node.
/// \param Val       Value to store; last operand of the node.
/// \param Ptr       Destination address.
/// \param Mask      Store mask.
/// \param MemVT     Memory value type passed to getTargetMemSDNode.
/// \param MMO       Memory operand passed to getTargetMemSDNode.
19611 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
19612 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
19613 MachineMemOperand *MMO, SelectionDAG &DAG) {
// The store node has a single result of type MVT::Other.
19615 SDVTList VTs = DAG.getVTList(MVT::Other);
// Note the operand order: the value comes after the pointer and the mask.
19616 SDValue Ops[] = { Chain, Ptr, Mask, Val };
19618 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19619 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
/// Custom-lowers an intrinsic node that carries a chain.
///
/// The intrinsic ID is read from operand 1; the handlers below use operand 0
/// as the incoming chain and operands 2 and up as the intrinsic's arguments.
/// A few intrinsics are recognized directly by ID (x86_seh_ehregnode,
/// x86_seh_ehguard and the x86_flags_read/write family). Everything else is
/// dispatched on IntrData->Type, where IntrData is the table entry returned
/// by getIntrinsicWithChain.
///
/// NOTE(review): several sections of the switch below are introduced only by
/// a comment, with no `case` label in front of them, and `dl` is used without
/// a visible declaration - confirm both against the full definition.
19622 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19623 SelectionDAG &DAG) {
19624 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
19626 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
// The SEH bookkeeping intrinsics only record a frame index.
19628 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
19629 return MarkEHRegistrationNode(Op, DAG);
19630 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
19631 return MarkEHGuard(Op, DAG);
19632 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
19633 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
19634 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
19635 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
19636 // We need a frame pointer because this will get lowered to a PUSH/POP
19638 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19639 MFI.setHasCopyImplyingStackAdjustment(true);
19640 // Don't do anything here, we will expand these intrinsics out later
19641 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
19648 switch(IntrData->Type) {
19649 default: llvm_unreachable("Unknown Intrinsic Type");
// Random-number section (the comments below mention RDRAND/RDSEED).
19652 // Emit the node with the right value type.
19653 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
19654 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19656 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
19657 // Otherwise return the value from Rand, which is always 0, casted to i32.
// CMOV operands: value if the condition is false, value if true, the
// condition code (COND_B), and the flag-carrying result 1 of the node.
19658 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
19659 DAG.getConstant(1, dl, Op->getValueType(1)),
19660 DAG.getConstant(X86::COND_B, dl, MVT::i32),
19661 SDValue(Result.getNode(), 1) };
19662 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
19663 DAG.getVTList(Op->getValueType(1), MVT::Glue),
19666 // Return { result, isValid, chain }.
19667 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
19668 SDValue(Result.getNode(), 2));
19671 // Gather. Operands as read below: (chain, id, src, base, index, mask, scale).
19672 SDValue Chain = Op.getOperand(0);
19673 SDValue Src = Op.getOperand(2);
19674 SDValue Base = Op.getOperand(3);
19675 SDValue Index = Op.getOperand(4);
19676 SDValue Mask = Op.getOperand(5);
19677 SDValue Scale = Op.getOperand(6);
19678 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
19682 //scatter(base, mask, index, v1, scale);
19683 SDValue Chain = Op.getOperand(0);
19684 SDValue Base = Op.getOperand(2);
19685 SDValue Mask = Op.getOperand(3);
19686 SDValue Index = Op.getOperand(4);
19687 SDValue Src = Op.getOperand(5);
19688 SDValue Scale = Op.getOperand(6);
19689 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
19690 Scale, Chain, Subtarget);
// Prefetch. Operands as read below: (chain, id, mask, index, base, scale, hint).
// The constant hint (0 or 1) picks between the two opcodes in the table entry.
19693 SDValue Hint = Op.getOperand(6);
19694 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
19695 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
19696 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
19697 SDValue Chain = Op.getOperand(0);
19698 SDValue Mask = Op.getOperand(2);
19699 SDValue Index = Op.getOperand(3);
19700 SDValue Base = Op.getOperand(4);
19701 SDValue Scale = Op.getOperand(5);
19702 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
19705 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
19707 SmallVector<SDValue, 2> Results;
19708 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
19710 return DAG.getMergeValues(Results, dl);
19712 // Read Performance Monitoring Counters.
19714 SmallVector<SDValue, 2> Results;
19715 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
19716 return DAG.getMergeValues(Results, dl);
19718 // Get Extended Control Register.
19720 SmallVector<SDValue, 2> Results;
19721 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
19722 return DAG.getMergeValues(Results, dl);
19724 // XTEST intrinsics.
// The result is the zero-extended COND_NE setcc of the node's value,
// merged with the node's chain (result 1).
19726 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19727 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19729 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
19730 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
19731 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
19732 Ret, SDValue(InTrans.getNode(), 1));
// Carry-in arithmetic section. Operands as read below: operand 2 is the
// carry-in, operands 3 and 4 are the values, operand 5 is the store address.
// Adding -1 (i8) to the carry-in regenerates the flag consumed by Opc0.
19736 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19737 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
19738 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
19739 DAG.getConstant(-1, dl, MVT::i8));
19740 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
19741 Op.getOperand(4), GenCF.getValue(1));
// The arithmetic result is stored to memory; the carry-out (COND_B) is returned.
19742 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
19743 Op.getOperand(5), MachinePointerInfo());
19744 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
19745 SDValue Results[] = { SetCC, Store };
19746 return DAG.getMergeValues(Results, dl);
// Operands as read below: (chain, id, addr, data, mask).
19748 case COMPRESS_TO_MEM: {
19749 SDValue Mask = Op.getOperand(4);
19750 SDValue DataToCompress = Op.getOperand(3);
19751 SDValue Addr = Op.getOperand(2);
19752 SDValue Chain = Op.getOperand(0);
19753 MVT VT = DataToCompress.getSimpleValueType();
19755 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19756 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19758 if (isAllOnesConstant(Mask)) // return just a store
19759 return DAG.getStore(Chain, dl, DataToCompress, Addr,
19760 MemIntr->getMemOperand());
// Otherwise emit a compressing masked store with one i1 per data element.
19762 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19763 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19765 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
19766 MemIntr->getMemOperand(),
19767 false /* truncating */, true /* compressing */);
// Operands as read below: (chain, id, addr, data, mask).
19769 case TRUNCATE_TO_MEM_VI8:
19770 case TRUNCATE_TO_MEM_VI16:
19771 case TRUNCATE_TO_MEM_VI32: {
19772 SDValue Mask = Op.getOperand(4);
19773 SDValue DataToTruncate = Op.getOperand(3);
19774 SDValue Addr = Op.getOperand(2);
19775 SDValue Chain = Op.getOperand(0);
19777 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19778 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19780 EVT MemVT = MemIntr->getMemoryVT();
// Opc0 tells plain truncation apart from the two saturating forms.
19782 uint16_t TruncationOp = IntrData->Opc0;
19783 switch (TruncationOp) {
19784 case X86ISD::VTRUNC: {
19785 if (isAllOnesConstant(Mask)) // return just a truncate store
19786 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
19787 MemIntr->getMemOperand());
19789 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19790 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19792 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
19793 MemIntr->getMemOperand(), true /* truncating */);
19795 case X86ISD::VTRUNCUS:
19796 case X86ISD::VTRUNCS: {
19797 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
// An all-ones mask needs no masking: emit the unmasked saturating store.
19798 if (isAllOnesConstant(Mask))
19799 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
19800 MemIntr->getMemOperand(), DAG);
19802 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
19803 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19805 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
19806 VMask, MemVT, MemIntr->getMemOperand(), DAG);
19809 llvm_unreachable("Unsupported truncstore intrinsic");
// Operands as read below: (chain, id, addr, passthru, mask).
19813 case EXPAND_FROM_MEM: {
19814 SDValue Mask = Op.getOperand(4);
19815 SDValue PassThru = Op.getOperand(3);
19816 SDValue Addr = Op.getOperand(2);
19817 SDValue Chain = Op.getOperand(0);
19818 MVT VT = Op.getSimpleValueType();
19820 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
19821 assert(MemIntr && "Expected MemIntrinsicSDNode!");
19823 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
19824 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
// NOTE(review): a zero mask yields UNDEF rather than PassThru - confirm
// this is intended.
19825 if (X86::isZeroNode(Mask))
19826 return DAG.getUNDEF(VT);
// Otherwise emit an expanding masked load with one i1 per result element.
19828 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19829 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19830 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
19831 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
19832 true /* expanding */);
/// Lowers a return-address query into a load of the saved return address.
/// The frame info is told that the return address is taken before any node is
/// built.
// NOTE(review): the embedded source numbers skip in this listing, so the
// statement guarded by the verify check, the declaration of dl, and the test
// that selects between the two return paths below are not visible here.
// Confirm them against the complete file before editing.
19837 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
19838 SelectionDAG &DAG) const {
19839 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19840 MFI.setReturnAddressIsTaken(true);
19842 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
// Operand 0 is read as a constant and interpreted as the frame depth.
19845 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19847 EVT PtrVT = getPointerTy(DAG.getDataLayout());
// First path: start from the frame address produced by LowerFRAMEADDR and load
// the pointer stored at that address plus one register slot size.
19850 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
19851 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19852 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
19853 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
19854 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
19855 MachinePointerInfo());
19858 // Just load the return address.
// Second path: load directly through the return-address frame index.
19859 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
19860 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
19861 MachinePointerInfo());
/// Lowers an address-of-return-address query. Records that the return address
/// is taken and returns the node from getReturnAddressFrameIndex as is, with
/// no load through it.
19864 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
19865 SelectionDAG &DAG) const {
19866 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
19867 return getReturnAddressFrameIndex(DAG);
/// Lowers a frame-address query.
///
/// On targets whose MCAsmInfo reports Windows CFI, the result is a frame index
/// for a fixed stack object that is created on first use and cached in the
/// X86MachineFunctionInfo. Otherwise the result starts as a copy of the
/// pointer-sized frame register and is then loaded through.
// NOTE(review): the embedded source numbers skip in this listing. The loop or
// condition that ties the final load to Depth, the closing braces and the
// final return are not visible here; confirm against the complete file.
19870 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
19871 MachineFunction &MF = DAG.getMachineFunction();
19872 MachineFrameInfo &MFI = MF.getFrameInfo();
19873 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
19874 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19875 EVT VT = Op.getValueType();
19877 MFI.setFrameAddressIsTaken(true);
19879 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
19880 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
19881 // is not possible to crawl up the stack without looking at the unwind codes
// A stored index of zero is treated as "not created yet".
19883 int FrameAddrIndex = FuncInfo->getFAIndex();
19884 if (!FrameAddrIndex) {
19885 // Set up a fixed frame object (one slot, offset 0) to stand for the frame
// address, and remember its index so later queries reuse it.
19886 unsigned SlotSize = RegInfo->getSlotSize();
19887 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
19888 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
19889 FuncInfo->setFAIndex(FrameAddrIndex);
19891 return DAG.getFrameIndex(FrameAddrIndex, VT);
// Generic path: read the frame register, whose width must match VT.
19894 unsigned FrameReg =
19895 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19896 SDLoc dl(Op); // FIXME probably not meaningful
19897 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19898 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
19899 (FrameReg == X86::EBP && VT == MVT::i32)) &&
19900 "Invalid Frame Register!");
19901 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
// Follow the saved frame pointer: replace FrameAddr by the value stored at it.
// Presumably repeated Depth times by a loop header not visible here.
19903 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
19904 MachinePointerInfo());
/// Maps a register name to its register number. The names visible in this
/// table are esp, rsp, ebp and rbp. A frame-pointer name is only accepted when
/// the function has a frame pointer; otherwise a fatal error is reported.
// NOTE(review): the end of the StringSwitch chain, the successful return and
// the closing braces are not visible in this listing; the final fatal error is
// presumably reached only when no name matched. Confirm against the full file.
19908 // FIXME? Maybe this could be a TableGen attribute on some registers and
19909 // this table could be generated automatically from RegInfo.
19910 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
19911 SelectionDAG &DAG) const {
19912 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
19913 const MachineFunction &MF = DAG.getMachineFunction();
19915 unsigned Reg = StringSwitch<unsigned>(RegName)
19916 .Case("esp", X86::ESP)
19917 .Case("rsp", X86::RSP)
19918 .Case("ebp", X86::EBP)
19919 .Case("rbp", X86::RBP)
// EBP and RBP may only be named when they really serve as the frame pointer.
19922 if (Reg == X86::EBP || Reg == X86::RBP) {
19923 if (!TFI.hasFP(MF))
19924 report_fatal_error("register " + StringRef(RegName) +
19925 " is allocatable: function has no frame pointer");
// Sanity check that the pointer-sized frame register is EBP or RBP.
19928 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19929 unsigned FrameReg =
19930 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
19931 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
19932 "Invalid Frame Register!");
19940 report_fatal_error("Invalid register name global variable");
/// Returns a pointer-sized constant equal to twice the register slot size.
// NOTE(review): the two slots presumably correspond to the saved frame pointer
// and the return address; this is not stated in the visible code.
19943 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
19944 SelectionDAG &DAG) const {
19945 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19946 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
/// Selects the register that carries the exception pointer for the given
/// personality function: RDX or EDX for the CoreCLR personality, RAX or EAX
/// for every other one. The 64-bit register is chosen when the subtarget
/// reports a 64-bit LP64 target.
19949 unsigned X86TargetLowering::getExceptionPointerRegister(
19950 const Constant *PersonalityFn) const {
19951 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
19952 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
19954 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
/// Selects the register that carries the exception selector: RDX on a 64-bit
/// LP64 target, EDX otherwise. Must not be called for funclet personalities,
/// which is checked by the assertion.
19957 unsigned X86TargetLowering::getExceptionSelectorRegister(
19958 const Constant *PersonalityFn) const {
19959 // Funclet personalities don't use selectors (the runtime does the selection).
19960 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
19961 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
/// Reports whether fixed catch objects are needed; the answer is exactly what
/// the subtarget returns from isTargetWin64().
19964 bool X86TargetLowering::needsFixedCatchObjects() const {
19965 return Subtarget.isTargetWin64();
/// Lowers an EH return. Operand 0 is the chain, operand 1 the offset and
/// operand 2 the handler. The handler is stored at
/// frame register + slot size + offset, that address is copied into RCX or
/// ECX, and an X86ISD::EH_RETURN node is built on the resulting chain.
// NOTE(review): the declaration of dl and the tail of the first ADD statement
// are not visible in this listing; confirm against the complete file.
19968 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
19969 SDValue Chain = Op.getOperand(0);
19970 SDValue Offset = Op.getOperand(1);
19971 SDValue Handler = Op.getOperand(2);
19974 EVT PtrVT = getPointerTy(DAG.getDataLayout());
19975 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19976 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
// The frame register width has to agree with the pointer type.
19977 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
19978 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
19979 "Invalid Frame Register!");
19980 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
// The store address is handed to the EH_RETURN node in RCX (64-bit pointers)
// or ECX (32-bit pointers).
19981 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
// StoreAddr = Frame + slot size, then + Offset.
19983 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
19984 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
19986 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
// Write the handler to that location, then publish the location in the
// chosen register; both steps are threaded through Chain.
19987 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
19988 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
19990 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
19991 DAG.getRegister(StoreAddrReg, PtrVT));
/// Lowers the SjLj setjmp operation to an X86ISD::EH_SJLJ_SETJMP node that
/// produces an i32 value and a chain, forwarding operands 0 and 1 unchanged.
// NOTE(review): the declaration of DL is not visible in this listing.
19994 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
19995 SelectionDAG &DAG) const {
19997 // If the subtarget is not 64bit, we may need the global base reg
19998 // after isel expand pseudo, i.e., after CGBR pass ran.
19999 // Therefore, ask for the GlobalBaseReg now, so that the pass
20000 // inserts the code for us in case we need it.
20001 // Otherwise, we will end up in a situation where we will
20002 // reference a virtual register that is not defined!
20003 if (!Subtarget.is64Bit()) {
20004 const X86InstrInfo *TII = Subtarget.getInstrInfo();
// Called only for its side effect; the returned register is discarded.
20005 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20007 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20008 DAG.getVTList(MVT::i32, MVT::Other),
20009 Op.getOperand(0), Op.getOperand(1));
/// Lowers the SjLj longjmp operation to an X86ISD::EH_SJLJ_LONGJMP node whose
/// only result is a chain, forwarding operands 0 and 1 unchanged.
// NOTE(review): the declaration of DL is not visible in this listing.
20012 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20013 SelectionDAG &DAG) const {
20015 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20016 Op.getOperand(0), Op.getOperand(1));
/// Lowers the SjLj setup-dispatch operation to an
/// X86ISD::EH_SJLJ_SETUP_DISPATCH node whose only result is a chain.
// NOTE(review): the declaration of DL and the operand list that completes the
// getNode call are not visible in this listing; confirm against the full file.
20019 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20020 SelectionDAG &DAG) const {
20022 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
/// Lowers the adjust-trampoline operation by returning operand 0 unchanged;
/// no new node is created and DAG is not used.
20026 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20027 return Op.getOperand(0);
/// Lowers the init-trampoline operation by emitting the stores that write a
/// small machine-code stub into the trampoline memory.
///
/// Operands: 0 = chain, 1 = trampoline address, 2 = nested function pointer,
/// 3 = nest parameter value, 4 = source value of the trampoline address and,
/// on the 32-bit path, 5 = source value naming the nested function.
///
/// 64-bit stub layout (byte offsets from the trampoline address):
///    0: REX prefix + movabsq opcode for r11     2: function pointer
///   10: REX prefix + movabsq opcode for r10    12: nest value
///   20: REX prefix + jmpq opcode               22: ModRM byte selecting r11
///
/// 32-bit stub layout:
///    0: mov imm32 opcode for the nest register  1: nest value
///    5: jmp rel32 opcode                        6: displacement
///
/// Every store is chained on the incoming chain and the stores are merged
/// with a TokenFactor.
// NOTE(review): the embedded source numbers skip in this listing. Not visible
// here: the declaration of dl, the assignments to OutChains[1] and
// OutChains[3] on both paths and OutChains[0] on the 32-bit path, the branch
// separating the 64-bit and 32-bit paths, the declarations of NestReg and Idx,
// and the switch header over the calling convention. Confirm against the
// complete file before editing.
20030 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20031 SelectionDAG &DAG) const {
20032 SDValue Root = Op.getOperand(0);
20033 SDValue Trmp = Op.getOperand(1); // trampoline
20034 SDValue FPtr = Op.getOperand(2); // nested function
20035 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20038 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20039 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20041 if (Subtarget.is64Bit()) {
20042 SDValue OutChains[6];
20044 // Large code-model.
20045 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20046 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
// Only the low three bits of each register encoding are kept for the opcode
// and ModRM fields.
20048 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20049 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20051 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20053 // Load the pointer to the nested function into R11.
// The 16-bit constant holds the REX prefix in its low byte and the opcode in
// its high byte.
20054 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20055 SDValue Addr = Trmp;
20056 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20057 Addr, MachinePointerInfo(TrmpAddr));
// Immediate operand of the first movabsq: the function pointer at offset 2.
20059 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20060 DAG.getConstant(2, dl, MVT::i64));
20062 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20063 /* Alignment = */ 2);
20065 // Load the 'nest' parameter value into R10.
20066 // R10 is specified in X86CallingConv.td
20067 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20068 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20069 DAG.getConstant(10, dl, MVT::i64));
20070 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20071 Addr, MachinePointerInfo(TrmpAddr, 10));
// Immediate operand of the second movabsq: the nest value at offset 12.
20073 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20074 DAG.getConstant(12, dl, MVT::i64));
20076 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20077 /* Alignment = */ 2);
20079 // Jump to the nested function.
20080 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20081 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20082 DAG.getConstant(20, dl, MVT::i64));
20083 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20084 Addr, MachinePointerInfo(TrmpAddr, 20));
// ModRM byte: mod = 3, reg field = 4, rm = low bits of r11.
20086 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20087 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20088 DAG.getConstant(22, dl, MVT::i64));
20089 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20090 Addr, MachinePointerInfo(TrmpAddr, 22));
20092 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
// 32-bit path: the nest register depends on the calling convention of the
// nested function.
20094 const Function *Func =
20095 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20096 CallingConv::ID CC = Func->getCallingConv();
20101 llvm_unreachable("Unsupported calling convention");
20102 case CallingConv::C:
20103 case CallingConv::X86_StdCall: {
20104 // Pass 'nest' parameter in ECX.
20105 // Must be kept in sync with X86CallingConv.td
20106 NestReg = X86::ECX;
20108 // Check that ECX wasn't needed by an 'inreg' parameter.
20109 FunctionType *FTy = Func->getFunctionType();
20110 const AttributeSet &Attrs = Func->getAttributes();
20112 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20113 unsigned InRegCount = 0;
// Count the 32-bit units taken by inreg parameters (sizes rounded up).
20116 for (FunctionType::param_iterator I = FTy->param_begin(),
20117 E = FTy->param_end(); I != E; ++I, ++Idx)
20118 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20119 auto &DL = DAG.getDataLayout();
20120 // FIXME: should only count parameters that are lowered to integers.
20121 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
// More than two units means the nest register is already occupied.
20124 if (InRegCount > 2) {
20125 report_fatal_error("Nest register in use - reduce number of inreg"
20131 case CallingConv::X86_FastCall:
20132 case CallingConv::X86_ThisCall:
20133 case CallingConv::Fast:
20134 // Pass 'nest' parameter in EAX.
20135 // Must be kept in sync with X86CallingConv.td
20136 NestReg = X86::EAX;
20140 SDValue OutChains[4];
20141 SDValue Addr, Disp;
// The jmp displacement is the target minus (trampoline + 10), i.e. relative
// to the first byte after the 10-byte stub.
20143 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20144 DAG.getConstant(10, dl, MVT::i32));
20145 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20147 // This is storing the opcode for MOV32ri.
20148 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20149 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20151 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20152 Trmp, MachinePointerInfo(TrmpAddr));
// Immediate operand of the mov: the nest value at offset 1.
20154 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20155 DAG.getConstant(1, dl, MVT::i32));
20157 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20158 /* Alignment = */ 1);
20160 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20161 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20162 DAG.getConstant(5, dl, MVT::i32));
20163 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20164 Addr, MachinePointerInfo(TrmpAddr, 5),
20165 /* Alignment = */ 1);
// Operand of the jmp: the displacement at offset 6.
20167 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20168 DAG.getConstant(6, dl, MVT::i32));
20170 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20171 /* Alignment = */ 1);
20173 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
/// Lowers the FLT_ROUNDS query. The FP control word is written to a fresh
/// 2-byte stack slot with an X86ISD::FNSTCW16m node, loaded back as an i16,
/// and its two rounding bits are rearranged with
/// ((bit11 | (bit10 << 1)) + 1) & 3. The result is truncated or zero-extended
/// to the requested type.
// NOTE(review): the embedded source numbers skip in this listing. The
// delimiters of the explanatory text below, the declaration of DL and the
// declarations that receive the load and the three transform expressions
// (used as CWD, CWD1, CWD2 and RetVal) are not visible here. Confirm against
// the complete file before editing.
20177 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20178 SelectionDAG &DAG) const {
20180 The rounding mode is in bits 11:10 of FPSR, and has the following
20182 00 Round to nearest
20187 FLT_ROUNDS, on the other hand, expects the following:
20194 To perform the conversion, we do:
20195 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20198 MachineFunction &MF = DAG.getMachineFunction();
20199 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20200 unsigned StackAlignment = TFI.getStackAlignment();
20201 MVT VT = Op.getSimpleValueType();
20204 // Save FP Control Word to stack slot
20205 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20206 SDValue StackSlot =
20207 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
// Memory operand describing a 2-byte store with 2-byte alignment to the slot.
20209 MachineMemOperand *MMO =
20210 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20211 MachineMemOperand::MOStore, 2, 2);
20213 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20214 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20215 DAG.getVTList(MVT::Other),
20216 Ops, MVT::i16, MMO);
20218 // Load FP Control Word from stack slot
20220 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20222 // Transform as necessary
// (CWD & 0x800) >> 11 moves bit 11 down to bit 0; presumably bound to CWD1.
20224 DAG.getNode(ISD::SRL, DL, MVT::i16,
20225 DAG.getNode(ISD::AND, DL, MVT::i16,
20226 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20227 DAG.getConstant(11, DL, MVT::i8));
// (CWD & 0x400) >> 9 moves bit 10 down to bit 1; presumably bound to CWD2.
20229 DAG.getNode(ISD::SRL, DL, MVT::i16,
20230 DAG.getNode(ISD::AND, DL, MVT::i16,
20231 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20232 DAG.getConstant(9, DL, MVT::i8));
// ((CWD1 | CWD2) + 1) & 3; presumably bound to RetVal.
20235 DAG.getNode(ISD::AND, DL, MVT::i16,
20236 DAG.getNode(ISD::ADD, DL, MVT::i16,
20237 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20238 DAG.getConstant(1, DL, MVT::i16)),
20239 DAG.getConstant(3, DL, MVT::i16));
// The computation is done in i16; adapt the width to the result type.
20241 return DAG.getNode((VT.getSizeInBits() < 16 ?
20242 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20245 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
20247 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded
20248 // to a 512-bit vector.
20249 // 2. i8/i16 vectors are implemented using the dword LZCNT vector instruction
20250 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20251 // split the vector, perform the operation on its Lo and Hi parts and
20252 // concatenate the results.
// NOTE(review): the embedded source numbers skip in this listing. The
// declaration of dl, the middle operand of the INSERT_SUBVECTOR call and the
// declaration of Lo and Hi are not visible here; confirm against the full file.
20253 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
20254 assert(Op.getOpcode() == ISD::CTLZ);
20256 MVT VT = Op.getSimpleValueType();
20257 MVT EltVT = VT.getVectorElementType();
20258 unsigned NumElems = VT.getVectorNumElements();
20260 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
20261 // Extend to 512 bit vector.
20262 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20263 "Unsupported value type for operation");
// Same element type, as many elements as fit in 512 bits. The input is placed
// at index 0 of an undef vector, CTLZ runs on the wide vector and the low
// subvector is extracted again.
20265 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
20266 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
20267 DAG.getUNDEF(NewVT),
20269 DAG.getIntPtrConstant(0, dl));
20270 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
20272 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
20273 DAG.getIntPtrConstant(0, dl));
20276 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20277 "Unsupported element type");
20279 if (16 < NumElems) {
20280 // Split vector, its Lo and Hi parts will be handled in next iteration.
20282 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
20283 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
20285 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
20286 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
20288 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
// At most 16 narrow elements remain: widen each one to i32.
20291 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20293 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20294 "Unsupported value type for operation");
20296 // Use native supported vector instruction vplzcntd.
20297 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20298 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20299 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
// Zero extension added (32 - element width) leading zeros; subtract them.
20300 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
20302 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20305 // Lower CTLZ using a PSHUFB lookup table implementation.
// The input is viewed as a vector of bytes. Each byte gets its leading-zero
// count from a 16-entry per-nibble table, and the per-byte counts are then
// merged pairwise until the element width of VT is reached.
// NOTE(review): the embedded source numbers skip in this listing. The update
// of CurrVT at the end of the loop body, the closing braces and the final
// return are not visible here; confirm against the complete file.
20306 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20307 const X86Subtarget &Subtarget,
20308 SelectionDAG &DAG) {
20309 MVT VT = Op.getSimpleValueType();
20310 int NumElts = VT.getVectorNumElements();
20311 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20312 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20314 // Per-nibble leading zero PSHUFB lookup table.
20315 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20316 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20317 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
20318 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
// Repeat the 16 table entries to fill a constant vector of NumBytes bytes.
20320 SmallVector<SDValue, 64> LUTVec;
20321 for (int i = 0; i < NumBytes; ++i)
20322 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20323 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
20325 // Begin by bitcasting the input to byte vector, then split those bytes
20326 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
20327 // If the hi input nibble is zero then we add both results together, otherwise
20328 // we just take the hi result (by masking the lo result to zero before the
20330 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
20331 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
20333 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
20334 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
20335 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
20336 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
// HiZ marks the bytes whose high nibble is zero.
20337 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
// Look up both nibbles, keep the low-nibble count only where HiZ is set, and
// add it to the high-nibble count.
20339 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
20340 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
20341 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
20342 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
20344 // Merge result back from vXi8 back to VT, working on the lo/hi halves
20345 // of the current vector width in the same way we did for the nibbles.
20346 // If the upper half of the input element is zero then add the halves'
20347 // leading zero counts together, otherwise just use the upper half's.
20348 // Double the width of the result until we are at target width.
20349 while (CurrVT != VT) {
20350 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
20351 int CurrNumElts = CurrVT.getVectorNumElements();
// The next step uses elements twice as wide and half as many of them.
20352 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
20353 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
20354 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
20356 // Check if the upper half of the input element is zero.
20357 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
20358 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
20359 HiZ = DAG.getBitcast(NextVT, HiZ);
20361 // Move the upper/lower halves to the lower bits as we'll be extending to
20362 // NextVT. Mask the lower result to zero if HiZ is true and add the results
// R0 is the count of the upper half shifted down; R1 is the shifted-down
// mask ANDed with the reinterpreted result.
20364 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
20365 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
20366 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
20367 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
20368 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
/// Dispatches a vector CTLZ to one of three strategies: the AVX-512 lowering
/// when the subtarget has AVX-512; a split into two 128-bit CTLZ nodes for
/// 256-bit vectors without 256-bit integer support; otherwise the PSHUFB
/// lookup-table lowering, which requires SSSE3.
20375 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
20376 const X86Subtarget &Subtarget,
20377 SelectionDAG &DAG) {
20378 MVT VT = Op.getSimpleValueType();
20379 SDValue Op0 = Op.getOperand(0);
20381 if (Subtarget.hasAVX512())
20382 return LowerVectorCTLZ_AVX512(Op, DAG);
20384 // Decompose 256-bit ops into smaller 128-bit ops.
20385 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20386 unsigned NumElems = VT.getVectorNumElements();
20388 // Extract each 128-bit vector, perform ctlz and concat the result.
20389 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
20390 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
20392 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20393 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
20394 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
20397 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
20398 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
/// Lowers a count-leading-zeros node. Vector types are forwarded to
/// LowerVectorCTLZ. Scalars are lowered to a BSR node, an optional CMOV that
/// supplies the result for a zero input, and a final XOR with NumBits - 1.
/// The BSR result is XORed with NumBits - 1 to obtain the leading-zero count.
// NOTE(review): the embedded source numbers skip in this listing. The
// declarations of OpVT and dl, the vector-type test guarding the first
// return, the assignment that widens OpVT for i8, the opening and remaining
// entries of the Ops array, the test guarding the final TRUNCATE and the
// final return are not visible here. Confirm against the complete file.
20401 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
20402 SelectionDAG &DAG) {
20403 MVT VT = Op.getSimpleValueType();
20405 unsigned NumBits = VT.getSizeInBits();
20407 unsigned Opc = Op.getOpcode();
20410 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
20412 Op = Op.getOperand(0);
20413 if (VT == MVT::i8) {
20414 // Zero extend to i32 since there is not an i8 bsr.
20416 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
20419 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
20420 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
20421 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
20423 if (Opc == ISD::CTLZ) {
20424 // If src is zero (i.e. bsr sets ZF), returns NumBits.
// The substitute value is 2 * NumBits - 1 so that the XOR with NumBits - 1
// below turns it into NumBits (for power-of-two NumBits).
20427 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
20428 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20431 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
20434 // Finally xor with NumBits-1.
20435 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
20436 DAG.getConstant(NumBits - 1, dl, OpVT));
// Narrow the result back to i8.
20439 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
/// Lowers a count-trailing-zeros node.
///
/// Vectors: isolate the lowest set bit with x & -x, then either compute
/// (width - 1) - ctlz(lsb) for CTTZ_ZERO_UNDEF or ctpop(lsb - 1) for CTTZ.
/// Scalars: only plain CTTZ is expected; it becomes a BSF node followed by a
/// CMOV that yields NumBits when the source is zero.
// NOTE(review): the embedded source numbers skip in this listing. The
// declaration of dl and the opening and remaining entries of the Ops array
// are not visible here; confirm against the complete file.
20443 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
20444 MVT VT = Op.getSimpleValueType();
20445 unsigned NumBits = VT.getScalarSizeInBits();
20448 if (VT.isVector()) {
20449 SDValue N0 = Op.getOperand(0);
20450 SDValue Zero = DAG.getConstant(0, dl, VT);
20452 // lsb(x) = (x & -x)
20453 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
20454 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
20456 // cttz_undef(x) = (width - 1) - ctlz(lsb)
20457 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
20458 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
20459 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
20460 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
20463 // cttz(x) = ctpop(lsb - 1)
20464 SDValue One = DAG.getConstant(1, dl, VT);
20465 return DAG.getNode(ISD::CTPOP, dl, VT,
20466 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
20469 assert(Op.getOpcode() == ISD::CTTZ &&
20470 "Only scalar CTTZ requires custom lowering");
20472 // Issue a bsf (scan bits forward) which also sets EFLAGS.
20473 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
20474 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
20476 // If src is zero (i.e. bsf sets ZF), returns NumBits.
20479 DAG.getConstant(NumBits, dl, VT),
20480 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20483 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
20486 /// Break a 256-bit integer operation into two new 128-bit ones and then
20487 /// concatenate the result back.
/// The node must be a binary operation on a 256-bit integer vector; both
/// operands are split at element NumElems / 2 and the original opcode is
/// re-issued on each pair of halves.
// NOTE(review): the declaration of dl is not visible in this listing.
20488 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
20489 MVT VT = Op.getSimpleValueType();
20491 assert(VT.is256BitVector() && VT.isInteger() &&
20492 "Unsupported value type for operation");
20494 unsigned NumElems = VT.getVectorNumElements();
20497 // Extract the LHS vectors
20498 SDValue LHS = Op.getOperand(0);
20499 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20500 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20502 // Extract the RHS vectors
20503 SDValue RHS = Op.getOperand(1);
20504 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20505 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
// Half-width result type: same element type, half the element count.
20507 MVT EltVT = VT.getVectorElementType();
20508 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20510 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20511 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20512 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20515 /// Break a 512-bit integer operation into two new 256-bit ones and then
20516 /// concatenate the result back.
/// The node must be a binary operation on a 512-bit integer vector; both
/// operands are split at element NumElems / 2 and the original opcode is
/// re-issued on each pair of halves.
// NOTE(review): the declaration of dl is not visible in this listing.
20517 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
20518 MVT VT = Op.getSimpleValueType();
20520 assert(VT.is512BitVector() && VT.isInteger() &&
20521 "Unsupported value type for operation");
20523 unsigned NumElems = VT.getVectorNumElements();
20526 // Extract the LHS vectors
20527 SDValue LHS = Op.getOperand(0);
20528 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
20529 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
20531 // Extract the RHS vectors
20532 SDValue RHS = Op.getOperand(1);
20533 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
20534 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
// Half-width result type: same element type, half the element count.
20536 MVT EltVT = VT.getVectorElementType();
20537 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20539 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20540 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20541 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
/// Custom lowering for ADD. An i1 add is rewritten as XOR, which gives the
/// same result in one-bit arithmetic. Any other type reaching this function
/// must be a 256-bit integer vector, which is split into two 128-bit
/// operations by Lower256IntArith.
20544 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
20545 if (Op.getValueType() == MVT::i1)
20546 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20547 Op.getOperand(0), Op.getOperand(1));
20548 assert(Op.getSimpleValueType().is256BitVector() &&
20549 Op.getSimpleValueType().isInteger() &&
20550 "Only handle AVX 256-bit vector integer operation");
20551 return Lower256IntArith(Op, DAG);
/// Custom lowering for SUB. An i1 subtract is rewritten as XOR, which gives
/// the same result in one-bit arithmetic. Any other type reaching this
/// function must be a 256-bit integer vector, which is split into two 128-bit
/// operations by Lower256IntArith.
20554 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
20555 if (Op.getValueType() == MVT::i1)
20556 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20557 Op.getOperand(0), Op.getOperand(1));
20558 assert(Op.getSimpleValueType().is256BitVector() &&
20559 Op.getSimpleValueType().isInteger() &&
20560 "Only handle AVX 256-bit vector integer operation");
20561 return Lower256IntArith(Op, DAG);
/// Custom lowering for the min/max nodes routed here. Only 256-bit integer
/// vectors are accepted; they are split into two 128-bit operations by
/// Lower256IntArith.
20564 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
20565 assert(Op.getSimpleValueType().is256BitVector() &&
20566 Op.getSimpleValueType().isInteger() &&
20567 "Only handle AVX 256-bit vector integer operation");
20568 return Lower256IntArith(Op, DAG);
20571 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
20572 SelectionDAG &DAG) {
20574 MVT VT = Op.getSimpleValueType();
20577 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
20579 // Decompose 256-bit ops into smaller 128-bit ops.
20580 if (VT.is256BitVector() && !Subtarget.hasInt256())
20581 return Lower256IntArith(Op, DAG);
20583 SDValue A = Op.getOperand(0);
20584 SDValue B = Op.getOperand(1);
20586 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
20587 // vector pairs, multiply and truncate.
20588 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
20589 if (Subtarget.hasInt256()) {
20590 // For 512-bit vectors, split into 256-bit vectors to allow the
20591 // sign-extension to occur.
20592 if (VT == MVT::v64i8)
20593 return Lower512IntArith(Op, DAG);
20595 // For 256-bit vectors, split into 128-bit vectors to allow the
20596 // sign-extension to occur. We don't need this on AVX512BW as we can
20597 // safely sign-extend to v32i16.
20598 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
20599 return Lower256IntArith(Op, DAG);
20601 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
20602 return DAG.getNode(
20603 ISD::TRUNCATE, dl, VT,
20604 DAG.getNode(ISD::MUL, dl, ExVT,
20605 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
20606 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
20609 assert(VT == MVT::v16i8 &&
20610 "Pre-AVX2 support only supports v16i8 multiplication");
20611 MVT ExVT = MVT::v8i16;
20613 // Extract the lo parts and sign extend to i16
20615 if (Subtarget.hasSSE41()) {
20616 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
20617 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
20619 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20620 -1, 4, -1, 5, -1, 6, -1, 7};
20621 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20622 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20623 ALo = DAG.getBitcast(ExVT, ALo);
20624 BLo = DAG.getBitcast(ExVT, BLo);
20625 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20626 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20629 // Extract the hi parts and sign extend to i16
20631 if (Subtarget.hasSSE41()) {
20632 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20633 -1, -1, -1, -1, -1, -1, -1, -1};
20634 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20635 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20636 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
20637 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
20639 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20640 -1, 12, -1, 13, -1, 14, -1, 15};
20641 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20642 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20643 AHi = DAG.getBitcast(ExVT, AHi);
20644 BHi = DAG.getBitcast(ExVT, BHi);
20645 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20646 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20649 // Multiply, mask the lower 8bits of the lo/hi results and pack
20650 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20651 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20652 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
20653 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
20654 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20657 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
20658 if (VT == MVT::v4i32) {
20659 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
20660 "Should not custom lower when pmuldq is available!");
20662 // Extract the odd parts.
20663 static const int UnpackMask[] = { 1, -1, 3, -1 };
20664 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
20665 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
20667 // Multiply the even parts.
20668 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
20669 // Now multiply odd parts.
20670 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
20672 Evens = DAG.getBitcast(VT, Evens);
20673 Odds = DAG.getBitcast(VT, Odds);
20675 // Merge the two vectors back together with a shuffle. This expands into 2
20677 static const int ShufMask[] = { 0, 4, 2, 6 };
20678 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
20681 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
20682 "Only know how to lower V2I64/V4I64/V8I64 multiply");
20684 // 32-bit vector types used for MULDQ/MULUDQ.
20685 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20687 // MULDQ returns the 64-bit result of the signed multiplication of the lower
20688 // 32-bits. We can lower with this if the sign bits stretch that far.
20689 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
20690 DAG.ComputeNumSignBits(B) > 32) {
20691 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
20692 DAG.getBitcast(MulVT, B));
20695 // Ahi = psrlqi(a, 32);
20696 // Bhi = psrlqi(b, 32);
20698 // AloBlo = pmuludq(a, b);
20699 // AloBhi = pmuludq(a, Bhi);
20700 // AhiBlo = pmuludq(Ahi, b);
20702 // Hi = psllqi(AloBhi + AhiBlo, 32);
20703 // return AloBlo + Hi;
20704 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
20705 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
20706 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
20708 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
20709 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
20710 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
20712 // Bit cast to 32-bit vectors for MULUDQ.
20713 SDValue Alo = DAG.getBitcast(MulVT, A);
20714 SDValue Blo = DAG.getBitcast(MulVT, B);
20716 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20718 // Only multiply lo/hi halves that aren't known to be zero.
20719 SDValue AloBlo = Zero;
20720 if (!ALoIsZero && !BLoIsZero)
20721 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
20723 SDValue AloBhi = Zero;
20724 if (!ALoIsZero && !BHiIsZero) {
20725 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
20726 Bhi = DAG.getBitcast(MulVT, Bhi);
20727 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
20730 SDValue AhiBlo = Zero;
20731 if (!AHiIsZero && !BLoIsZero) {
20732 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
20733 Ahi = DAG.getBitcast(MulVT, Ahi);
20734 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
20737 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
20738 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
20740 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
// Custom-lower a vector multiply-high node on i8 lanes.
//
// Each i8 lane is widened to i16, the i16 lanes are multiplied, the product is
// shifted right by 8 so that its upper byte lands in the lower byte, and the
// result is packed back to i8 lanes with X86ISD::PACKUS.
// ISD::MULHU selects zero-extension / logical shifts for the widening step; any
// other opcode reaching this function selects sign-extension / arithmetic
// shifts.
//
// Op        - node to lower; operands 0 and 1 are the two multiplicands.
// Subtarget - queried for hasInt256() and hasSSE41().
// DAG       - SelectionDAG in which the replacement nodes are built.
// Returns the lowered value of type VT. 256-bit types without hasInt256() are
// forwarded to Lower256IntArith instead.
//
// NOTE(review): `dl`, `ALo`, `BLo`, `AHi` and `BHi` are used below but their
// declarations are not visible in this listing, and several closing braces /
// else-branches appear to be missing as well; confirm against the full file.
20743 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
20744 SelectionDAG &DAG) {
20746 MVT VT = Op.getSimpleValueType();
20748 // Decompose 256-bit ops into smaller 128-bit ops.
20749 if (VT.is256BitVector() && !Subtarget.hasInt256())
20750 return Lower256IntArith(Op, DAG);
20752 // Only i8 vectors should need custom lowering after this.
20753 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
20754 "Unsupported vector type");
20756 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
20757 // logical shift down the upper half and pack back to i8.
20758 SDValue A = Op.getOperand(0);
20759 SDValue B = Op.getOperand(1);
20761 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
20762 // and then ashr/lshr the upper bits down to the lower bits before multiply.
20763 unsigned Opcode = Op.getOpcode();
// ExShift: shift opcode used by the unpack-and-shift widening path.
// ExSSE41: extension node used when the input can be extended directly.
20764 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
20765 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
20767 // AVX2 implementations - extend xmm subvectors to ymm.
20768 if (Subtarget.hasInt256()) {
// Lo/Hi start out as the element indices of the two halves of VT (used as
// EXTRACT_SUBVECTOR indices) and are later reused to hold result values.
20769 SDValue Lo = DAG.getIntPtrConstant(0, dl);
20770 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
20772 if (VT == MVT::v32i8) {
// Split both inputs into v16i8 halves and widen each half to v16i16.
20773 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
20774 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
20775 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
20776 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
20777 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
20778 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
20779 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
20780 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
// Multiply per half and move the upper byte of every i16 product down.
20781 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20782 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
20783 DAG.getConstant(8, dl, MVT::v16i16));
20784 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
20785 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
20786 DAG.getConstant(8, dl, MVT::v16i16));
20787 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
20788 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
// LoMask picks elements 0-7 of Lo followed by elements 0-7 of Hi;
// HiMask picks elements 8-15 of Lo followed by elements 8-15 of Hi.
20789 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
20790 16, 17, 18, 19, 20, 21, 22, 23};
20791 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20792 24, 25, 26, 27, 28, 29, 30, 31};
20793 return DAG.getNode(X86ISD::PACKUS, dl, VT,
20794 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
20795 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
// Remaining case under hasInt256(): the whole input is widened to v16i16 in
// one step, multiplied once, and the shifted product is split into two v8i16
// halves for PACKUS.
20798 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
20799 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
20800 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
20801 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
20802 DAG.getConstant(8, dl, MVT::v16i16));
20803 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
20804 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
20805 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
20808 assert(VT == MVT::v16i8 &&
20809 "Pre-AVX2 support only supports v16i8 multiplication");
20810 MVT ExVT = MVT::v8i16;
20812 // Extract the lo parts and zero/sign extend to i16.
20814 if (Subtarget.hasSSE41()) {
20815 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
20816 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
// Unpack path: source bytes 0-7 are placed in the odd byte positions (even
// positions undefined), the vector is reinterpreted as v8i16, and ExShift by
// 8 moves each byte down while filling the top with zero or sign bits.
20818 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20819 -1, 4, -1, 5, -1, 6, -1, 7};
20820 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20821 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20822 ALo = DAG.getBitcast(ExVT, ALo);
20823 BLo = DAG.getBitcast(ExVT, BLo);
20824 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20825 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20828 // Extract the hi parts and zero/sign extend to i16.
20830 if (Subtarget.hasSSE41()) {
// Move source bytes 8-15 into positions 0-7 so the extend node sees them.
20831 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20832 -1, -1, -1, -1, -1, -1, -1, -1};
20833 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20834 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20835 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
20836 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
// Unpack path for the upper half: same scheme as above, using bytes 8-15.
20838 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20839 -1, 12, -1, 13, -1, 14, -1, 15};
20840 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20841 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20842 AHi = DAG.getBitcast(ExVT, AHi);
20843 BHi = DAG.getBitcast(ExVT, BHi);
20844 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20845 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20848 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
20849 // pack back to v16i8.
20850 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20851 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20852 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
20853 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
20854 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
// Lower a 128-bit integer divide/remainder node on Win64 to a runtime library
// call.
//
// Every operand is stored to its own 16-byte-aligned stack temporary and the
// address of that temporary is passed as the argument. The call is built with
// a v2i64 return type, and the returned value is bitcast back to VT.
//
// Op  - SDIV/UDIV/SREM/UREM/SDIVREM/UDIVREM node with a 128-bit integer type;
//       any other opcode hits llvm_unreachable.
// DAG - SelectionDAG in which the stores and the call are built.
// Returns the call result bitcast to VT.
//
// NOTE(review): the declarations of `dl`, `LC` and `isSigned` are not visible
// in this listing; confirm against the full file.
20857 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
20858 assert(Subtarget.isTargetWin64() && "Unexpected target");
20859 EVT VT = Op.getValueType();
20860 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
20861 "Unexpected return type for lowering");
// Select the libcall and record whether the operation is signed.
20865 switch (Op->getOpcode()) {
20866 default: llvm_unreachable("Unexpected request for libcall!");
20867 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
20868 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
20869 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
20870 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
20871 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
20872 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
// The stores below are chained one after another starting from the entry node;
// the final chain becomes the chain of the call.
20876 SDValue InChain = DAG.getEntryNode();
20878 TargetLowering::ArgListTy Args;
20879 TargetLowering::ArgListEntry Entry;
20880 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
20881 EVT ArgVT = Op->getOperand(i).getValueType();
20882 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
20883 "Unexpected argument type for lowering");
// Spill the operand and pass a pointer to the spill slot instead of the value.
20884 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
20885 Entry.Node = StackPtr;
20886 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
20887 MachinePointerInfo(), /* Alignment = */ 16);
20888 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
20889 Entry.Ty = PointerType::get(ArgTy,0);
// The argument is a pointer, so no sign/zero extension is requested for it.
20890 Entry.isSExt = false;
20891 Entry.isZExt = false;
20892 Args.push_back(Entry);
20895 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
20896 getPointerTy(DAG.getDataLayout()));
// Describe the call: libcall calling convention, v2i64 return type, and the
// signedness of the result taken from the opcode.
20898 TargetLowering::CallLoweringInfo CLI(DAG);
20899 CLI.setDebugLoc(dl).setChain(InChain)
20900 .setCallee(getLibcallCallingConv(LC),
20901 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
20902 Callee, std::move(Args))
20903 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
// Only the returned value (first) is used here; the call's output chain
// (second) is not consumed by the visible code.
20905 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20906 return DAG.getBitcast(VT, CallInfo.first);
// Custom-lower a vector multiply that produces both the low and the high half
// of every lane-wise product (ISD::SMUL_LOHI is treated as signed, any other
// opcode reaching this function as unsigned).
//
// 256-bit types without hasInt256() are split into two 128-bit operations and
// concatenated. Otherwise (v4i32, or v8i32 with hasInt256()) the even and the
// odd lanes are multiplied separately with PMULUDQ/PMULDQ and the low/high
// 32-bit halves of the 64-bit products are shuffled back into lane order.
//
// Op        - node to lower; operands 0 and 1 are the multiplicands.
// Subtarget - queried for hasInt256(), hasSSE2() and hasSSE41().
// DAG       - SelectionDAG in which the replacement nodes are built.
// Returns a merge-values node {low halves, high halves}.
//
// NOTE(review): the declarations of `dl`, `Ops` (first use) and the
// PMULxDQ `Opcode`, as well as the constant passed to getConstant for ShAmt,
// are not visible in this listing; confirm against the full file.
20909 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
20910 SelectionDAG &DAG) {
20911 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
20912 MVT VT = Op0.getSimpleValueType();
20915 // Decompose 256-bit ops into smaller 128-bit ops.
20916 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20917 unsigned Opcode = Op.getOpcode();
20918 unsigned NumElems = VT.getVectorNumElements();
20919 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
20920 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
20921 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
20922 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
20923 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
// Re-issue the same two-result opcode on each half; result 0 and result 1 of
// the halves are concatenated separately below.
20924 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
20925 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
20927 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
20928 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
20930 return DAG.getMergeValues(Ops, dl);
20933 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
20934 (VT == MVT::v8i32 && Subtarget.hasInt256()));
20936 // PMULxD operations multiply each even value (starting at 0) of LHS with
20937 // the related value of RHS and produce a widened result.
20938 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20939 // => <2 x i64> <ae|cg>
20941 // In other words, to have all the results, we need to perform two PMULxD:
20942 // 1. one with the even values.
20943 // 2. one with the odd values.
20944 // To achieve #2, we need to place the odd values at an even position.
20946 // Place the odd value at an even position (basically, shift all values 1
20947 // step to the left):
// The mask is written for 8 lanes; only its first getVectorNumElements()
// entries are used, so the same array serves v4i32 and v8i32.
20948 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
20949 // <a|b|c|d> => <b|undef|d|undef>
20950 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
20951 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20952 // <e|f|g|h> => <f|undef|h|undef>
20953 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
20954 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
20956 // Emit two multiplies, one for the even lanes and one for the odd lanes.
20958 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
20959 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
// PMULDQ is chosen only for a signed multiply with SSE4.1; every other
// combination uses PMULUDQ (a signed multiply without SSE4.1 is fixed up below).
20961 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
20962 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
20963 // => <2 x i64> <ae|cg>
20964 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
20965 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
20966 // => <2 x i64> <bf|dh>
20967 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
20969 // Shuffle it back into the right order.
// After the bitcast, odd 32-bit lanes of Mul1/Mul2 feed Highs and even lanes
// feed Lows; the masks interleave Mul1 (even source lanes) with Mul2 (odd
// source lanes).
20970 SDValue Highs, Lows;
20971 if (VT == MVT::v8i32) {
20972 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
20973 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20974 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
20975 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20977 const int HighMask[] = {1, 5, 3, 7};
20978 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
20979 const int LowMask[] = {0, 4, 2, 6};
20980 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
20983 // If we have a signed multiply but no PMULDQ fix up the high parts of an
20984 // unsigned multiply.
20985 if (IsSigned && !Subtarget.hasSSE41()) {
// NOTE(review): presumably ShAmt is the sign-bit position (31), so that the
// SRA below yields an all-ones/all-zeros lane mask — the constant itself is
// not visible here; confirm.
20986 SDValue ShAmt = DAG.getConstant(
20988 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
// T1 = (Op0 >>s ShAmt) & Op1 and T2 = (Op1 >>s ShAmt) & Op0; their sum is
// subtracted from the unsigned high halves.
20989 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
20990 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
20991 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
20992 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
20994 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
20995 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
20998 // The first result of MUL_LOHI is actually the low value, followed by the
// high value.
21000 SDValue Ops[] = {Lows, Highs};
21001 return DAG.getMergeValues(Ops, dl);
21004 // Return true if the required (according to Opcode) shift-imm form is natively
21005 // supported by the Subtarget
//
// VT        - vector type being shifted.
// Subtarget - queried for hasBWI(), hasInt256() and hasVLX().
// Opcode    - compared against ISD::SRA to choose between the arithmetic and
//             the logical answer.
//
// NOTE(review): the line declaring the `Opcode` parameter and the statements
// guarded by the first two `if`s are not visible in this listing; confirm
// against the full file.
21006 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
// Element types narrower than 16 bits are tested first.
21008 if (VT.getScalarSizeInBits() < 16)
// 512-bit vectors: elements wider than 16 bits, or 16-bit elements with BWI.
21011 if (VT.is512BitVector() &&
21012 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
// Logical shifts: any 128-bit vector, or a 256-bit vector with hasInt256().
21015 bool LShift = VT.is128BitVector() ||
21016 (VT.is256BitVector() && Subtarget.hasInt256());
// Arithmetic shifts: the same set, except that v2i64/v4i64 additionally
// require VLX.
21018 bool AShift = LShift && (Subtarget.hasVLX() ||
21019 (VT != MVT::v2i64 && VT != MVT::v4i64));
21020 return (Opcode == ISD::SRA) ? AShift : LShift;
21023 // The shift amount is a variable, but it is the same for all vector lanes.
21024 // These instructions are defined together with shift-immediate.
//
// Returns exactly what SupportedVectorShiftWithImm returns for the same
// VT / Subtarget / Opcode.
// NOTE(review): the line declaring the `Opcode` parameter is not visible in
// this listing; confirm against the full file.
21026 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21028 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21031 // Return true if the required (according to Opcode) variable-shift form is
21032 // natively supported by the Subtarget
//
// VT        - vector type being shifted.
// Subtarget - queried for hasInt256(), hasBWI() and hasVLX().
// Opcode    - compared against ISD::SRA to choose between the arithmetic and
//             the logical answer.
//
// NOTE(review): the line declaring the `Opcode` parameter and the statements
// guarded by the three `if`s are not visible in this listing; confirm against
// the full file.
21033 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
// Tested first: no hasInt256(), or element type narrower than 16 bits.
21036 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21039 // vXi16 supported only on AVX-512, BWI
21040 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
// 512-bit vectors and VLX targets are tested before the per-width rules below.
21043 if (VT.is512BitVector() || Subtarget.hasVLX())
// Logical shifts: 128-bit and 256-bit vectors.
// Arithmetic shifts: the same set minus v2i64 and v4i64.
21046 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21047 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21048 return (Opcode == ISD::SRA) ? AShift : LShift;
// Lower a vector shift whose amount is a constant splat.
//
// When the shift-by-immediate form is supported for VT it is emitted directly.
// Otherwise the visible code provides emulations for:
//   * i64 arithmetic right shifts, built from 32-bit shifts and a shuffle;
//   * i8 vector shifts, built from a shift on i16 lanes plus a mask (or a
//     compare / add for two special amounts);
//   * 32-bit mode, where the i64 splat amount shows up as a bitcast
//     build_vector of narrower constants and has to be reassembled first.
//
// Op        - SHL/SRL/SRA node; operand 0 is the value, operand 1 the amount.
// DAG       - SelectionDAG in which the replacement nodes are built.
// Subtarget - queried for the available vector extensions.
//
// NOTE(review): the declaration of `dl`, several closing braces / else
// branches, a few statement fragments and the fall-through returns are not
// visible in this listing; confirm against the full file.
21051 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21052 const X86Subtarget &Subtarget) {
21053 MVT VT = Op.getSimpleValueType();
21055 SDValue R = Op.getOperand(0);
21056 SDValue Amt = Op.getOperand(1);
// Shift-by-immediate node matching the generic opcode (anything that is not
// SHL or SRL maps to VSRAI).
21058 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21059 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
// Emulates an arithmetic right shift of R (v2i64/v4i64) by ShiftAmt using
// 32-bit lanes: R is reinterpreted as twice as many i32 lanes, the upper and
// lower i32 of each i64 are computed separately and recombined by a shuffle.
21061 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21062 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21063 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21064 SDValue Ex = DAG.getBitcast(ExVT, R);
21066 if (ShiftAmt >= 32) {
21067 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21069 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21070 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21071 ShiftAmt - 32, DAG);
// Result lane 2k takes Lower's lane 2k+1; result lane 2k+1 takes Upper's
// lane 2k+1.
21072 if (VT == MVT::v2i64)
21073 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21074 if (VT == MVT::v4i64)
21075 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21076 {9, 1, 11, 3, 13, 5, 15, 7});
21078 // SRA upper i32, SRL whole i64 and select lower i32.
21079 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21082 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21083 Lower = DAG.getBitcast(ExVT, Lower);
// Result lane 2k takes Lower's lane 2k; result lane 2k+1 takes Upper's
// lane 2k+1.
21084 if (VT == MVT::v2i64)
21085 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21086 if (VT == MVT::v4i64)
21087 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21088 {8, 1, 10, 3, 12, 5, 14, 7});
21090 return DAG.getBitcast(VT, Ex);
21093 // Optimize shl/srl/sra with constant shift amount.
21094 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21095 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21096 uint64_t ShiftAmt = ShiftConst->getZExtValue();
// Preferred path: the target has a native shift-by-immediate for this type.
21098 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21099 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21101 // i64 SRA needs to be performed as partial shifts.
21102 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21103 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21104 return ArithmeticShiftRight64(ShiftAmt);
// i8 element vectors: handled below by shifting i16 lanes (ShiftVT has half
// as many elements) and masking off the bits that crossed a byte boundary.
21106 if (VT == MVT::v16i8 ||
21107 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21108 VT == MVT::v64i8) {
21109 unsigned NumElts = VT.getVectorNumElements();
21110 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21112 // Simple i8 add case
21113 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21114 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21116 // ashr(R, 7) === cmp_slt(R, 0)
21117 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21118 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
// 512-bit vectors compare into a v64i1 mask that is then sign-extended back
// to VT; narrower vectors use PCMPGT, which already produces VT.
21119 if (VT.is512BitVector()) {
21120 assert(VT == MVT::v64i8 && "Unexpected element type!");
21121 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21122 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21124 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21127 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21128 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21131 if (Op.getOpcode() == ISD::SHL) {
21132 // Make a large shift.
21133 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21135 SHL = DAG.getBitcast(VT, SHL);
21136 // Zero out the rightmost bits.
// The mask keeps the upper (8 - ShiftAmt) bits of every byte.
21137 return DAG.getNode(ISD::AND, dl, VT, SHL,
21138 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21140 if (Op.getOpcode() == ISD::SRL) {
21141 // Make a large shift.
21142 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21144 SRL = DAG.getBitcast(VT, SRL);
21145 // Zero out the leftmost bits.
// The mask keeps the lower (8 - ShiftAmt) bits of every byte.
21146 return DAG.getNode(ISD::AND, dl, VT, SRL,
21147 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21149 if (Op.getOpcode() == ISD::SRA) {
21150 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21151 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
// Mask is the position the i8 sign bit (128) occupies after the logical shift.
21153 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21154 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21155 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21158 llvm_unreachable("Unknown shift opcode.");
21163 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21164 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21165 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21166 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21168 // Peek through any splat that was introduced for i64 shift vectorization.
21169 int SplatIndex = -1;
21170 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21171 if (SVN->isSplat()) {
21172 SplatIndex = SVN->getSplatIndex();
21173 Amt = Amt.getOperand(0);
21174 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21175 "Splat shuffle referencing second operand");
// Only a bitcast of a BUILD_VECTOR is recognised as the expanded amount.
21178 if (Amt.getOpcode() != ISD::BITCAST ||
21179 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21182 Amt = Amt.getOperand(0);
// Ratio = number of narrow build_vector operands that make up one i64 lane.
// 1 << (6 - RatioInLog2) is the bit width of each piece when Ratio is a power
// of two, so piece i is shifted into place by i times that width.
21183 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21184 VT.getVectorNumElements();
21185 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21186 uint64_t ShiftAmt = 0;
// With a splat, reassemble the lane the splat refers to; otherwise lane 0.
21187 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21188 for (unsigned i = 0; i != Ratio; ++i) {
21189 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21193 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21196 // Check remaining shift amounts (if not a splat).
21197 if (SplatIndex < 0) {
21198 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21199 uint64_t ShAmt = 0;
21200 for (unsigned j = 0; j != Ratio; ++j) {
21201 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21205 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
// Every reassembled lane is compared with the amount of the first lane.
21207 if (ShAmt != ShiftAmt)
// With a single reassembled amount, reuse the same two strategies as above.
21212 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21213 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21215 if (Op.getOpcode() == ISD::SRA)
21216 return ArithmeticShiftRight64(ShiftAmt);
// Lower a vector shift whose amount is the same (not necessarily constant)
// value in every lane.
//
// The code tries to recover that single scalar amount from the amount vector
// (a splat BUILD_VECTOR, or a splat shuffle of a BUILD_VECTOR /
// INSERT_VECTOR_ELT / arbitrary vector) and, when found, emits one target
// shift node taking the scalar amount. A second path handles 32-bit mode,
// where a v2i64 amount appears as a bitcast BUILD_VECTOR of narrower pieces.
//
// Op        - SHL/SRL/SRA node; operand 0 is the value, operand 1 the amount.
// DAG       - SelectionDAG in which the replacement nodes are built.
// Subtarget - queried through SupportedVectorShiftWithBaseAmnt and is64Bit().
//
// NOTE(review): the declarations of `dl` and `BaseShAmt`, several closing
// braces / else branches and the fall-through returns are not visible in this
// listing; confirm against the full file.
21222 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21223 const X86Subtarget &Subtarget) {
21224 MVT VT = Op.getSimpleValueType();
21226 SDValue R = Op.getOperand(0);
21227 SDValue Amt = Op.getOperand(1);
// X86OpcI: shift-by-immediate node family, passed to getTargetVShiftNode.
// X86OpcV: shift node taking the amount as a vector operand (32-bit path).
21229 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21230 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21232 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21233 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21235 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21237 MVT EltVT = VT.getVectorElementType();
21239 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21240 // Check if this build_vector node is doing a splat.
21241 // If so, then set BaseShAmt equal to the splat value.
21242 BaseShAmt = BV->getSplatValue();
// An undef splat value is discarded rather than used as the amount.
21243 if (BaseShAmt && BaseShAmt.isUndef())
21244 BaseShAmt = SDValue();
// Look through an EXTRACT_SUBVECTOR to the vector it extracts from.
21246 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21247 Amt = Amt.getOperand(0);
21249 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21250 if (SVN && SVN->isSplat()) {
21251 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21252 SDValue InVec = Amt.getOperand(0);
// Splat of a BUILD_VECTOR: take the splatted operand directly.
21253 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21254 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21255 "Unexpected shuffle index found!");
21256 BaseShAmt = InVec.getOperand(SplatIdx);
// Splat of an INSERT_VECTOR_ELT: usable only when the constant insertion
// index is the splatted lane, in which case the inserted scalar is the amount.
21257 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21258 if (ConstantSDNode *C =
21259 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21260 if (C->getZExtValue() == SplatIdx)
21261 BaseShAmt = InVec.getOperand(1);
21266 // Avoid introducing an extract element from a shuffle.
21267 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21268 DAG.getIntPtrConstant(SplatIdx, dl));
21272 if (BaseShAmt.getNode()) {
21273 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
// Element types wider than i32 (but not i64) are zero-extended to i64;
// element types narrower than i32 are zero-extended to i32.
21274 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21275 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21276 else if (EltVT.bitsLT(MVT::i32))
21277 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21279 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, DAG);
21283 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21284 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21285 Amt.getOpcode() == ISD::BITCAST &&
21286 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21287 Amt = Amt.getOperand(0);
// Ratio = number of narrow build_vector operands per i64 lane. Vals holds the
// operands of the first lane; each later group of Ratio operands is compared
// against it.
21288 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21289 VT.getVectorNumElements();
21290 std::vector<SDValue> Vals(Ratio);
21291 for (unsigned i = 0; i != Ratio; ++i)
21292 Vals[i] = Amt.getOperand(i);
21293 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21294 for (unsigned j = 0; j != Ratio; ++j)
21295 if (Vals[j] != Amt.getOperand(i + j))
// The original (un-peeled) amount operand is passed to the vector-amount node.
21299 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21300 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21305 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21306 SelectionDAG &DAG) {
21307 MVT VT = Op.getSimpleValueType();
21309 SDValue R = Op.getOperand(0);
21310 SDValue Amt = Op.getOperand(1);
21311 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21313 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21314 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21316 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21319 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
21322 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
21325 // XOP has 128-bit variable logical/arithmetic shifts.
21326 // +ve/-ve Amt = shift left/right.
21327 if (Subtarget.hasXOP() &&
21328 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
21329 VT == MVT::v8i16 || VT == MVT::v16i8)) {
21330 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
21331 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21332 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
21334 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
21335 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
21336 if (Op.getOpcode() == ISD::SRA)
21337 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
21340 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
21341 // shifts per-lane and then shuffle the partial results back together.
21342 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
21343 // Splat the shift amounts so the scalar shifts above will catch it.
21344 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
21345 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
21346 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
21347 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
21348 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
21351 // i64 vector arithmetic shift can be emulated with the transform:
21352 // M = lshr(SIGN_BIT, Amt)
21353 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
21354 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
21355 Op.getOpcode() == ISD::SRA) {
21356 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
21357 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
21358 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21359 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
21360 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
21364 // If possible, lower this packed shift into a vector multiply instead of
21365 // expanding it into a sequence of scalar shifts.
21366 // Do this only if the vector shift count is a constant build_vector.
21367 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
21368 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
21369 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
21370 SmallVector<SDValue, 8> Elts;
21371 MVT SVT = VT.getVectorElementType();
21372 unsigned SVTBits = SVT.getSizeInBits();
21373 APInt One(SVTBits, 1);
21374 unsigned NumElems = VT.getVectorNumElements();
21376 for (unsigned i=0; i !=NumElems; ++i) {
21377 SDValue Op = Amt->getOperand(i);
21378 if (Op->isUndef()) {
21379 Elts.push_back(Op);
21383 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
21384 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
21385 uint64_t ShAmt = C.getZExtValue();
21386 if (ShAmt >= SVTBits) {
21387 Elts.push_back(DAG.getUNDEF(SVT));
21390 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
21392 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
21393 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
21396 // Lower SHL with variable shift amount.
21397 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
21398 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
21400 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
21401 DAG.getConstant(0x3f800000U, dl, VT));
21402 Op = DAG.getBitcast(MVT::v4f32, Op);
21403 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
21404 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
21407 // If possible, lower this shift as a sequence of two shifts by
21408 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
21410 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
21412 // Could be rewritten as:
21413 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
21415 // The advantage is that the two shifts from the example would be
21416 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
21417 // the vector shift into four scalar shifts plus four pairs of vector
21419 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
21420 unsigned TargetOpcode = X86ISD::MOVSS;
21421 bool CanBeSimplified;
21422 // The splat value for the first packed shift (the 'X' from the example).
21423 SDValue Amt1 = Amt->getOperand(0);
21424 // The splat value for the second packed shift (the 'Y' from the example).
21425 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
21427 // See if it is possible to replace this node with a sequence of
21428 // two shifts followed by a MOVSS/MOVSD/PBLEND.
21429 if (VT == MVT::v4i32) {
21430 // Check if it is legal to use a MOVSS.
21431 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
21432 Amt2 == Amt->getOperand(3);
21433 if (!CanBeSimplified) {
21434 // Otherwise, check if we can still simplify this node using a MOVSD.
21435 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
21436 Amt->getOperand(2) == Amt->getOperand(3);
21437 TargetOpcode = X86ISD::MOVSD;
21438 Amt2 = Amt->getOperand(2);
21441 // Do similar checks for the case where the machine value type
21443 CanBeSimplified = Amt1 == Amt->getOperand(1);
21444 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
21445 CanBeSimplified = Amt2 == Amt->getOperand(i);
21447 if (!CanBeSimplified) {
21448 TargetOpcode = X86ISD::MOVSD;
21449 CanBeSimplified = true;
21450 Amt2 = Amt->getOperand(4);
21451 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
21452 CanBeSimplified = Amt1 == Amt->getOperand(i);
21453 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
21454 CanBeSimplified = Amt2 == Amt->getOperand(j);
21458 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
21459 isa<ConstantSDNode>(Amt2)) {
21460 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
21461 MVT CastVT = MVT::v4i32;
21463 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
21464 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
21466 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
21467 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
21468 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
21469 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
21470 if (TargetOpcode == X86ISD::MOVSD)
21471 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21472 BitCast2, {0, 1, 6, 7}));
21473 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21474 BitCast2, {0, 5, 6, 7}));
21478 // v4i32 Non Uniform Shifts.
21479 // If the shift amount is constant we can shift each lane using the SSE2
21480 // immediate shifts, else we need to zero-extend each lane to the lower i64
21481 // and shift using the SSE2 variable shifts.
21482 // The separate results can then be blended together.
21483 if (VT == MVT::v4i32) {
21484 unsigned Opc = Op.getOpcode();
21485 SDValue Amt0, Amt1, Amt2, Amt3;
21487 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
21488 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
21489 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
21490 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
21492 // ISD::SHL is handled above but we include it here for completeness.
21495 llvm_unreachable("Unknown target vector shift node");
21497 Opc = X86ISD::VSHL;
21500 Opc = X86ISD::VSRL;
21503 Opc = X86ISD::VSRA;
21506 // The SSE2 shifts use the lower i64 as the same shift amount for
21507 // all lanes and the upper i64 is ignored. These shuffle masks
21508 // optimally zero-extend each lanes on SSE2/SSE41/AVX targets.
21509 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21510 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
21511 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
21512 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
21513 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
21516 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
21517 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
21518 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
21519 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
21520 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
21521 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
21522 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
21525 if (VT == MVT::v16i8 ||
21526 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP())) {
21527 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
21528 unsigned ShiftOpcode = Op->getOpcode();
21530 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
21531 // On SSE41 targets we make use of the fact that VSELECT lowers
21532 // to PBLENDVB which selects bytes based just on the sign bit.
21533 if (Subtarget.hasSSE41()) {
21534 V0 = DAG.getBitcast(VT, V0);
21535 V1 = DAG.getBitcast(VT, V1);
21536 Sel = DAG.getBitcast(VT, Sel);
21537 return DAG.getBitcast(SelVT,
21538 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
21540 // On pre-SSE41 targets we test for the sign bit by comparing to
21541 // zero - a negative value will set all bits of the lanes to true
21542 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
21543 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
21544 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
21545 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
21548 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
21549 // We can safely do this using i16 shifts as we're only interested in
21550 // the 3 lower bits of each byte.
21551 Amt = DAG.getBitcast(ExtVT, Amt);
21552 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
21553 Amt = DAG.getBitcast(VT, Amt);
21555 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
21556 // r = VSELECT(r, shift(r, 4), a);
21558 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21559 R = SignBitSelect(VT, Amt, M, R);
21562 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21564 // r = VSELECT(r, shift(r, 2), a);
21565 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21566 R = SignBitSelect(VT, Amt, M, R);
21569 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21571 // return VSELECT(r, shift(r, 1), a);
21572 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21573 R = SignBitSelect(VT, Amt, M, R);
21577 if (Op->getOpcode() == ISD::SRA) {
21578 // For SRA we need to unpack each byte to the higher byte of a i16 vector
21579 // so we can correctly sign extend. We don't care what happens to the
21581 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
21582 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
21583 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
21584 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
21585 ALo = DAG.getBitcast(ExtVT, ALo);
21586 AHi = DAG.getBitcast(ExtVT, AHi);
21587 RLo = DAG.getBitcast(ExtVT, RLo);
21588 RHi = DAG.getBitcast(ExtVT, RHi);
21590 // r = VSELECT(r, shift(r, 4), a);
21591 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21592 DAG.getConstant(4, dl, ExtVT));
21593 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21594 DAG.getConstant(4, dl, ExtVT));
21595 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21596 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21599 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21600 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21602 // r = VSELECT(r, shift(r, 2), a);
21603 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21604 DAG.getConstant(2, dl, ExtVT));
21605 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21606 DAG.getConstant(2, dl, ExtVT));
21607 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21608 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21611 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21612 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21614 // r = VSELECT(r, shift(r, 1), a);
21615 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21616 DAG.getConstant(1, dl, ExtVT));
21617 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21618 DAG.getConstant(1, dl, ExtVT));
21619 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21620 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21622 // Logical shift the result back to the lower byte, leaving a zero upper
21624 // meaning that we can safely pack with PACKUSWB.
21626 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
21628 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
21629 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21633 // It's worth extending once and using the v8i32 shifts for 16-bit types, but
21634 // the extra overheads to get from v16i8 to v8i32 make the existing SSE
21635 // solution better.
21636 if (Subtarget.hasInt256() && VT == MVT::v8i16) {
21637 MVT ExtVT = MVT::v8i32;
21639 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21640 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
21641 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
21642 return DAG.getNode(ISD::TRUNCATE, dl, VT,
21643 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
21646 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
21647 MVT ExtVT = MVT::v8i32;
21648 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21649 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
21650 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
21651 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
21652 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
21653 ALo = DAG.getBitcast(ExtVT, ALo);
21654 AHi = DAG.getBitcast(ExtVT, AHi);
21655 RLo = DAG.getBitcast(ExtVT, RLo);
21656 RHi = DAG.getBitcast(ExtVT, RHi);
21657 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
21658 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
21659 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
21660 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
21661 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21664 if (VT == MVT::v8i16) {
21665 unsigned ShiftOpcode = Op->getOpcode();
21667 // If we have a constant shift amount, the non-SSE41 path is best as
21668 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
21669 bool UseSSE41 = Subtarget.hasSSE41() &&
21670 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21672 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
21673 // On SSE41 targets we make use of the fact that VSELECT lowers
21674 // to PBLENDVB which selects bytes based just on the sign bit.
21676 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
21677 V0 = DAG.getBitcast(ExtVT, V0);
21678 V1 = DAG.getBitcast(ExtVT, V1);
21679 Sel = DAG.getBitcast(ExtVT, Sel);
21680 return DAG.getBitcast(
21681 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
21683 // On pre-SSE41 targets we splat the sign bit - a negative value will
21684 // set all bits of the lanes to true and VSELECT uses that in
21685 // its OR(AND(V0,C),AND(V1,~C)) lowering.
21687 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
21688 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
21691 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
21693 // On SSE41 targets we need to replicate the shift mask in both
21694 // bytes for PBLENDVB.
21697 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
21698 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
21700 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
21703 // r = VSELECT(r, shift(r, 8), a);
21704 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
21705 R = SignBitSelect(Amt, M, R);
21708 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21710 // r = VSELECT(r, shift(r, 4), a);
21711 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21712 R = SignBitSelect(Amt, M, R);
21715 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21717 // r = VSELECT(r, shift(r, 2), a);
21718 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21719 R = SignBitSelect(Amt, M, R);
21722 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21724 // return VSELECT(r, shift(r, 1), a);
21725 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21726 R = SignBitSelect(Amt, M, R);
21730 // Decompose 256-bit shifts into smaller 128-bit shifts.
21731 if (VT.is256BitVector())
21732 return Lower256IntArith(Op, DAG);
/// Custom-lower a vector ISD::ROTL on XOP targets.
///
/// 256-bit vectors are handed to Lower256IntArith. For 128-bit vectors a
/// build-vector amount that is a constant splat becomes X86ISD::VPROTI with an
/// i8 immediate; every other amount becomes X86ISD::VPROT, which receives the
/// per-element amount vector unchanged.
21737 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
21738 SelectionDAG &DAG) {
21739 MVT VT = Op.getSimpleValueType();
// NOTE(review): `DL` is used below but its declaration is not among the
// visible lines (embedded line 21740 is missing) - presumably an SDLoc for Op.
// R is the vector being rotated; Amt holds the rotate amounts.
21741 SDValue R = Op.getOperand(0);
21742 SDValue Amt = Op.getOperand(1);
// Preconditions: vector type, XOP available, and the node is a left rotate.
21744 assert(VT.isVector() && "Custom lowering only for vector rotates!");
21745 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
21746 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
21748 // XOP has 128-bit vector variable + immediate rotates.
21749 // +ve/-ve Amt = rotate left/right.
21751 // Split 256-bit integers.
21752 if (VT.is256BitVector())
21753 return Lower256IntArith(Op, DAG);
21755 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
21757 // Attempt to rotate by immediate.
// Only taken when Amt is a BUILD_VECTOR whose elements are one splatted
// constant; that constant becomes the i8 immediate of VPROTI.
21758 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21759 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
21760 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
21761 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
21762 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
21763 DAG.getConstant(RotateAmt, DL, MVT::i8));
21767 // Use general rotate by variable (per-element).
21768 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
/// Lower an add/sub/mul-with-overflow node into an X86 arithmetic node that
/// also produces EFLAGS, plus a SETCC on the condition code chosen for the
/// operation; both are merged back into the node's original result list.
21771 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
21772 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
21773 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
21774 // looks for this combo and may remove the "setcc" instruction if the "setcc"
21775 // has only one use.
21776 SDNode *N = Op.getNode();
21777 SDValue LHS = N->getOperand(0);
21778 SDValue RHS = N->getOperand(1);
// BaseOp is the X86ISD opcode to emit; Cond is the flag condition that
// signals overflow for it (COND_O or COND_B in the arms below).
21779 unsigned BaseOp = 0;
21780 X86::CondCode Cond;
// NOTE(review): `DL` and most `case` labels / `break`s of this switch are not
// among the visible lines; the arm descriptions below are inferred from the
// existing comments and the opcodes assigned - confirm against the full file.
21782 switch (Op.getOpcode()) {
21783 default: llvm_unreachable("Unknown ovf instruction!");
// Presumably the signed-add arm: INC when RHS is the constant one, else ADD,
// both checked through the overflow flag.
21785 // An add of one will be selected as an INC. Note that INC doesn't
21786 // set CF, so we can't do this for UADDO.
21787 if (isOneConstant(RHS)) {
21788 BaseOp = X86ISD::INC;
21789 Cond = X86::COND_O;
21792 BaseOp = X86ISD::ADD;
21793 Cond = X86::COND_O;
// ADD checked with COND_B (the carry/borrow condition): presumably the
// unsigned-add arm.
21796 BaseOp = X86ISD::ADD;
21797 Cond = X86::COND_B;
21800 // A subtract of one will be selected as a DEC. Note that DEC doesn't
21801 // set CF, so we can't do this for USUBO.
21802 if (isOneConstant(RHS)) {
21803 BaseOp = X86ISD::DEC;
21804 Cond = X86::COND_O;
21807 BaseOp = X86ISD::SUB;
21808 Cond = X86::COND_O;
// SUB checked with COND_B: presumably the unsigned-subtract arm.
21811 BaseOp = X86ISD::SUB;
21812 Cond = X86::COND_B;
// Multiply checked with COND_O; i8 results use the dedicated SMUL8 opcode.
21815 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
21816 Cond = X86::COND_O;
21818 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
// i8 uses UMUL8 and joins the common path; wider types build an X86ISD::UMUL
// here whose flag result is value #2, tested with COND_O, and return early.
21819 if (N->getValueType(0) == MVT::i8) {
21820 BaseOp = X86ISD::UMUL8;
21821 Cond = X86::COND_O;
21824 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
21826 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
21828 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
// Narrow the SETCC result when the node's overflow result type is i1.
21830 if (N->getValueType(1) == MVT::i1)
21831 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21833 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
// Common path: the node yields the arithmetic result (value #0) and an i32
// flags value (value #1) that feeds the SETCC.
21837 // Also sets EFLAGS.
21838 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
21839 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
21841 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
// Narrow the SETCC result when the node's overflow result type is i1.
21843 if (N->getValueType(1) == MVT::i1)
21844 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
21846 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
21849 /// Returns true if the operand type is exactly twice the native width, and
21850 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
21851 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
21852 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
///
/// \param MemType type of the memory operand; only its primitive size in bits
/// is inspected.
21853 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
21854 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
// NOTE(review): the condition guarding the next return is not among the
// visible lines; from the `else if (OpWidth == 128)` that follows it is
// presumably the 64-bit width check. Non-64-bit targets answer true for it.
21857 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
// 128-bit operands qualify only when the subtarget reports cmpxchg16b.
21858 else if (OpWidth == 128)
21859 return Subtarget.hasCmpxchg16b();
/// Decide whether an atomic store should be expanded at the IR level: yes
/// exactly when needsCmpXchgNb accepts the type of the stored value.
21864 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
21865 return needsCmpXchgNb(SI->getValueOperand()->getType());
/// Choose the IR-level expansion for an atomic load: CmpXChg when
/// needsCmpXchgNb accepts the loaded (pointee) type, None otherwise.
21868 // Note: this turns large loads into lock cmpxchg8b/16b.
21869 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
21870 TargetLowering::AtomicExpansionKind
21871 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
// The loaded type is taken from the pointer operand's element type.
21872 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
21873 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
21874 : AtomicExpansionKind::None;
/// Choose the IR-level expansion for an atomicrmw instruction.
///
/// Operands wider than the native width (64 bits on 64-bit subtargets, 32
/// otherwise) use CmpXChg only when needsCmpXchgNb accepts the type. For
/// native-width operands the answer depends on the operation: xchg/add/sub
/// are never expanded, or/and/xor are expanded only when the result is used,
/// and nand/max/min/umax/umin are always expanded to a cmpxchg loop.
21877 TargetLowering::AtomicExpansionKind
21878 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
21879 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
21880 Type *MemType = AI->getType();
21882 // If the operand is too big, we must see if cmpxchg8/16b is available
21883 // and default to library calls otherwise.
21884 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
21885 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
21886 : AtomicExpansionKind::None;
21889 AtomicRMWInst::BinOp Op = AI->getOperation();
// NOTE(review): the `switch` header and `default:` label are not among the
// visible lines; the unreachable below presumably sits under `default:`.
21892 llvm_unreachable("Unknown atomic operation");
21893 case AtomicRMWInst::Xchg:
21894 case AtomicRMWInst::Add:
21895 case AtomicRMWInst::Sub:
21896 // It's better to use xadd, xsub or xchg for these in all cases.
21897 return AtomicExpansionKind::None;
21898 case AtomicRMWInst::Or:
21899 case AtomicRMWInst::And:
21900 case AtomicRMWInst::Xor:
21901 // If the atomicrmw's result isn't actually used, we can just add a "lock"
21902 // prefix to a normal instruction for these operations.
// A used result therefore forces the cmpxchg expansion.
21903 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
21904 : AtomicExpansionKind::None;
21905 case AtomicRMWInst::Nand:
21906 case AtomicRMWInst::Max:
21907 case AtomicRMWInst::Min:
21908 case AtomicRMWInst::UMax:
21909 case AtomicRMWInst::UMin:
21910 // These always require a non-trivial set of data operations on x86. We must
21911 // use a cmpxchg loop.
21912 return AtomicExpansionKind::CmpXChg;
/// Replace an idempotent atomicrmw by an mfence followed by an atomic load of
/// the same address, then erase the atomicrmw.
///
/// The transformation is skipped for operands wider than the native width,
/// for single-thread synchronization scope, and on subtargets without mfence.
/// NOTE(review): the return-type line and the return statements of this
/// function are not among the visible lines; the guards below presumably
/// return without transforming, and the success path presumably returns the
/// new load - confirm against the full file.
21917 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
21918 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
21919 Type *MemType = AI->getType();
21920 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
21921 // there is no benefit in turning such RMWs into loads, and it is actually
21922 // harmful as it introduces a mfence.
21923 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
// New instructions are inserted at the position of the atomicrmw itself.
21926 auto Builder = IRBuilder<>(AI);
21927 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21928 auto SynchScope = AI->getSynchScope();
21929 // We must restrict the ordering to avoid generating loads with Release or
21930 // AcquireRelease orderings.
21931 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
21932 auto Ptr = AI->getPointerOperand();
21934 // Before the load we need a fence. Here is an example lifted from
21935 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
// is needed (the two pairs of statements below run on different threads):
21938 // x.store(1, relaxed);
21939 // r1 = y.fetch_add(0, release);
21941 // y.fetch_add(42, acquire);
21942 // r2 = x.load(relaxed);
21943 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
21944 // lowered to just a load without a fence. A mfence flushes the store buffer,
21945 // making the optimization clearly correct.
21946 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
21947 // otherwise, we might be able to be more aggressive on relaxed idempotent
21948 // rmw. In practice, they do not look useful, so we don't try to be
21949 // especially clever.
21950 if (SynchScope == SingleThread)
21951 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
21952 // the IR level, so we must wrap it in an intrinsic.
21955 if (!Subtarget.hasMFence())
21956 // FIXME: it might make sense to use a locked operation here but on a
21957 // different cache-line to prevent cache-line bouncing. In practice it
21958 // is probably a small win, and x86 processors without mfence are rare
21959 // enough that we do not bother.
// Emit the fence as a call to the x86_sse2_mfence intrinsic.
21963 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
21964 Builder.CreateCall(MFence, {});
21966 // Finally we can emit the atomic load.
// CreateAlignedLoad expects the alignment in bytes, so convert the bit width
// of the accessed type to bytes; passing the bit width directly would claim
// an alignment eight times larger than the natural one.
21967 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
21968 AI->getType()->getPrimitiveSizeInBits() / 8);
21969 Loaded->setAtomic(Order, SynchScope);
// Every user of the atomicrmw now reads the loaded value instead.
21970 AI->replaceAllUsesWith(Loaded);
21971 AI->eraseFromParent();
/// Lower an atomic fence node.
///
/// Operand 0 is the chain, operand 1 the ordering and operand 2 the
/// synchronization scope (both constants). A sequentially-consistent
/// cross-thread fence becomes X86ISD::MFENCE when the subtarget has mfence,
/// and otherwise an X86::OR32mrLocked machine node addressing (%esp); every
/// other fence becomes X86ISD::MEMBARRIER.
21975 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
21976 SelectionDAG &DAG) {
// NOTE(review): `dl` is used below but its declaration is not among the
// visible lines - presumably an SDLoc for Op.
21978 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
21979 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
21980 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
21981 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
21983 // The only fence that needs an instruction is a sequentially-consistent
21984 // cross-thread fence.
21985 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
21986 FenceScope == CrossThread) {
21987 if (Subtarget.hasMFence())
21988 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
// No mfence: fall back to a locked OR on the memory at (%esp). The visible
// operand list supplies the address (base ESP, scale 1, no index, zero
// displacement, no segment); the remaining operands of `Ops` are not among
// the visible lines.
21990 SDValue Chain = Op.getOperand(0);
21991 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
21993 DAG.getRegister(X86::ESP, MVT::i32), // Base
21994 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
21995 DAG.getRegister(0, MVT::i32), // Index
21996 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
21997 DAG.getRegister(0, MVT::i32), // Segment.
22001 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22002 return SDValue(Res, 0);
22005 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22006 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
/// Lower an atomic compare-and-swap node to X86ISD::LCMPXCHG_DAG.
///
/// Operand 2 of the node is copied into the accumulator register that matches
/// the value type (AL/AX/EAX/RAX), the LCMPXCHG_DAG memory intrinsic is built
/// with the access size in bytes, and afterwards the accumulator and EFLAGS
/// are copied back out. The node's three results are replaced by the value
/// read from the accumulator, a SETCC on COND_E, and the chain that follows
/// the EFLAGS copy.
22009 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22010 SelectionDAG &DAG) {
22011 MVT T = Op.getSimpleValueType();
// NOTE(review): the declarations of `DL`, `Reg` and `size` are not among the
// visible lines. `Reg` receives the accumulator register for type T and
// `size` the operand width in bytes.
22015 switch(T.SimpleTy) {
22016 default: llvm_unreachable("Invalid value type!");
22017 case MVT::i8: Reg = X86::AL; size = 1; break;
22018 case MVT::i16: Reg = X86::AX; size = 2; break;
22019 case MVT::i32: Reg = X86::EAX; size = 4; break;
// Presumably the i64 arm (its case label is not visible): requires a 64-bit
// subtarget and uses RAX with an 8-byte access.
22021 assert(Subtarget.is64Bit() && "Node not type legal!");
22022 Reg = X86::RAX; size = 8;
// Place operand 2 in the accumulator; the copy yields a chain (value 0) and
// glue (value 1) that tie it to the cmpxchg node built next.
22025 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22026 Op.getOperand(2), SDValue());
// Operand list of the cmpxchg node; the entries between the chain and the
// size constant are not among the visible lines.
22027 SDValue Ops[] = { cpIn.getValue(0),
22030 DAG.getTargetConstant(size, DL, MVT::i8),
22031 cpIn.getValue(1) };
22032 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
// Reuse the memory operand of the original atomic node.
22033 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22034 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
// Read the accumulator back (the declarator of `cpOut` is not visible), then
// EFLAGS; COND_E on those flags is the success indication.
22038 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22039 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22040 MVT::i32, cpOut.getValue(2));
22041 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
// Redirect the three results of the original node: value, success, chain.
22043 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22044 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22045 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
/// Custom-lower a BITCAST whose source is v2i32, v4i16, v8i8 or i64, or which
/// involves a 64-bit MMX-sized type.
///
/// For the listed source types with an f64 destination, the source elements
/// are collected into a build vector of twice the element count (upper half
/// undef), bitcast to v2f64, and element 0 is extracted as the f64 result.
/// The remaining path only classifies i64 <=> vector and vector <=> vector
/// conversions, per the existing comments.
/// NOTE(review): `dl`, `NumElts`, `SVT` and every return statement of the
/// classification paths are not among the visible lines.
22049 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22050 SelectionDAG &DAG) {
22051 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22052 MVT DstVT = Op.getSimpleValueType();
22054 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22055 SrcVT == MVT::i64) {
22056 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22057 if (DstVT != MVT::f64)
22058 // This conversion needs to be expanded.
22061 SDValue Op0 = Op->getOperand(0);
// Elts gathers the scalar pieces of the source, low element first.
22062 SmallVector<SDValue, 16> Elts;
22066 if (SrcVT.isVector()) {
22067 NumElts = SrcVT.getVectorNumElements();
22068 SVT = SrcVT.getVectorElementType();
22070 // Widen the input vector to twice its element count.
22071 // Example: from MVT::v2i32 to MVT::v4i32.
22072 for (unsigned i = 0, e = NumElts; i != e; ++i)
22073 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22074 DAG.getIntPtrConstant(i, dl)));
// Scalar i64 source (only expected on non-64-bit subtargets): split it into
// its two i32 parts with EXTRACT_ELEMENT, part 0 first.
22076 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22077 "Unexpected source type in LowerBITCAST");
22078 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22079 DAG.getIntPtrConstant(0, dl)));
22080 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22081 DAG.getIntPtrConstant(1, dl)));
22085 // Explicitly mark the extra elements as Undef.
22086 Elts.append(NumElts, DAG.getUNDEF(SVT));
// Build the doubled vector, view it as v2f64 and take lane 0 as the result.
22088 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22089 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22090 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22091 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22092 DAG.getIntPtrConstant(0, dl));
// Remaining path: 64-bit subtarget with MMX but without SSE2, where the
// destination is i64 or a 64-bit vector.
22095 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22096 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22097 assert((DstVT == MVT::i64 ||
22098 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22099 "Unexpected custom BITCAST");
22100 // i64 <=> MMX conversions are Legal.
22101 if (SrcVT==MVT::i64 && DstVT.isVector())
22103 if (DstVT==MVT::i64 && SrcVT.isVector())
22105 // MMX <=> MMX conversions are Legal.
22106 if (SrcVT.isVector() && DstVT.isVector())
22108 // All other conversions need to be expanded.
22112 /// Compute the horizontal sum of bytes in V for the elements of VT.
22114 /// Requires V to be a byte vector and VT to be an integer vector type with
22115 /// wider elements than V's type. The width of the elements of VT determines
22116 /// how many bytes of V are summed horizontally to produce each element of the
/// result.
///
/// Three strategies are used, selected by VT's element type: i64 uses a single
/// PSADBW against zero; i32 unpacks with zeros, runs two PSADBWs and packs the
/// results with PACKUS; i16 uses a shift-add-shift sequence.
22118 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22119 const X86Subtarget &Subtarget,
22120 SelectionDAG &DAG) {
// NOTE(review): `DL` is used below but its declaration is not among the
// visible lines - presumably an SDLoc for V.
22122 MVT ByteVecVT = V.getSimpleValueType();
22123 MVT EltVT = VT.getVectorElementType();
22124 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22125 "Expected value to have byte element type.");
22126 assert(EltVT != MVT::i8 &&
22127 "Horizontal byte sum only makes sense for wider elements!");
22128 unsigned VecSize = VT.getSizeInBits();
22129 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22131 // The PSADBW instruction horizontally adds all bytes and leaves the result
22132 // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
22133 if (EltVT == MVT::i64) {
22134 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22135 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22136 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22137 return DAG.getBitcast(VT, V);
22140 if (EltVT == MVT::i32) {
22141 // We unpack the low half and high half into i32s interleaved with zeros so
22142 // that we can use PSADBW to horizontally sum them. The most useful part of
22143 // this is that it lines up the results of two PSADBW instructions to be
22144 // two v2i64 vectors which concatenated are the 4 population counts. We can
22145 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22146 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22147 SDValue V32 = DAG.getBitcast(VT, V);
22148 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22149 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22151 // Do the horizontal sums into two v2i64s.
// Zeros is rebuilt with the byte-vector type that PSADBW's operands use here.
22152 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22153 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22154 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22155 DAG.getBitcast(ByteVecVT, Low), Zeros);
22156 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22157 DAG.getBitcast(ByteVecVT, High), Zeros);
22159 // Merge them together.
22160 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22161 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22162 DAG.getBitcast(ShortVecVT, Low),
22163 DAG.getBitcast(ShortVecVT, High));
22165 return DAG.getBitcast(VT, V);
22168 // The only element type left is i16.
22169 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22171 // To obtain pop count for each i16 element starting from the pop count for
22172 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22173 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22174 // directly supported.
22175 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22176 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22177 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22178 DAG.getBitcast(ByteVecVT, V));
22179 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
/// Lower a vector population count with an in-register nibble lookup table.
///
/// The input is viewed as a byte vector; the high and low nibble of every byte
/// index a 16-entry pop-count table through PSHUFB and the two lookups are
/// added, giving a per-byte count. Element types wider than i8 are finished by
/// LowerHorizontalByteSum.
///
/// \param Op the vector whose set bits are counted.
/// \param DL debug location used for every node created here.
22182 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22183 const X86Subtarget &Subtarget,
22184 SelectionDAG &DAG) {
22185 MVT VT = Op.getSimpleValueType();
22186 MVT EltVT = VT.getVectorElementType();
22187 unsigned VecSize = VT.getSizeInBits();
22189 // Implement a lookup table in register by using an algorithm based on:
22190 // http://wm.ite.pl/articles/sse-popcount.html
22192 // The general idea is that every lower byte nibble in the input vector is an
22193 // index into a in-register pre-computed pop count table. We then split up the
22194 // input vector in two new ones: (1) a vector with only the shifted-right
22195 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22196 // masked out higher ones) for each byte. PSHUFB is used separately with both
22197 // to index the in-register table. Next, both are added and the result is a
22198 // i8 vector where each element contains the pop count for input byte.
22200 // To obtain the pop count for elements != i8, we follow up with the same
22201 // approach and use additional tricks as described below.
// LUT[n] is the number of set bits in the 4-bit value n.
22203 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22204 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22205 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22206 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22208 int NumByteElts = VecSize / 8;
22209 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22210 SDValue In = DAG.getBitcast(ByteVecVT, Op);
// Repeat the 16-entry table across the whole byte vector (index i % 16).
22211 SmallVector<SDValue, 64> LUTVec;
22212 for (int i = 0; i < NumByteElts; ++i)
22213 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22214 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22215 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
// High nibbles: logical shift of every byte right by four.
22218 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22219 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
// Low nibbles: mask every byte with 0x0F.
22222 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22224 // The input vector is used as the shuffle mask that index elements into the
22225 // LUT. After counting low and high nibbles, add the vector to obtain the
22226 // final pop count per i8 element.
22227 SDValue HighPopCnt =
22228 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22229 SDValue LowPopCnt =
22230 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22231 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
// NOTE(review): the statement executed for i8 elements is not among the
// visible lines; presumably the per-byte count is returned as is.
22233 if (EltVT == MVT::i8)
// Wider elements: sum the per-byte counts that belong to each element.
22236 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
/// Lower a 128-bit vector population count with shift/mask/add bit math.
///
/// Three rounds (1-bit, 2-bit and 4-bit groups) leave the per-byte population
/// count in every byte; element types wider than i8 are then finished by
/// LowerHorizontalByteSum.
///
/// \param Op the vector whose set bits are counted.
/// \param DL debug location used for every node created here.
22239 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22240 const X86Subtarget &Subtarget,
22241 SelectionDAG &DAG) {
22242 MVT VT = Op.getSimpleValueType();
22243 assert(VT.is128BitVector() &&
22244 "Only 128-bit vector bitmath lowering supported.");
22246 int VecSize = VT.getSizeInBits();
22247 MVT EltVT = VT.getVectorElementType();
22248 int Len = EltVT.getSizeInBits();
22250 // This is the vectorized version of the "best" algorithm from
22251 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22252 // with a minor tweak to use a series of adds + shifts instead of vector
22253 // multiplications. Implemented for all integer vector types. We only use
22254 // this when we don't have SSSE3 which allows a LUT-based lowering that is
22255 // much faster, even faster than using native popcnt instructions.
// GetShift: apply shift opcode OpCode to V by the splat constant Shifter,
// using V's own type.
22257 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
22258 MVT VT = V.getSimpleValueType();
22259 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
22260 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
// GetMask: AND V with the constant Mask materialized in V's own type.
22262 auto GetMask = [&](SDValue V, APInt Mask) {
22263 MVT VT = V.getSimpleValueType();
22264 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
22265 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
22268 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
22269 // x86, so set the SRL type to have elements at least i16 wide. This is
22270 // correct because all of our SRLs are followed immediately by a mask anyways
22271 // that handles any bits that sneak into the high bits of the byte elements.
22272 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
// NOTE(review): the declarations of `V` and `Srl` are not among the visible
// lines; `V` is presumably initialized from Op.
22276 // v = v - ((v >> 1) & 0x55555555...)
22278 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
22279 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
22280 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
22282 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
22283 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
22284 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
22285 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
22286 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
22288 // v = (v + (v >> 4)) & 0x0F0F0F0F...
22289 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
22290 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
22291 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
22293 // At this point, V contains the byte-wise population count, and we are
22294 // merely doing a horizontal sum if necessary to get the wider element
// counts.
// NOTE(review): the statement executed for i8 elements is not among the
// visible lines; presumably V is returned as is.
22296 if (EltVT == MVT::i8)
// Wider elements: view V as a byte vector and sum the bytes of each element.
22299 return LowerHorizontalByteSum(
22300 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
22304 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
22305 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
22306 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22307 SelectionDAG &DAG) {
22308 MVT VT = Op.getSimpleValueType();
22309 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
22310 "Unknown CTPOP type to handle");
22311 SDLoc DL(Op.getNode());
22312 SDValue Op0 = Op.getOperand(0);
22314 if (!Subtarget.hasSSSE3()) {
22315 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
22316 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
22317 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
22320 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22321 unsigned NumElems = VT.getVectorNumElements();
22323 // Extract each 128-bit vector, compute pop count and concat the result.
22324 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
22325 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
22327 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22328 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22329 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22332 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
22333 unsigned NumElems = VT.getVectorNumElements();
22335 // Extract each 256-bit vector, compute pop count and concat the result.
22336 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
22337 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
22339 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22340 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22341 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22344 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
22347 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22348 SelectionDAG &DAG) {
22349 assert(Op.getSimpleValueType().isVector() &&
22350 "We only do custom lowering for vector population count.");
22351 return LowerVectorCTPOP(Op, Subtarget, DAG);
22354 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
22355 MVT VT = Op.getSimpleValueType();
22356 SDValue In = Op.getOperand(0);
22359 // For scalars, its still beneficial to transfer to/from the SIMD unit to
22360 // perform the BITREVERSE.
22361 if (!VT.isVector()) {
22362 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
22363 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
22364 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
22365 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
22366 DAG.getIntPtrConstant(0, DL));
22369 MVT SVT = VT.getVectorElementType();
22370 int NumElts = VT.getVectorNumElements();
22371 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
22373 // Decompose 256-bit ops into smaller 128-bit ops.
22374 if (VT.is256BitVector()) {
22375 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22376 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22378 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
22379 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22380 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
22381 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
22384 assert(VT.is128BitVector() &&
22385 "Only 128-bit vector bitreverse lowering supported.");
22387 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
22388 // perform the BSWAP in the shuffle.
22389 // Its best to shuffle using the second operand as this will implicitly allow
22390 // memory folding for multiple vectors.
22391 SmallVector<SDValue, 16> MaskElts;
22392 for (int i = 0; i != NumElts; ++i) {
22393 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
22394 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
22395 int PermuteByte = SourceByte | (2 << 5);
22396 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
22400 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
22401 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
22402 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
22404 return DAG.getBitcast(VT, Res);
22407 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
22408 SelectionDAG &DAG) {
22409 if (Subtarget.hasXOP())
22410 return LowerBITREVERSE_XOP(Op, DAG);
22412 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
22414 MVT VT = Op.getSimpleValueType();
22415 SDValue In = Op.getOperand(0);
22418 unsigned NumElts = VT.getVectorNumElements();
22419 assert(VT.getScalarType() == MVT::i8 &&
22420 "Only byte vector BITREVERSE supported");
22422 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
22423 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22424 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
22425 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22426 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22427 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
22428 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
22429 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22432 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
22433 // two nibbles and a PSHUFB lookup to find the bitreverse of each
22434 // 0-15 value (moved to the other nibble).
22435 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
22436 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
22437 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
22439 const int LoLUT[16] = {
22440 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
22441 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
22442 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
22443 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
22444 const int HiLUT[16] = {
22445 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
22446 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
22447 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
22448 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
22450 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
22451 for (unsigned i = 0; i < NumElts; ++i) {
22452 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
22453 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
22456 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
22457 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
22458 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
22459 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
22460 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
22463 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
22464 unsigned NewOpc = 0;
22465 switch (N->getOpcode()) {
22466 case ISD::ATOMIC_LOAD_ADD:
22467 NewOpc = X86ISD::LADD;
22469 case ISD::ATOMIC_LOAD_SUB:
22470 NewOpc = X86ISD::LSUB;
22472 case ISD::ATOMIC_LOAD_OR:
22473 NewOpc = X86ISD::LOR;
22475 case ISD::ATOMIC_LOAD_XOR:
22476 NewOpc = X86ISD::LXOR;
22478 case ISD::ATOMIC_LOAD_AND:
22479 NewOpc = X86ISD::LAND;
22482 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
22485 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
22486 return DAG.getMemIntrinsicNode(
22487 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
22488 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
22489 /*MemVT=*/N->getSimpleValueType(0), MMO);
22492 /// Lower atomic_load_ops into LOCK-prefixed operations.
///
/// Operands are read positionally: 0 is the chain, 1 is named LHS and 2 is
/// named RHS. When result value 0 has uses, an ATOMIC_LOAD_SUB is rewritten as
/// an ATOMIC_LOAD_ADD of (0 - RHS); any other opcode is asserted to be
/// ATOMIC_LOAD_ADD. When the result is unused, the node is converted with
/// lowerAtomicArithWithLOCK and only the chain (value 1) is rewired.
///
/// NOTE(review): the embedded line numbers in this listing have gaps. `DL` is
/// used below without a visible declaration, and the closing braces and the
/// return statements that should follow the assert and the final RAUW are not
/// visible -- confirm against the upstream file before relying on this text.
22493 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
22494 const X86Subtarget &Subtarget) {
22495 SDValue Chain = N->getOperand(0);
22496 SDValue LHS = N->getOperand(1);
22497 SDValue RHS = N->getOperand(2);
22498 unsigned Opc = N->getOpcode();
22499 MVT VT = N->getSimpleValueType(0);
22502 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
22503 // can only be lowered when the result is unused. They should have already
22504 // been transformed into a cmpxchg loop in AtomicExpand.
22505 if (N->hasAnyUseOfValue(0)) {
22506 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
22507 // select LXADD if LOCK_SUB can't be selected.
22508 if (Opc == ISD::ATOMIC_LOAD_SUB) {
22509 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
// Negate the operand as (0 - RHS) so the subtraction becomes an addition.
22510 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
22511 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
22512 RHS, AN->getMemOperand());
22514 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
22515 "Used AtomicRMW ops other than Add should have been expanded!");
// Result unused: switch to the LOCK-prefixed form.
22519 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
22520 // RAUW the chain, but don't worry about the result, as it's unused.
22521 assert(!N->hasAnyUseOfValue(0));
22522 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
22526 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
22527 SDNode *Node = Op.getNode();
22529 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
22531 // Convert seq_cst store -> xchg
22532 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
22533 // FIXME: On 32-bit, store -> fist or movq would be more efficient
22534 // (The only way to get a 16-byte store is cmpxchg16b)
22535 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
22536 if (cast<AtomicSDNode>(Node)->getOrdering() ==
22537 AtomicOrdering::SequentiallyConsistent ||
22538 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
22539 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
22540 cast<AtomicSDNode>(Node)->getMemoryVT(),
22541 Node->getOperand(0),
22542 Node->getOperand(1), Node->getOperand(2),
22543 cast<AtomicSDNode>(Node)->getMemOperand());
22544 return Swap.getValue(1);
22546 // Other atomic stores have a simple pattern.
22550 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
22551 MVT VT = Op.getNode()->getSimpleValueType(0);
22553 // Let legalize expand this if it isn't a legal type yet.
22554 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
22557 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22560 bool ExtraOp = false;
22561 switch (Op.getOpcode()) {
22562 default: llvm_unreachable("Invalid code");
22563 case ISD::ADDC: Opc = X86ISD::ADD; break;
22564 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
22565 case ISD::SUBC: Opc = X86ISD::SUB; break;
22566 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
22570 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22572 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22573 Op.getOperand(1), Op.getOperand(2));
/// Lower FSINCOS to a call to __sincos_stret (f64) or __sincosf_stret (f32).
/// Asserts a 64-bit Darwin subtarget. For f32 the visible code extracts
/// elements 0 and 1 of the returned vector and merges them into a two-value
/// result.
///
/// NOTE(review): the embedded line numbers in this listing have gaps. `dl`
/// and `Callee` are used without a visible declaration, the fields of `Entry`
/// that carry the argument are not visibly set, and the guard in front of the
/// "Returned in xmm0 and xmm1" return is missing -- confirm against the
/// upstream file.
22576 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
22577 SelectionDAG &DAG) {
22578 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
22580 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
22581 // which returns the values as { float, float } (in XMM0) or
22582 // { double, double } (which is returned in XMM0, XMM1).
22584 SDValue Arg = Op.getOperand(0);
22585 EVT ArgVT = Arg.getValueType();
22586 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22588 TargetLowering::ArgListTy Args;
22589 TargetLowering::ArgListEntry Entry;
22593 Entry.isSExt = false;
22594 Entry.isZExt = false;
22595 Args.push_back(Entry);
22597 bool isF64 = ArgVT == MVT::f64;
22598 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
22599 // the small struct {f32, f32} is returned in (eax, edx). For f64,
22600 // the results are returned via SRet in memory.
22601 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
22602 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22604 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
// f64 returns a two-member struct; f32 is modelled as a 4-element vector of
// the argument type.
22606 Type *RetTy = isF64
22607 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
22608 : (Type*)VectorType::get(ArgTy, 4);
22610 TargetLowering::CallLoweringInfo CLI(DAG);
22611 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
22612 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
22614 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
22617 // Returned in xmm0 and xmm1.
22618 return CallResult.first;
22620 // Returned in bits 0:31 and 32:63 of xmm0.
22621 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22622 CallResult.first, DAG.getIntPtrConstant(0, dl));
22623 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22624 CallResult.first, DAG.getIntPtrConstant(1, dl));
22625 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
22626 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
22629 /// Widen a vector input to a vector of NVT. The
22630 /// input vector must have the same element type as NVT.
///
/// The padding elements are zero when \p FillWithZeroes is true and undef
/// otherwise. An undef input yields an undef NVT. A build vector of constants
/// is rebuilt element by element with the padding appended; any other input
/// is inserted at index 0 of a fill vector with INSERT_SUBVECTOR.
///
/// NOTE(review): the embedded line numbers in this listing have gaps. The
/// body of the "already has the right width" check, the declaration of `dl`,
/// the second half of the CONCAT_VECTORS condition and the undef arm of the
/// final FillVal are not visible -- confirm against the upstream file.
22631 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
22632 bool FillWithZeroes = false) {
22633 // Check if InOp already has the right width.
22634 MVT InVT = InOp.getSimpleValueType();
22638 if (InOp.isUndef())
22639 return DAG.getUNDEF(NVT);
22641 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
22642 "input and widen element type must match");
22644 unsigned InNumElts = InVT.getVectorNumElements();
22645 unsigned WidenNumElts = NVT.getVectorNumElements();
22646 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
22647 "Unexpected request for vector widening");
22649 EVT EltVT = NVT.getVectorElementType();
// Look through a two-operand CONCAT_VECTORS whose upper half is already the
// requested padding, and widen its lower half instead.
22652 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
22653 InOp.getNumOperands() == 2) {
22654 SDValue N1 = InOp.getOperand(1);
22655 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
22657 InOp = InOp.getOperand(0);
22658 InVT = InOp.getSimpleValueType();
22659 InNumElts = InVT.getVectorNumElements();
// Constant build vectors: copy the existing elements and append the fill.
22662 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
22663 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
22664 SmallVector<SDValue, 16> Ops;
22665 for (unsigned i = 0; i < InNumElts; ++i)
22666 Ops.push_back(InOp.getOperand(i));
22668 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
22669 DAG.getUNDEF(EltVT);
22670 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
22671 Ops.push_back(FillVal);
22672 return DAG.getBuildVector(NVT, dl, Ops);
// General case: place InOp in the low elements of a fill vector.
22674 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
22676 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
22677 InOp, DAG.getIntPtrConstant(0, dl));
/// Custom-lower a masked scatter for AVX-512 (asserted).
///
/// The visible code rebuilds the scatter with an extra result for the mask,
/// widening index, mask and stored value first when needed, and redirects the
/// uses of the original node to value 1 of the new node.
///
/// NOTE(review): the embedded line numbers in this listing have gaps. The
/// body of the "already handled" check, the declaration of `dl`, and several
/// statements and braces inside the two widening paths are not visible --
/// confirm against the upstream file before editing.
22680 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
22681 SelectionDAG &DAG) {
22682 assert(Subtarget.hasAVX512() &&
22683 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22685 // X86 scatter kills mask register, so its type should be added to
22686 // the list of return values.
22687 // If the "scatter" has 2 return values, it is already handled.
22688 if (Op.getNode()->getNumValues() == 2)
22691 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
22692 SDValue Src = N->getValue();
22693 MVT VT = Src.getSimpleValueType();
22694 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
22697 SDValue NewScatter;
22698 SDValue Index = N->getIndex();
22699 SDValue Mask = N->getMask();
22700 SDValue Chain = N->getChain();
22701 SDValue BasePtr = N->getBasePtr();
22702 MVT MemVT = N->getMemoryVT().getSimpleVT();
22703 MVT IndexVT = Index.getSimpleValueType();
22704 MVT MaskVT = Mask.getSimpleValueType();
22706 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
22707 // The v2i32 value was promoted to v2i64.
22708 // Now we "redo" the type legalizer's work and widen the original
22709 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
22711 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
22712 "Unexpected memory type");
// Elements 0 and 2 of the v4i32 view are selected; the upper two lanes of
// the result are left undefined.
22713 int ShuffleMask[] = {0, 2, -1, -1};
22714 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
22715 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
22716 // Now we have 4 elements instead of 2.
22717 // Expand the index.
22718 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
22719 Index = ExtendToType(Index, NewIndexVT, DAG);
22721 // Expand the mask with zeroes
22722 // Mask may be <2 x i64> or <2 x i1> at this moment
22723 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
22724 "Unexpected mask type");
22725 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
22726 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22730 unsigned NumElts = VT.getVectorNumElements();
22731 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22732 !Index.getSimpleValueType().is512BitVector()) {
22733 // AVX512F supports only 512-bit vectors. Either the data or the index
22734 // must be 512 bits wide. If both index and data are 256-bit, but
22735 // the vector contains 8 elements, we just sign-extend the index
22736 if (IndexVT == MVT::v8i32)
22737 // Just extend index
22738 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22740 // The minimal number of elts in scatter is 8
22743 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22744 // Use original index here, do not modify the index twice
22745 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
22746 if (IndexVT.getScalarType() == MVT::i32)
22747 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22750 // At this point we have promoted mask operand
22751 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22752 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
22753 // Use the original mask here, do not modify the mask twice
22754 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
22756 // The value that should be stored
22757 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22758 Src = ExtendToType(Src, NewVT, DAG);
22761 // If the mask is "wide" at this point - truncate it to i1 vector
22762 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
22763 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
22765 // The mask is killed by scatter, add it to the values
22766 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
22767 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
22768 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
22769 N->getMemOperand());
// Value 1 of the new node has type MVT::Other; users of the old node are
// pointed at it.
22770 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
22771 return SDValue(NewScatter.getNode(), 1);
22774 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
22775 SelectionDAG &DAG) {
22777 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
22778 MVT VT = Op.getSimpleValueType();
22779 MVT ScalarVT = VT.getScalarType();
22780 SDValue Mask = N->getMask();
22783 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
22784 "Expanding masked load is supported on AVX-512 target only!");
22786 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
22787 "Expanding masked load is supported for 32 and 64-bit types only!");
22789 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
22790 // VLX. These types for exp-loads are handled here.
22791 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
22794 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22795 "Cannot lower masked load op.");
22797 assert((ScalarVT.getSizeInBits() >= 32 ||
22798 (Subtarget.hasBWI() &&
22799 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22800 "Unsupported masked load op.");
22802 // This operation is legal for targets with VLX, but without
22803 // VLX the vector should be widened to 512 bit
22804 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
22805 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
22806 SDValue Src0 = N->getSrc0();
22807 Src0 = ExtendToType(Src0, WideDataVT, DAG);
22809 // Mask element has to be i1.
22810 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22811 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22812 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22814 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22816 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22817 if (MaskEltTy != MVT::i1)
22818 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22819 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22820 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
22821 N->getBasePtr(), Mask, Src0,
22822 N->getMemoryVT(), N->getMemOperand(),
22823 N->getExtensionType(),
22824 N->isExpandingLoad());
22826 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22827 NewLoad.getValue(0),
22828 DAG.getIntPtrConstant(0, dl));
22829 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
22830 return DAG.getMergeValues(RetOps, dl);
22833 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
22834 SelectionDAG &DAG) {
22835 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
22836 SDValue DataToStore = N->getValue();
22837 MVT VT = DataToStore.getSimpleValueType();
22838 MVT ScalarVT = VT.getScalarType();
22839 SDValue Mask = N->getMask();
22842 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
22843 "Expanding masked load is supported on AVX-512 target only!");
22845 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
22846 "Expanding masked load is supported for 32 and 64-bit types only!");
22848 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless to VLX.
22849 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
22852 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
22853 "Cannot lower masked store op.");
22855 assert((ScalarVT.getSizeInBits() >= 32 ||
22856 (Subtarget.hasBWI() &&
22857 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
22858 "Unsupported masked store op.");
22860 // This operation is legal for targets with VLX, but without
22861 // VLX the vector should be widened to 512 bit
22862 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
22863 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
22865 // Mask element has to be i1.
22866 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
22867 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
22868 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
22870 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
22872 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
22873 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
22874 if (MaskEltTy != MVT::i1)
22875 Mask = DAG.getNode(ISD::TRUNCATE, dl,
22876 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
22877 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
22878 Mask, N->getMemoryVT(), N->getMemOperand(),
22879 N->isTruncatingStore(), N->isCompressingStore());
/// Custom-lower a masked gather for AVX-512 (asserted).
///
/// In the visible code, when the subtarget lacks VLX and neither the data nor
/// the index is 512 bits wide, an 8-element gather only has its index
/// sign-extended to v8i64 in place; otherwise index, mask and pass-through
/// value are widened, a new gather is built, and the original width is
/// extracted from its result.
///
/// NOTE(review): the embedded line numbers in this listing have gaps. The
/// declaration of `dl`, the statements that end the 8-element path, any
/// update of NumElts before the widening, and the closing braces and final
/// return are not visible -- confirm against the upstream file before editing.
22882 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
22883 SelectionDAG &DAG) {
22884 assert(Subtarget.hasAVX512() &&
22885 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22887 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
22889 MVT VT = Op.getSimpleValueType();
22890 SDValue Index = N->getIndex();
22891 SDValue Mask = N->getMask();
22892 SDValue Src0 = N->getValue();
22893 MVT IndexVT = Index.getSimpleValueType();
22894 MVT MaskVT = Mask.getSimpleValueType();
22896 unsigned NumElts = VT.getVectorNumElements();
22897 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
22899 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
22900 !Index.getSimpleValueType().is512BitVector()) {
22901 // AVX512F supports only 512-bit vectors. Either the data or the index
22902 // must be 512 bits wide. If both index and data are 256-bit, but
22903 // the vector contains 8 elements, we just sign-extend the index
22904 if (NumElts == 8) {
22905 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
// Operands 0-3 are kept; only the index (operand 4) is replaced.
22906 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
22907 N->getOperand(3), Index };
22908 DAG.UpdateNodeOperands(N, Ops);
22912 // Minimal number of elements in Gather
22915 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
22916 Index = ExtendToType(Index, NewIndexVT, DAG);
22917 if (IndexVT.getScalarType() == MVT::i32)
22918 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
22921 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
22922 // At this point we have promoted mask operand
22923 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
22924 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
// Pad the mask with zeroes, then truncate it to an i1 vector.
22925 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
22926 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
22928 // The pass-thru value
22929 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
22930 Src0 = ExtendToType(Src0, NewVT, DAG);
22932 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
22933 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
22934 N->getMemoryVT(), dl, Ops,
22935 N->getMemOperand());
// Narrow the wide result back to the original type and return it together
// with the new chain.
22936 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
22937 NewGather.getValue(0),
22938 DAG.getIntPtrConstant(0, dl));
22939 SDValue RetOps[] = {Exract, NewGather.getValue(1)};
22940 return DAG.getMergeValues(RetOps, dl);
22945 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
22946 SelectionDAG &DAG) const {
22947 // TODO: Eventually, the lowering of these nodes should be informed by or
22948 // deferred to the GC strategy for the function in which they appear. For
22949 // now, however, they must be lowered to something. Since they are logically
22950 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22951 // require special handling for these nodes), lower them as literal NOOPs for
22953 SmallVector<SDValue, 2> Ops;
22955 Ops.push_back(Op.getOperand(0));
22956 if (Op->getGluedNode())
22957 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22960 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22961 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22966 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
22967 SelectionDAG &DAG) const {
22968 // TODO: Eventually, the lowering of these nodes should be informed by or
22969 // deferred to the GC strategy for the function in which they appear. For
22970 // now, however, they must be lowered to something. Since they are logically
22971 // no-ops in the case of a null GC strategy (or a GC strategy which does not
22972 // require special handling for these nodes), lower them as literal NOOPs for
22974 SmallVector<SDValue, 2> Ops;
22976 Ops.push_back(Op.getOperand(0));
22977 if (Op->getGluedNode())
22978 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
22981 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
22982 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
22987 /// Provide custom lowering hooks for some operations.
///
/// Dispatches on the opcode of \p Op to the matching Lower* routine and
/// returns its result. Opcodes without a case reach llvm_unreachable.
///
/// NOTE(review): the embedded line numbers in this listing have gaps inside
/// the switch (for example before FNEG, CTLZ_ZERO_UNDEF, CTTZ_ZERO_UNDEF,
/// MULHU, SHL, UMULO, SUBE and UMIN). Further case labels that share those
/// handlers appear to have been dropped, as have the closing braces -- confirm
/// against the upstream file before editing.
22988 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
22989 switch (Op.getOpcode()) {
22990 default: llvm_unreachable("Should not custom lower this!");
22991 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
22992 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
22993 return LowerCMP_SWAP(Op, Subtarget, DAG);
22994 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
22995 case ISD::ATOMIC_LOAD_ADD:
22996 case ISD::ATOMIC_LOAD_SUB:
22997 case ISD::ATOMIC_LOAD_OR:
22998 case ISD::ATOMIC_LOAD_XOR:
22999 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23000 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23001 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23002 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23003 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23004 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23005 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23006 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23007 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23008 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23009 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23010 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
23011 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23012 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23013 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23014 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23015 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23016 case ISD::SHL_PARTS:
23017 case ISD::SRA_PARTS:
23018 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23019 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23020 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23021 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23022 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23023 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23024 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23025 case ISD::ZERO_EXTEND_VECTOR_INREG:
23026 case ISD::SIGN_EXTEND_VECTOR_INREG:
23027 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23028 case ISD::FP_TO_SINT:
23029 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
23030 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23031 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23033 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23034 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23035 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23036 case ISD::SETCC: return LowerSETCC(Op, DAG);
23037 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23038 case ISD::SELECT: return LowerSELECT(Op, DAG);
23039 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23040 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23041 case ISD::VASTART: return LowerVASTART(Op, DAG);
23042 case ISD::VAARG: return LowerVAARG(Op, DAG);
23043 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23044 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23045 case ISD::INTRINSIC_VOID:
23046 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23047 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23048 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23049 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23050 case ISD::FRAME_TO_ARGS_OFFSET:
23051 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23052 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23053 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23054 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23055 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23056 case ISD::EH_SJLJ_SETUP_DISPATCH:
23057 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23058 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23059 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23060 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23062 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23064 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23065 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23067 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23068 case ISD::UMUL_LOHI:
23069 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23070 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23073 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23079 case ISD::UMULO: return LowerXALUO(Op, DAG);
23080 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23081 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23085 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23086 case ISD::ADD: return LowerADD(Op, DAG);
23087 case ISD::SUB: return LowerSUB(Op, DAG);
23091 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23092 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23093 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23094 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23095 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23096 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23097 case ISD::GC_TRANSITION_START:
23098 return LowerGC_TRANSITION_START(Op, DAG);
23099 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23100 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23104 /// Places new result values for the node in Results (their number
23105 /// and types must exactly match those of the original return values of
23106 /// the node), or leaves Results empty, which indicates that the node is not
23107 /// to be custom lowered after all.
///
/// \param N       Node being custom lowered; its result 0 is what gets passed
///                to LowerOperation.
/// \param Results Output list; receives N->getNumValues() values taken from
///                the lowered node.
/// \param DAG     Selection DAG forwarded to LowerOperation.
23108 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23109 SmallVectorImpl<SDValue> &Results,
23110 SelectionDAG &DAG) const {
// Reuse the ordinary custom-lowering entry point on result 0 of N.
23111 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
// NOTE(review): the statement guarded by this null check is not visible in
// this listing; going by the header comment it presumably returns with
// Results left empty -- confirm against the full file.
23113 if (!Res.getNode())
// The lowered node may carry extra values, but never fewer than N has.
23116 assert((N->getNumValues() <= Res->getNumValues()) &&
23117 "Lowering returned the wrong number of results!");
23119 // Places new result values based on N's result count.
23120 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23121 // than the original node; the chain (last value) should be dropped.
23122 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23123 Results.push_back(Res.getValue(I));
23126 /// Replace a node with an illegal result type with a new node built out of
/// legal pieces. Dispatches on N->getOpcode(); each handled case builds the
/// replacement value(s) and appends them to Results.
///
/// \param N       Node whose results are being replaced.
/// \param Results Output list of replacement values.
/// \param DAG     Selection DAG used to create the replacement nodes.
///
/// NOTE(review): in this listing several short lines (some case labels, early
/// exits, closing braces and a few declarations such as the one for `dl`) are
/// not visible. The comments below describe only the statements that can be
/// seen; anything said about a missing line is marked as an assumption.
23128 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23129 SmallVectorImpl<SDValue>&Results,
23130 SelectionDAG &DAG) const {
23132 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23133 switch (N->getOpcode()) {
// Reached for opcodes this function has no custom handling for (presumably
// under a `default:` label that is not visible here).
23135 llvm_unreachable("Do not know how to custom type legalize this operation!");
23136 case X86ISD::AVG: {
23137 // Legalize types for X86ISD::AVG by expanding vectors.
23138 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
// Pick the smallest of 128/256/512 bits that can hold the input type.
23140 auto InVT = N->getValueType(0);
23141 auto InVTSize = InVT.getSizeInBits();
23142 const unsigned RegSize =
23143 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23144 assert((Subtarget.hasBWI() || RegSize < 512) &&
23145 "512-bit vector requires AVX512BW");
23146 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23147 "256-bit vector requires AVX2");
// RegVT keeps the element type and has as many elements as fit in RegSize.
23149 auto ElemVT = InVT.getVectorElementType();
23150 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23151 RegSize / ElemVT.getSizeInBits());
23152 assert(RegSize % InVT.getSizeInBits() == 0);
23153 unsigned NumConcat = RegSize / InVT.getSizeInBits();
// Widen each operand by concatenating it with undef vectors; the same Ops
// array is reused for both operands, only slot 0 changes.
23155 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23156 Ops[0] = N->getOperand(0);
23157 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23158 Ops[0] = N->getOperand(1);
23159 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
// Do the AVG on the wide type, then extract the original-width subvector
// starting at element 0.
23161 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23162 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23163 DAG.getIntPtrConstant(0, dl)));
23166 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23167 case X86ISD::FMINC:
23169 case X86ISD::FMAXC:
23170 case X86ISD::FMAX: {
23171 EVT VT = N->getValueType(0);
23172 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
// Pad each v2f32 operand with an undef v2f32 and re-issue the same opcode on
// v4f32.
23173 SDValue UNDEF = DAG.getUNDEF(VT);
23174 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23175 N->getOperand(0), UNDEF);
23176 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23177 N->getOperand(1), UNDEF);
23178 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
// The replacement is whatever LowerWin64_i128OP returns for result 0 of N.
23186 case ISD::UDIVREM: {
23187 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23188 Results.push_back(V);
23191 case ISD::FP_TO_SINT:
23192 case ISD::FP_TO_UINT: {
23193 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
// v2i32 results are produced by converting into v4i32 and extracting the
// first two elements.
23195 if (N->getValueType(0) == MVT::v2i32) {
23196 assert((IsSigned || Subtarget.hasAVX512()) &&
23197 "Can only handle signed conversion without AVX512");
23198 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23199 SDValue Src = N->getOperand(0);
// v2f64 source: use the X86 truncating-convert node directly.
23200 if (Src.getValueType() == MVT::v2f64) {
23201 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23202 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23203 : X86ISD::CVTTP2UI,
23204 dl, MVT::v4i32, Src);
23205 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23206 Results.push_back(Res);
// v2f32 source: widen to v4f32 with undef, convert generically, extract.
23209 if (Src.getValueType() == MVT::v2f32) {
23210 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23211 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23212 DAG.getUNDEF(MVT::v2f32));
23213 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23214 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23215 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23216 Results.push_back(Res);
23220 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23221 // so early out here.
// Scalar path: the helper returns a (value, stack slot) pair.
23225 std::pair<SDValue,SDValue> Vals =
23226 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23227 SDValue FIST = Vals.first, StackSlot = Vals.second;
23228 if (FIST.getNode()) {
23229 EVT VT = N->getValueType(0);
23230 // Return a load from the stack slot.
// NOTE(review): the lines joining the load below to Results, and the
// alternative branch that pushes FIST itself, are only partly visible here.
23231 if (StackSlot.getNode())
23233 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23235 Results.push_back(FIST);
23239 case ISD::SINT_TO_FP: {
23240 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23241 SDValue Src = N->getOperand(0);
// Only the v2i64 -> v2f32 combination is handled; the action taken for any
// other type pair is on a line not visible here (presumably an early exit).
23242 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23244 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23247 case ISD::UINT_TO_FP: {
23248 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23249 EVT VT = N->getValueType(0);
// Only v2f32 results are handled (the guarded statement is not visible).
23250 if (VT != MVT::v2f32)
23252 SDValue Src = N->getOperand(0);
23253 EVT SrcVT = Src.getValueType();
// With AVX512DQ+VL a v2i64 source converts directly via CVTUI2P.
23254 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23255 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
// Otherwise only a v2i32 source is handled (guarded statement not visible).
23258 if (SrcVT != MVT::v2i32)
// Bias trick: zero-extend to v2i64, OR in the bit pattern of the bias
// constant, reinterpret as v2f64, subtract the bias, then round to f32.
// 0x4330000000000000 is the IEEE-754 double encoding of 2^52.
// NOTE(review): the declaration that binds the constant below to VBias is on
// a line not visible in this listing.
23260 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23262 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
23263 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23264 DAG.getBitcast(MVT::v2i64, VBias));
23265 Or = DAG.getBitcast(MVT::v2f64, Or);
23266 // TODO: Are there any fast-math-flags to propagate here?
23267 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23268 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23271 case ISD::FP_ROUND: {
// Requires a legal operand type (the guarded statement is not visible here);
// the rounding itself is emitted as VFPROUND producing v4f32.
23272 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23274 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23275 Results.push_back(V);
23278 case ISD::FP_EXTEND: {
23279 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23280 // No other ValueType for FP_EXTEND should reach this point.
23281 assert(N->getValueType(0) == MVT::v2f32 &&
23282 "Do not know how to legalize this Node");
23285 case ISD::INTRINSIC_W_CHAIN: {
// Operand 1 holds the intrinsic ID; each supported intrinsic is forwarded to
// a helper that receives Results.
23286 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23288 default : llvm_unreachable("Do not know how to custom type "
23289 "legalize this intrinsic operation!");
23290 case Intrinsic::x86_rdtsc:
23291 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23293 case Intrinsic::x86_rdtscp:
23294 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23296 case Intrinsic::x86_rdpmc:
23297 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23299 case Intrinsic::x86_xgetbv:
23300 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
23303 case ISD::INTRINSIC_WO_CHAIN: {
// Push the lowered value only when the lowering produced one.
23304 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
23305 Results.push_back(V);
23308 case ISD::READCYCLECOUNTER: {
// Same helper and opcode as the x86_rdtsc intrinsic case above.
23309 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23312 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
// Expands a double-width compare-and-swap: T is i64 (32-bit halves in the
// E*X registers) or i128 (64-bit halves in the R*X registers).
23313 EVT T = N->getValueType(0);
23314 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
23315 bool Regs64bit = T == MVT::i128;
23316 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
// Split operand 2 into two halves (elements 0 and 1) and copy them into
// (R|E)AX and (R|E)DX, chaining and gluing the copies together.
// NOTE(review): the trailing arguments of the first copy are on a line not
// visible in this listing.
23317 SDValue cpInL, cpInH;
23318 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23319 DAG.getConstant(0, dl, HalfT));
23320 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23321 DAG.getConstant(1, dl, HalfT));
23322 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
23323 Regs64bit ? X86::RAX : X86::EAX,
23325 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
23326 Regs64bit ? X86::RDX : X86::EDX,
23327 cpInH, cpInL.getValue(1));
// Split operand 3 the same way; element 1 goes to (R|E)CX.
// NOTE(review): the assignment target of the copy to (R|E)CX is on a line
// not visible here (presumably swapInH, given how it is used afterwards).
23328 SDValue swapInL, swapInH;
23329 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23330 DAG.getConstant(0, dl, HalfT));
23331 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23332 DAG.getConstant(1, dl, HalfT));
23334 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
23335 swapInH, cpInH.getValue(1));
23336 // If the current function needs the base pointer, RBX,
23337 // we shouldn't use cmpxchg directly.
23338 // Indeed the lowering of that instruction will clobber
23339 // that register and since RBX will be a reserved register
23340 // the register allocator will not make sure its value will
23341 // be properly saved and restored around this live-range.
23342 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// The node produces a chain and glue; the memory operand of the original
// atomic node is carried over.
23344 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23345 unsigned BasePtr = TRI->getBaseRegister();
23346 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
23347 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
23348 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
23349 // ISel prefers the LCMPXCHG64 variant.
23350 // If that assert breaks, that means it is not the case anymore,
23351 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
23352 // not just EBX. This is a matter of accepting i64 input for that
23353 // pseudo, and restoring into the register of the right width
23354 // in expand pseudo. Everything else should just work.
23355 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
23356 "Saving only half of the RBX");
// Base-pointer path: read the current (R|E)BX value and pass it, together
// with swapInL, as explicit operands of the *_SAVE_*BX node.
// NOTE(review): one line of the Ops initializer is not visible here.
23357 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
23358 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
23359 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
23360 Regs64bit ? X86::RBX : X86::EBX,
23361 HalfT, swapInH.getValue(1));
23362 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
23364 /*Glue*/ RBXSave.getValue(2)};
23365 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
// Ordinary path: copy swapInL into (R|E)BX and emit the plain LCMPXCHG8/16
// node. NOTE(review): the branch separator and the start of the Opcode
// declaration are on lines not visible here.
23368 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
23369 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
23370 Regs64bit ? X86::RBX : X86::EBX, swapInL,
23371 swapInH.getValue(1));
23372 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
23373 swapInL.getValue(1)};
23374 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
// Read the two result halves back from (R|E)AX and (R|E)DX.
23376 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
23377 Regs64bit ? X86::RAX : X86::EAX,
23378 HalfT, Result.getValue(1));
23379 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
23380 Regs64bit ? X86::RDX : X86::EDX,
23381 HalfT, cpOutL.getValue(2));
23382 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
// The success flag is the COND_E condition on EFLAGS, resized to the type of
// N's second result.
23384 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
23385 MVT::i32, cpOutH.getValue(2));
23386 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
23387 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
// Results, in order: the rebuilt double-width value, the success flag, and
// the chain coming out of the EFLAGS copy.
23389 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
23390 Results.push_back(Success);
23391 Results.push_back(EFLAGS.getValue(1));
23394 case ISD::ATOMIC_SWAP:
23395 case ISD::ATOMIC_LOAD_ADD:
23396 case ISD::ATOMIC_LOAD_SUB:
23397 case ISD::ATOMIC_LOAD_AND:
23398 case ISD::ATOMIC_LOAD_OR:
23399 case ISD::ATOMIC_LOAD_XOR:
23400 case ISD::ATOMIC_LOAD_NAND:
23401 case ISD::ATOMIC_LOAD_MIN:
23402 case ISD::ATOMIC_LOAD_MAX:
23403 case ISD::ATOMIC_LOAD_UMIN:
23404 case ISD::ATOMIC_LOAD_UMAX:
23405 case ISD::ATOMIC_LOAD: {
23406 // Delegate to generic TypeLegalization. Situations we can really handle
23407 // should have already been dealt with by AtomicExpandPass.cpp.
23410 case ISD::BITCAST: {
23411 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23412 EVT DstVT = N->getValueType(0);
23413 EVT SrcVT = N->getOperand(0)->getValueType(0);
// Only f64 -> v2i32 / v4i16 / v8i8 is handled (the statement guarded by this
// check is not visible in this listing).
23415 if (SrcVT != MVT::f64 ||
23416 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
// Place the f64 in a v2f64 and reinterpret it as an integer vector with the
// destination element type and twice the destination element count.
23419 unsigned NumElts = DstVT.getVectorNumElements();
23420 EVT SVT = DstVT.getVectorElementType();
23421 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23422 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
23423 MVT::v2f64, N->getOperand(0));
23424 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
23426 if (ExperimentalVectorWideningLegalization) {
23427 // If we are legalizing vectors by widening, we already have the desired
23428 // legal vector type, just return it.
23429 Results.push_back(ToVecInt);
// Otherwise rebuild DstVT from the first NumElts elements of the wide vector.
23433 SmallVector<SDValue, 8> Elts;
23434 for (unsigned i = 0, e = NumElts; i != e; ++i)
23435 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
23436 ToVecInt, DAG.getIntPtrConstant(i, dl)));
23438 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
/// Map an X86-specific SelectionDAG opcode to its printable name.
///
/// \param Opcode Opcode value; it is cast to X86ISD::NodeType before the switch.
/// \returns For every enumerator listed below, the string literal spelling
///          that enumerator ("X86ISD::<NAME>"). X86ISD::FIRST_NUMBER breaks
///          out of the switch instead of returning a name.
///
/// NOTE(review): the code that follows the switch (and therefore the value
/// returned for FIRST_NUMBER or for an unlisted opcode) is not visible in this
/// listing. When adding a new X86ISD opcode, add a matching case here and keep
/// the string identical to the enumerator name.
23443 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
23444 switch ((X86ISD::NodeType)Opcode) {
23445 case X86ISD::FIRST_NUMBER: break;
23446 case X86ISD::BSF: return "X86ISD::BSF";
23447 case X86ISD::BSR: return "X86ISD::BSR";
23448 case X86ISD::SHLD: return "X86ISD::SHLD";
23449 case X86ISD::SHRD: return "X86ISD::SHRD";
23450 case X86ISD::FAND: return "X86ISD::FAND";
23451 case X86ISD::FANDN: return "X86ISD::FANDN";
23452 case X86ISD::FOR: return "X86ISD::FOR";
23453 case X86ISD::FXOR: return "X86ISD::FXOR";
23454 case X86ISD::FILD: return "X86ISD::FILD";
23455 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
23456 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
23457 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
23458 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
23459 case X86ISD::FLD: return "X86ISD::FLD";
23460 case X86ISD::FST: return "X86ISD::FST";
23461 case X86ISD::CALL: return "X86ISD::CALL";
23462 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
23463 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
23464 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
23465 case X86ISD::BT: return "X86ISD::BT";
23466 case X86ISD::CMP: return "X86ISD::CMP";
23467 case X86ISD::COMI: return "X86ISD::COMI";
23468 case X86ISD::UCOMI: return "X86ISD::UCOMI";
23469 case X86ISD::CMPM: return "X86ISD::CMPM";
23470 case X86ISD::CMPMU: return "X86ISD::CMPMU";
23471 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
23472 case X86ISD::SETCC: return "X86ISD::SETCC";
23473 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
23474 case X86ISD::FSETCC: return "X86ISD::FSETCC";
23475 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
23476 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
23477 case X86ISD::CMOV: return "X86ISD::CMOV";
23478 case X86ISD::BRCOND: return "X86ISD::BRCOND";
23479 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
23480 case X86ISD::IRET: return "X86ISD::IRET";
23481 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
23482 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
23483 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
23484 case X86ISD::Wrapper: return "X86ISD::Wrapper";
23485 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
23486 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
23487 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
23488 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
23489 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
23490 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
23491 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
23492 case X86ISD::PINSRB: return "X86ISD::PINSRB";
23493 case X86ISD::PINSRW: return "X86ISD::PINSRW";
23494 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
23495 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
23496 case X86ISD::ANDNP: return "X86ISD::ANDNP";
23497 case X86ISD::BLENDI: return "X86ISD::BLENDI";
23498 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
23499 case X86ISD::ADDUS: return "X86ISD::ADDUS";
23500 case X86ISD::SUBUS: return "X86ISD::SUBUS";
23501 case X86ISD::HADD: return "X86ISD::HADD";
23502 case X86ISD::HSUB: return "X86ISD::HSUB";
23503 case X86ISD::FHADD: return "X86ISD::FHADD";
23504 case X86ISD::FHSUB: return "X86ISD::FHSUB";
23505 case X86ISD::ABS: return "X86ISD::ABS";
23506 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
23507 case X86ISD::FMAX: return "X86ISD::FMAX";
23508 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
23509 case X86ISD::FMIN: return "X86ISD::FMIN";
23510 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
23511 case X86ISD::FMAXC: return "X86ISD::FMAXC";
23512 case X86ISD::FMINC: return "X86ISD::FMINC";
23513 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
23514 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
23515 case X86ISD::FRCP: return "X86ISD::FRCP";
23516 case X86ISD::FRCPS: return "X86ISD::FRCPS";
23517 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
23518 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
23519 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
23520 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
23521 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
23522 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
23523 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
23524 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
23525 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
23526 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
23527 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
23528 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
23529 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
23530 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
23531 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
23532 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
23533 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
23534 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
23535 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
23536 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
23537 case X86ISD::LADD: return "X86ISD::LADD";
23538 case X86ISD::LSUB: return "X86ISD::LSUB";
23539 case X86ISD::LOR: return "X86ISD::LOR";
23540 case X86ISD::LXOR: return "X86ISD::LXOR";
23541 case X86ISD::LAND: return "X86ISD::LAND";
23542 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
23543 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
23544 case X86ISD::VZEXT: return "X86ISD::VZEXT";
23545 case X86ISD::VSEXT: return "X86ISD::VSEXT";
23546 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
23547 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
23548 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
23549 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
23550 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
23551 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
23552 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
23553 case X86ISD::VINSERT: return "X86ISD::VINSERT";
23554 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
23555 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
23556 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
23557 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
23558 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
23559 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
23560 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
23561 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
23562 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
23563 case X86ISD::VSHL: return "X86ISD::VSHL";
23564 case X86ISD::VSRL: return "X86ISD::VSRL";
23565 case X86ISD::VSRA: return "X86ISD::VSRA";
23566 case X86ISD::VSHLI: return "X86ISD::VSHLI";
23567 case X86ISD::VSRLI: return "X86ISD::VSRLI";
23568 case X86ISD::VSRAI: return "X86ISD::VSRAI";
23569 case X86ISD::VSRAV: return "X86ISD::VSRAV";
23570 case X86ISD::VROTLI: return "X86ISD::VROTLI";
23571 case X86ISD::VROTRI: return "X86ISD::VROTRI";
23572 case X86ISD::VPPERM: return "X86ISD::VPPERM";
23573 case X86ISD::CMPP: return "X86ISD::CMPP";
23574 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
23575 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
23576 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
23577 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
23578 case X86ISD::ADD: return "X86ISD::ADD";
23579 case X86ISD::SUB: return "X86ISD::SUB";
23580 case X86ISD::ADC: return "X86ISD::ADC";
23581 case X86ISD::SBB: return "X86ISD::SBB";
23582 case X86ISD::SMUL: return "X86ISD::SMUL";
23583 case X86ISD::UMUL: return "X86ISD::UMUL";
23584 case X86ISD::SMUL8: return "X86ISD::SMUL8";
23585 case X86ISD::UMUL8: return "X86ISD::UMUL8";
23586 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
23587 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
23588 case X86ISD::INC: return "X86ISD::INC";
23589 case X86ISD::DEC: return "X86ISD::DEC";
23590 case X86ISD::OR: return "X86ISD::OR";
23591 case X86ISD::XOR: return "X86ISD::XOR";
23592 case X86ISD::AND: return "X86ISD::AND";
23593 case X86ISD::BEXTR: return "X86ISD::BEXTR";
23594 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
23595 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
23596 case X86ISD::PTEST: return "X86ISD::PTEST";
23597 case X86ISD::TESTP: return "X86ISD::TESTP";
23598 case X86ISD::TESTM: return "X86ISD::TESTM";
23599 case X86ISD::TESTNM: return "X86ISD::TESTNM";
23600 case X86ISD::KORTEST: return "X86ISD::KORTEST";
23601 case X86ISD::KTEST: return "X86ISD::KTEST";
23602 case X86ISD::PACKSS: return "X86ISD::PACKSS";
23603 case X86ISD::PACKUS: return "X86ISD::PACKUS";
23604 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
23605 case X86ISD::VALIGN: return "X86ISD::VALIGN";
23606 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
23607 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
23608 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
23609 case X86ISD::SHUFP: return "X86ISD::SHUFP";
23610 case X86ISD::SHUF128: return "X86ISD::SHUF128";
23611 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
23612 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
23613 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
23614 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
23615 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
23616 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
23617 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
23618 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
23619 case X86ISD::MOVSD: return "X86ISD::MOVSD";
23620 case X86ISD::MOVSS: return "X86ISD::MOVSS";
23621 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
23622 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
23623 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
23624 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
23625 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
23626 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
23627 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
23628 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
23629 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
23630 case X86ISD::VPERMV: return "X86ISD::VPERMV";
23631 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
23632 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
23633 case X86ISD::VPERMI: return "X86ISD::VPERMI";
23634 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
23635 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
23636 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
23637 case X86ISD::VRANGE: return "X86ISD::VRANGE";
23638 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
23639 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
23640 case X86ISD::PSADBW: return "X86ISD::PSADBW";
23641 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
23642 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
23643 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
23644 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
23645 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
23646 case X86ISD::MFENCE: return "X86ISD::MFENCE";
23647 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
23648 case X86ISD::SAHF: return "X86ISD::SAHF";
23649 case X86ISD::RDRAND: return "X86ISD::RDRAND";
23650 case X86ISD::RDSEED: return "X86ISD::RDSEED";
23651 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
23652 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
23653 case X86ISD::VPROT: return "X86ISD::VPROT";
23654 case X86ISD::VPROTI: return "X86ISD::VPROTI";
23655 case X86ISD::VPSHA: return "X86ISD::VPSHA";
23656 case X86ISD::VPSHL: return "X86ISD::VPSHL";
23657 case X86ISD::VPCOM: return "X86ISD::VPCOM";
23658 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
23659 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
23660 case X86ISD::FMADD: return "X86ISD::FMADD";
23661 case X86ISD::FMSUB: return "X86ISD::FMSUB";
23662 case X86ISD::FNMADD: return "X86ISD::FNMADD";
23663 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
23664 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
23665 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
23666 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
23667 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
23668 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
23669 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
23670 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
23671 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
23672 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
23673 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
23674 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
23675 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
23676 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
23677 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
23678 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
23679 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
23680 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
23681 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
23682 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
23683 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
23684 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
23685 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
23686 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
23687 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
23688 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
23689 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
23690 case X86ISD::XTEST: return "X86ISD::XTEST";
23691 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
23692 case X86ISD::EXPAND: return "X86ISD::EXPAND";
23693 case X86ISD::SELECT: return "X86ISD::SELECT";
23694 case X86ISD::SELECTS: return "X86ISD::SELECTS";
23695 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
23696 case X86ISD::RCP28: return "X86ISD::RCP28";
23697 case X86ISD::RCP28S: return "X86ISD::RCP28S";
23698 case X86ISD::EXP2: return "X86ISD::EXP2";
23699 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
23700 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
23701 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
23702 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
23703 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
23704 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
23705 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
23706 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
23707 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
23708 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
23709 case X86ISD::SCALEF: return "X86ISD::SCALEF";
23710 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
23711 case X86ISD::ADDS: return "X86ISD::ADDS";
23712 case X86ISD::SUBS: return "X86ISD::SUBS";
23713 case X86ISD::AVG: return "X86ISD::AVG";
23714 case X86ISD::MULHRS: return "X86ISD::MULHRS";
23715 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
23716 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
23717 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
23718 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
23719 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
23720 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
23721 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
23722 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
23723 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
23724 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
23725 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
23726 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
23727 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
23728 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
23729 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
23730 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
23731 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
23732 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
23733 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
23734 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
23735 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
23736 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
23737 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
23742 /// Return true if the addressing mode represented by AM is legal for this
23743 /// target, for a load/store of the specified type.
///
/// \param DL Data layout (not referenced by any line visible in this listing).
/// \param AM Addressing mode under test; the visible checks read BaseOffs,
///           BaseGV, HasBaseReg and Scale.
/// \param Ty Accessed type (not referenced by any line visible here).
/// \param AS Address space (not referenced by any line visible here).
///
/// NOTE(review): the statements guarded by the checks below, and the case
/// labels of the final switch, are on lines missing from this listing. Going
/// by the surrounding comments each failed check presumably rejects the mode
/// -- confirm against the full file.
23744 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
23745 const AddrMode &AM, Type *Ty,
23746 unsigned AS) const {
23747 // X86 supports extremely general addressing modes.
23748 CodeModel::Model M = getTargetMachine().getCodeModel();
23750 // X86 allows a sign-extended 32-bit immediate field as a displacement.
23751 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
// Classify how the base global (if any) has to be referenced.
23755 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
23757 // If a reference to this global requires an extra load, we can't fold it.
23758 if (isGlobalStubReference(GVFlags))
23761 // If BaseGV requires a register for the PIC base, we cannot also have a
23762 // BaseReg specified.
23763 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
23766 // If lower 4G is not available, then we must use rip-relative addressing.
23767 if ((M != CodeModel::Small || isPositionIndependent()) &&
23768 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
// Finally validate the scale factor.
23772 switch (AM.Scale) {
23778 // These scales always work.
23783 // These scales are formed with basereg+scalereg. Only accept if there is
23788 default: // Other stuff never works.
/// Decide, from the scalar bit width of Ty and the subtarget features, whether
/// a vector shift by a scalar amount is cheaper than a shift by a general
/// vector amount (see the inline comments for the individual rules).
///
/// NOTE(review): the 8-bit test and all return statements are on lines missing
/// from this listing; the inline comments state the intent of each step.
23795 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
23796 unsigned Bits = Ty->getScalarSizeInBits();
23798 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
23799 // particularly cheaper than those without.
23803 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
23804 // variable shifts just as cheap as scalar ones.
23805 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
23808 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
23809 // fully general vector.
// IR-type overload: for two integer types, the answer is whether Ty1 has
// strictly more primitive bits than Ty2 (i.e. Ty1 -> Ty2 really narrows).
// NOTE(review): the statement taken when either type is not an integer is not
// visible in this excerpt (presumably an early `return false`).
23813 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
23814 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23816 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
23817 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
23818 return NumBits1 > NumBits2;
// Decides whether a truncation from Ty1 to Ty2 is acceptable in tail-call
// position. Visible checks: both types must be integer types and Ty1 must map
// to a legal value type; Ty1 is then asserted to be at most 64 bits wide.
// NOTE(review): the statements guarded by the two `if`s and the final return
// are not visible in this excerpt; per the trailing comment the surviving
// path presumably reports the truncation as allowed -- confirm.
23821 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
23822 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
23825 if (!isTypeLegal(EVT::getEVT(Ty1)))
// Debug-build sanity check only: wider-than-64-bit legal integers are not
// expected here.
23828 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
23830 // Assuming the caller doesn't have a zeroext or signext return parameter,
23831 // truncation all the way down to i1 is valid.
// An integer-compare immediate is accepted exactly when isInt<32>(Imm) holds
// (presumably: Imm is representable as a signed 32-bit value -- confirm
// against the isInt helper's definition).
23835 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
23836 return isInt<32>(Imm);
// An add immediate is accepted exactly when isInt<32>(Imm) holds, the same
// test used for compare immediates in this file.
23839 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
23840 // Can also use sub to handle negated immediates.
23841 return isInt<32>(Imm);
// EVT overload: for two integer value types, the answer is whether VT1 is
// strictly wider (in bits) than VT2.
// NOTE(review): the statement taken when either type is not an integer is not
// visible in this excerpt (presumably an early `return false`).
23844 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
23845 if (!VT1.isInteger() || !VT2.isInteger())
23847 unsigned NumBits1 = VT1.getSizeInBits();
23848 unsigned NumBits2 = VT2.getSizeInBits();
23849 return NumBits1 > NumBits2;
// IR-type overload: true only for an i32 -> i64 extension on a 64-bit
// subtarget; every other type pair yields false.
23852 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
23853 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23854 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
// EVT overload: true only for MVT::i32 -> MVT::i64 on a 64-bit subtarget;
// every other type pair yields false.
23857 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
23858 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
23859 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
// SDValue overload: first defers to the EVT overload on Val's value type, then
// examines Val itself. The visible checks require Val to be an ISD::LOAD and
// both types to be simple integer types before switching on VT1's simple type.
// NOTE(review): the statements guarded by each `if` and the switch's case
// labels are not visible in this excerpt; per the trailing comment the switch
// presumably accepts the load widths that have zero-extending forms.
23862 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
23863 EVT VT1 = Val.getValueType();
23864 if (isZExtFree(VT1, VT2))
23867 if (Val.getOpcode() != ISD::LOAD)
23870 if (!VT1.isSimple() || !VT1.isInteger() ||
23871 !VT2.isSimple() || !VT2.isInteger())
23874 switch (VT1.getSimpleVT().SimpleTy) {
23879 // X86 has 8, 16, and 32-bit zero-extending loads.
// Unconditionally answers true; the SDValue argument is unnamed and unused.
23886 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
// Decides whether a fused multiply-add should be preferred over separate
// multiply and add for VT. Visible steps: require Subtarget.hasAnyFMA(),
// reduce VT to its scalar element type, require that type to be simple, then
// switch on it.
// NOTE(review): the return-type line, the guarded statements and the switch's
// case labels are not visible in this excerpt -- confirm which scalar types
// are accepted against the full file.
23889 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
23890 if (!Subtarget.hasAnyFMA())
// From here on only the element type matters, so vectors and scalars are
// treated alike.
23893 VT = VT.getScalarType();
23895 if (!VT.isSimple())
23898 switch (VT.getSimpleVT().SimpleTy) {
// Narrowing is reported as profitable for every type pair except
// MVT::i32 -> MVT::i16.
23909 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
23910 // i16 instructions are longer (0x66 prefix) and potentially slower.
23911 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
// X86 override: the visible logic never inspects the mask M; legality depends
// only on VT. VT must be simple; i1-element vectors and 64-bit-wide vectors
// are tested for separately; otherwise the answer is isTypeLegal(VT).
// NOTE(review): the return-type line, the rest of the parameter list and the
// statements guarded by the three `if`s are not visible in this excerpt.
23914 /// Targets can use this to indicate that they only support *some*
23915 /// VECTOR_SHUFFLE operations, those with specific masks.
23916 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
23917 /// are assumed to be legal.
23919 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
23921 if (!VT.isSimple())
23924 // Not for i1 vectors
23925 if (VT.getSimpleVT().getScalarType() == MVT::i1)
23928 // Very little shuffling can be done for 64-bit vectors right now.
23929 if (VT.getSimpleVT().getSizeInBits() == 64)
23932 // We only care that the types being shuffled are legal. The lowering can
23933 // handle any possible shuffle mask that results.
23934 return isTypeLegal(VT.getSimpleVT());
// Forwards Mask and VT unchanged to isShuffleMaskLegal and returns its answer.
// NOTE(review): the return-type line and the rest of the parameter list are
// not visible in this excerpt.
23938 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
23940 // Just delegate to the generic legality, clear masks aren't special.
23941 return isShuffleMaskLegal(Mask, VT);
23944 //===----------------------------------------------------------------------===//
23945 // X86 Scheduler Hooks
23946 //===----------------------------------------------------------------------===//
// Expands the xbegin pseudo MI by splitting MBB into three blocks:
//   thisMBB (the original block): ends with XBEGIN_4 whose target is sinkMBB;
//   mainMBB: loads -1 into EAX, then continues into sinkMBB;
//   sinkMBB: receives everything that followed MI plus MBB's old successors,
//            and starts with a COPY into MI's result register.
// MI is erased at the end.
// NOTE(review): the COPY's source operand and the function's return statement
// are not visible in this excerpt (presumably EAX and sinkMBB respectively).
23948 /// Utility function to emit xbegin specifying the start of an RTM region.
23949 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
23950 const TargetInstrInfo *TII) {
23951 DebugLoc DL = MI.getDebugLoc();
23953 const BasicBlock *BB = MBB->getBasicBlock();
// Insertion point: immediately after MBB in the function's block list.
23954 MachineFunction::iterator I = ++MBB->getIterator();
23956 // For the v = xbegin(), we generate
23967 MachineBasicBlock *thisMBB = MBB;
23968 MachineFunction *MF = MBB->getParent();
23969 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
23970 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
// Both inserts use the same iterator, so the layout becomes
// MBB, mainMBB, sinkMBB.
23971 MF->insert(I, mainMBB);
23972 MF->insert(I, sinkMBB);
23974 // Transfer the remainder of BB and its successor edges to sinkMBB.
23975 sinkMBB->splice(sinkMBB->begin(), MBB,
23976 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
23977 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
23981 // # fallthrough to mainMBB
23982 // # abortion to sinkMBB
23983 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
23984 thisMBB->addSuccessor(mainMBB);
23985 thisMBB->addSuccessor(sinkMBB);
// mainMBB materialises the constant -1 in EAX.
23989 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
23990 mainMBB->addSuccessor(sinkMBB);
23993 // EAX is live into the sinkMBB
23994 sinkMBB->addLiveIn(X86::EAX);
// The COPY is placed at the very front of sinkMBB, ahead of the spliced code.
23995 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
23996 MI.getOperand(0).getReg())
23999 MI.eraseFromParent();
// Expands a PCMP{I,E}STRM128{REG,MEM} pseudo (SSE or VEX form) into the
// matching real rr/rm instruction, inserted before MI. Operands 1..N-1 of the
// pseudo are forwarded except implicit register operands, the memory operand
// list is carried over when MI has exactly one memoperand, and the result is
// then copied out of XMM0 into the pseudo's destination register (operand 0).
// MI is erased afterwards.
// NOTE(review): the declaration of `Opc` and the return statement are not
// visible in this excerpt.
24003 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24004 // or XMM0_V32I8 in AVX all of this code can be replaced with that
24005 // in the .td file.
24006 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24007 const TargetInstrInfo *TII) {
// Map each pseudo opcode to its real counterpart; any other opcode is a
// programming error.
24009 switch (MI.getOpcode()) {
24010 default: llvm_unreachable("illegal opcode!");
24011 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24012 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24013 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24014 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24015 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24016 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24017 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24018 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24021 DebugLoc dl = MI.getDebugLoc();
24022 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
// Start at 1: operand 0 is the pseudo's result, which the real instruction
// does not take as an explicit operand.
24024 unsigned NumArgs = MI.getNumOperands();
24025 for (unsigned i = 1; i < NumArgs; ++i) {
24026 MachineOperand &Op = MI.getOperand(i);
24027 if (!(Op.isReg() && Op.isImplicit()))
24028 MIB.addOperand(Op);
24030 if (MI.hasOneMemOperand())
24031 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24033 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24034 .addReg(X86::XMM0);
24036 MI.eraseFromParent();
// Expands a PCMP{I,E}STRI{REG,MEM} pseudo (SSE or VEX form) into the matching
// real rr/rm instruction, inserted before MI. Operands 1..N-1 are forwarded
// except implicit register operands, memory operands are carried over when MI
// has exactly one, and a COPY into the pseudo's destination register
// (operand 0) follows. MI is erased afterwards.
// NOTE(review): the declaration of `Opc`, the COPY's source operand and the
// return statement are not visible in this excerpt (the source is presumably
// the instruction's fixed index-result register -- confirm).
24040 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24041 // defs in an instruction pattern
24042 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24043 const TargetInstrInfo *TII) {
// Map each pseudo opcode to its real counterpart; any other opcode is a
// programming error.
24045 switch (MI.getOpcode()) {
24046 default: llvm_unreachable("illegal opcode!");
24047 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24048 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24049 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24050 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24051 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24052 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24053 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24054 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24057 DebugLoc dl = MI.getDebugLoc();
24058 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24060 unsigned NumArgs = MI.getNumOperands(); // remove the results
24061 for (unsigned i = 1; i < NumArgs; ++i) {
24062 MachineOperand &Op = MI.getOperand(i);
24063 if (!(Op.isReg() && Op.isImplicit()))
24064 MIB.addOperand(Op);
24066 if (MI.hasOneMemOperand())
24067 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24069 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24072 MI.eraseFromParent();
// Expands the WRPKRU pseudo in place (all new instructions are inserted
// before MI): the pseudo's operand 0 is copied into EAX, ECX and EDX are
// zeroed with MOV32r0, then the operand-less WRPKRUr is emitted and MI is
// erased.
// NOTE(review): the return statement is not visible in this excerpt.
24076 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24077 const X86Subtarget &Subtarget) {
24078 DebugLoc dl = MI.getDebugLoc();
24079 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24081 // insert input VAL into EAX
24082 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24083 .addReg(MI.getOperand(0).getReg());
24084 // insert zero to ECX
24085 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24087 // insert zero to EDX
24088 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24090 // insert WRPKRU instruction
24091 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24093 MI.eraseFromParent(); // The pseudo is gone now.
// Expands the RDPKRU pseudo in place (all new instructions are inserted
// before MI): ECX is zeroed with MOV32r0, the operand-less RDPKRUr is emitted,
// and a COPY into the pseudo's destination register (operand 0) follows before
// MI is erased.
// NOTE(review): the COPY's source operand and the return statement are not
// visible in this excerpt (the source is presumably EAX -- confirm).
24097 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24098 const X86Subtarget &Subtarget) {
24099 DebugLoc dl = MI.getDebugLoc();
24100 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24102 // insert zero to ECX
24103 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24105 // insert RDPKRU instruction
24106 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24107 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24110 MI.eraseFromParent(); // The pseudo is gone now.
// Expands a monitor-style pseudo in place. The pseudo's first
// X86::AddrNumOperands operands form a memory address, which is materialised
// with LEA into RAX (64-bit subtarget) or EAX (otherwise); the next two
// operands are copied into ECX and EDX; finally the operand-less instruction
// `Opc` is emitted and MI is erased.
// NOTE(review): the declaration of `Opc` (presumably the last parameter) and
// the return statement are not visible in this excerpt.
24114 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24115 const X86Subtarget &Subtarget,
24117 DebugLoc dl = MI.getDebugLoc();
24118 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24119 // Address into RAX/EAX, other two args into ECX, EDX.
24120 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24121 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24122 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
// Forward the address operands of the pseudo to the LEA unchanged.
24123 for (int i = 0; i < X86::AddrNumOperands; ++i)
24124 MIB.addOperand(MI.getOperand(i));
// Index of the first non-address operand.
24126 unsigned ValOps = X86::AddrNumOperands;
24127 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24128 .addReg(MI.getOperand(ValOps).getReg());
24129 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24130 .addReg(MI.getOperand(ValOps + 1).getReg());
24132 // The instruction doesn't actually take any operands though.
24133 BuildMI(*BB, MI, dl, TII->get(Opc));
24135 MI.eraseFromParent(); // The pseudo is gone now.
// Expands the VAARG_64 pseudo into explicit machine code that fetches the
// address of the next variadic argument and advances the va_list.
//
// Two shapes are produced, selected by ArgMode (operand 7):
//  * ArgMode neither 1 nor 2: no new blocks; the overflow-area code is emitted
//    straight into the current block and writes DestReg directly.
//  * ArgMode 1 (gp_offset) or 2 (fp_offset): the block is split into
//    thisMBB -> {offsetMBB, overflowMBB} -> endMBB. thisMBB loads the offset
//    field and compares it against a bound; offsetMBB computes the address
//    from reg_save_area and bumps the offset; overflowMBB computes it from
//    overflow_area and bumps that pointer; endMBB merges both with a PHI.
//
// NOTE(review): several structural lines (else branches, the `if (offsetMBB)`
// and `if (NeedsAlign)` style guards, some operand lines and the return) are
// not visible in this excerpt; comments below that depend on them are hedged.
24139 MachineBasicBlock *
24140 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24141 MachineBasicBlock *MBB) const {
24142 // Emit va_arg instruction on X86-64.
24144 // Operands to this pseudo-instruction:
24145 // 0 ) Output : destination address (reg)
24146 // 1-5) Input : va_list address (addr, i64mem)
24147 // 6 ) ArgSize : Size (in bytes) of vararg type
24148 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24149 // 8 ) Align : Alignment of type
24150 // 9 ) EFLAGS (implicit-def)
24152 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24153 static_assert(X86::AddrNumOperands == 5,
24154 "VAARG_64 assumes 5 address operands");
// Unpack the pseudo's operands according to the layout listed above.
24156 unsigned DestReg = MI.getOperand(0).getReg();
24157 MachineOperand &Base = MI.getOperand(1);
24158 MachineOperand &Scale = MI.getOperand(2);
24159 MachineOperand &Index = MI.getOperand(3);
24160 MachineOperand &Disp = MI.getOperand(4);
24161 MachineOperand &Segment = MI.getOperand(5);
24162 unsigned ArgSize = MI.getOperand(6).getImm();
24163 unsigned ArgMode = MI.getOperand(7).getImm();
24164 unsigned Align = MI.getOperand(8).getImm();
24166 // Memory Reference
// The same memoperand range is attached to every va_list load/store emitted
// below.
24167 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24168 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24169 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24171 // Machine Information
24172 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24173 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24174 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24175 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24176 DebugLoc DL = MI.getDebugLoc();
24178 // struct va_list {
24181 // i64 overflow_area (address)
24182 // i64 reg_save_area (address)
24184 // sizeof(va_list) = 24
24185 // alignment(va_list) = 8
24187 unsigned TotalNumIntRegs = 6;
24188 unsigned TotalNumXMMRegs = 8;
24189 bool UseGPOffset = (ArgMode == 1);
24190 bool UseFPOffset = (ArgMode == 2);
// Bound on the offset field: 6*8 = 48 bytes of integer registers, plus
// 8*16 = 128 bytes of XMM registers when walking fp_offset (176 total).
24191 unsigned MaxOffset = TotalNumIntRegs * 8 +
24192 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
24194 /* Align ArgSize to a multiple of 8 */
24195 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24196 bool NeedsAlign = (Align > 8);
24198 MachineBasicBlock *thisMBB = MBB;
24199 MachineBasicBlock *overflowMBB;
24200 MachineBasicBlock *offsetMBB;
24201 MachineBasicBlock *endMBB;
24203 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24204 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24205 unsigned OffsetReg = 0;
24207 if (!UseGPOffset && !UseFPOffset) {
24208 // If we only pull from the overflow region, we don't create a branch.
24209 // We don't need to alter control flow.
24210 OffsetDestReg = 0; // unused
24211 OverflowDestReg = DestReg;
24213 offsetMBB = nullptr;
24214 overflowMBB = thisMBB;
// NOTE(review): what follows is presumably the `else` branch (register-area
// path); the `} else {` line itself is not visible in this excerpt.
24217 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24218 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24219 // If not, pull from overflow_area. (branch to overflowMBB)
24224 // offsetMBB overflowMBB
24229 // Registers for the PHI in endMBB
24230 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24231 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24233 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24234 MachineFunction *MF = MBB->getParent();
24235 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24236 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24237 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24239 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24241 // Insert the new basic blocks
// Layout after insertion: MBB, offsetMBB, overflowMBB, endMBB.
24242 MF->insert(MBBIter, offsetMBB);
24243 MF->insert(MBBIter, overflowMBB);
24244 MF->insert(MBBIter, endMBB);
24246 // Transfer the remainder of MBB and its successor edges to endMBB.
24247 endMBB->splice(endMBB->begin(), thisMBB,
24248 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24249 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24251 // Make offsetMBB and overflowMBB successors of thisMBB
24252 thisMBB->addSuccessor(offsetMBB);
24253 thisMBB->addSuccessor(overflowMBB);
24255 // endMBB is a successor of both offsetMBB and overflowMBB
24256 offsetMBB->addSuccessor(endMBB);
24257 overflowMBB->addSuccessor(endMBB);
24259 // Load the offset value into a register
// The displacement adjustment is 4 when using fp_offset and 0 when using
// gp_offset, selecting which 32-bit field of the va_list is read.
24260 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24261 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
24265 .addDisp(Disp, UseFPOffset ? 4 : 0)
24266 .addOperand(Segment)
24267 .setMemRefs(MMOBegin, MMOEnd);
24269 // Check if there is enough room left to pull this argument.
// The immediate is the first offset value at which an ArgSizeA8-byte argument
// no longer fits below MaxOffset.
24270 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
24272 .addImm(MaxOffset + 8 - ArgSizeA8);
24274 // Branch to "overflowMBB" if offset >= max
24275 // Fall through to "offsetMBB" otherwise
24276 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
24277 .addMBB(overflowMBB);
24280 // In offsetMBB, emit code to use the reg_save_area.
// NOTE(review): presumably guarded by a check that offsetMBB is non-null; the
// guard line is not visible in this excerpt.
24282 assert(OffsetReg != 0);
24284 // Read the reg_save_area address.
24285 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
24286 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
24291 .addOperand(Segment)
24292 .setMemRefs(MMOBegin, MMOEnd);
24294 // Zero-extend the offset
24295 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
24296 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
24299 .addImm(X86::sub_32bit);
24301 // Add the offset to the reg_save_area to get the final address.
24302 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
24303 .addReg(OffsetReg64)
24304 .addReg(RegSaveReg);
24306 // Compute the offset for the next argument
// Step is 16 bytes per slot for fp_offset and 8 bytes per slot for gp_offset.
24307 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24308 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
24310 .addImm(UseFPOffset ? 16 : 8);
24312 // Store it back into the va_list.
24313 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
24317 .addDisp(Disp, UseFPOffset ? 4 : 0)
24318 .addOperand(Segment)
24319 .addReg(NextOffsetReg)
24320 .setMemRefs(MMOBegin, MMOEnd);
// Unconditional jump out of offsetMBB; the target operand line is not visible
// in this excerpt (presumably endMBB, its only successor).
24323 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
24328 // Emit code to use overflow area
24331 // Load the overflow_area address into a register.
24332 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
24333 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
24338 .addOperand(Segment)
24339 .setMemRefs(MMOBegin, MMOEnd);
24341 // If we need to align it, do so. Otherwise, just copy the address
24342 // to OverflowDestReg.
// NOTE(review): the ADD/AND pair below is presumably taken when NeedsAlign is
// set and the COPY otherwise; the `if`/`else` lines are not visible here.
24344 // Align the overflow address
24345 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
24346 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
24348 // aligned_addr = (addr + (align-1)) & ~(align-1)
24349 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
24350 .addReg(OverflowAddrReg)
24353 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
24355 .addImm(~(uint64_t)(Align-1));
24357 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
24358 .addReg(OverflowAddrReg);
24361 // Compute the next overflow address after this argument.
24362 // (the overflow address should be kept 8-byte aligned)
24363 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
24364 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
24365 .addReg(OverflowDestReg)
24366 .addImm(ArgSizeA8);
24368 // Store the new overflow address.
24369 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
24374 .addOperand(Segment)
24375 .addReg(NextAddrReg)
24376 .setMemRefs(MMOBegin, MMOEnd);
24378 // If we branched, emit the PHI to the front of endMBB.
// DestReg receives OffsetDestReg when arriving from offsetMBB and
// OverflowDestReg when arriving from overflowMBB.
24380 BuildMI(*endMBB, endMBB->begin(), DL,
24381 TII->get(X86::PHI), DestReg)
24382 .addReg(OffsetDestReg).addMBB(offsetMBB)
24383 .addReg(OverflowDestReg).addMBB(overflowMBB);
24386 // Erase the pseudo instruction
24387 MI.eraseFromParent();
// Expands the pseudo that spills the XMM argument registers for va_start.
// MBB is split into MBB -> XMMSaveMBB -> EndMBB; XMMSaveMBB holds one aligned
// 16-byte store per XMM operand of the pseudo, and EndMBB receives everything
// that followed MI. Unless the function uses the Win64 calling convention, a
// TEST8rr/JE_1 pair on the count register is emitted in MBB so the save block
// is skipped when the count is zero.
// Pseudo operand layout as read below: 0 = count register, 1 = register-save
// frame index (immediate), 2 = FP offset within that area (immediate),
// 3 .. N-2 = registers to store; the last operand is excluded from the loop.
// NOTE(review): the return statement is not visible in this excerpt.
24392 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
24393 MachineInstr &MI, MachineBasicBlock *MBB) const {
24394 // Emit code to save XMM registers to the stack. The ABI says that the
24395 // number of registers to save is given in %al, so it's theoretically
24396 // possible to do an indirect jump trick to avoid saving all of them,
24397 // however this code takes a simpler approach and just executes all
24398 // of the stores if %al is non-zero. It's less code, and it's probably
24399 // easier on the hardware branch predictor, and stores aren't all that
24400 // expensive anyway.
24402 // Create the new basic blocks. One block contains all the XMM stores,
24403 // and one block is the final destination regardless of whether any
24404 // stores were performed.
24405 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24406 MachineFunction *F = MBB->getParent();
24407 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24408 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
24409 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
24410 F->insert(MBBIter, XMMSaveMBB);
24411 F->insert(MBBIter, EndMBB);
24413 // Transfer the remainder of MBB and its successor edges to EndMBB.
24414 EndMBB->splice(EndMBB->begin(), MBB,
24415 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24416 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
24418 // The original block will now fall through to the XMM save block.
24419 MBB->addSuccessor(XMMSaveMBB);
24420 // The XMMSaveMBB will fall through to the end block.
24421 XMMSaveMBB->addSuccessor(EndMBB);
24423 // Now add the instructions.
24424 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24425 DebugLoc DL = MI.getDebugLoc();
24427 unsigned CountReg = MI.getOperand(0).getReg();
24428 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
24429 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
// For Win64 functions no test/branch is emitted, so the stores always run.
24431 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
24432 // If %al is 0, branch around the XMM save block.
24433 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
24434 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
24435 MBB->addSuccessor(EndMBB);
24438 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
24439 // that was just emitted, but clearly shouldn't be "saved".
24440 assert((MI.getNumOperands() <= 3 ||
24441 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
24442 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
24443 "Expected last argument to be EFLAGS");
// Use the VEX-encoded store when the subtarget reports hasFp256().
24444 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
24445 // In the XMM save block, save all the XMM argument registers.
// Operand i is stored at byte offset (i - 3) * 16 past VarArgsFPOffset within
// the register-save stack object; `e` stops one short of the final operand.
24446 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
24447 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
24448 MachineMemOperand *MMO = F->getMachineMemOperand(
24449 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
24450 MachineMemOperand::MOStore,
24451 /*Size=*/16, /*Align=*/16);
24452 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
24453 .addFrameIndex(RegSaveFrameIndex)
24454 .addImm(/*Scale=*/1)
24455 .addReg(/*IndexReg=*/0)
24456 .addImm(/*Disp=*/Offset)
24457 .addReg(/*Segment=*/0)
24458 .addReg(MI.getOperand(i).getReg())
24459 .addMemOperand(MMO);
24462 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Determines whether the instruction at SelectItr is the last reader of
// EFLAGS and, if so, records that on the instruction.
// Procedure: scan the instructions after SelectItr in BB. A reader of EFLAGS
// means the value is still needed; a definer of EFLAGS ends the scan. If the
// scan reaches the end of BB, each successor's live-in list is consulted. When
// neither a later reader nor a live-in successor is found, a kill of EFLAGS is
// added to the instruction via addRegisterKilled.
// NOTE(review): the early-exit statements after the "reads" test and the
// "live-in" test, and the final return, are not visible in this excerpt;
// presumably they return false and the fall-through path returns true.
24467 // The EFLAGS operand of SelectItr might be missing a kill marker
24468 // because there were multiple uses of EFLAGS, and ISel didn't know
24469 // which to mark. Figure out whether SelectItr should have had a
24470 // kill marker, and set it if it should. Returns the correct kill
24472 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
24473 MachineBasicBlock* BB,
24474 const TargetRegisterInfo* TRI) {
24475 // Scan forward through BB for a use/def of EFLAGS.
// miI is declared outside the loop so its final position can be inspected
// afterwards.
24476 MachineBasicBlock::iterator miI(std::next(SelectItr));
24477 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
24478 const MachineInstr& mi = *miI;
24479 if (mi.readsRegister(X86::EFLAGS))
24481 if (mi.definesRegister(X86::EFLAGS))
24482 break; // Should have kill-flag - update below.
24485 // If we hit the end of the block, check whether EFLAGS is live into a
24487 if (miI == BB->end()) {
24488 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
24489 sEnd = BB->succ_end();
24490 sItr != sEnd; ++sItr) {
24491 MachineBasicBlock* succ = *sItr;
24492 if (succ->isLiveIn(X86::EFLAGS))
24497 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
24498 // out. SelectMI should have a kill flag on EFLAGS.
24499 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
// Classifies MI by opcode: the CMOV_* pseudo opcodes listed below (scalar FP,
// 8/16/32-bit GPR, x87 RFP, vector and mask-register variants) form the
// accepted set.
// NOTE(review): the statements following the case list (the accepted and
// default results) are not visible in this excerpt.
24503 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
24504 // together with other CMOV pseudo-opcodes into a single basic-block with
24505 // conditional jump around it.
24506 static bool isCMOVPseudo(MachineInstr &MI) {
24507 switch (MI.getOpcode()) {
24508 case X86::CMOV_FR32:
24509 case X86::CMOV_FR64:
24510 case X86::CMOV_GR8:
24511 case X86::CMOV_GR16:
24512 case X86::CMOV_GR32:
24513 case X86::CMOV_RFP32:
24514 case X86::CMOV_RFP64:
24515 case X86::CMOV_RFP80:
24516 case X86::CMOV_V2F64:
24517 case X86::CMOV_V2I64:
24518 case X86::CMOV_V4F32:
24519 case X86::CMOV_V4F64:
24520 case X86::CMOV_V4I64:
24521 case X86::CMOV_V16F32:
24522 case X86::CMOV_V8F32:
24523 case X86::CMOV_V8F64:
24524 case X86::CMOV_V8I64:
24525 case X86::CMOV_V8I1:
24526 case X86::CMOV_V16I1:
24527 case X86::CMOV_V32I1:
24528 case X86::CMOV_V64I1:
// Lowers one or more CMOV pseudo instructions into explicit control flow:
// a conditional branch in BB, a fall-through block (copy0MBB) and a join block
// (sinkMBB) that begins with one PHI per lowered CMOV.
//
// Pseudo operand layout as used below: 0 = result register, 1 and 2 = the two
// candidate values, 3 = condition code immediate. In each PHI, operand 1 is
// the value arriving from copy0MBB and operand 2 the value arriving from the
// branching block(s); the two are swapped for CMOVs using the opposite
// condition.
//
// Two multi-CMOV shapes are recognised (see the long comment below):
//  * case 1 - a run of consecutive CMOV pseudos on the same or the opposite
//    condition shares a single branch;
//  * case 2 - a "cascaded" second CMOV of the same opcode consuming the first
//    one's result gets an extra block (jcc1MBB) with a second branch.
// NOTE(review): a few structural lines (iterator advance inside the run scan,
// `else` keywords, closing braces, the return) are not visible in this
// excerpt; presumably the function returns sinkMBB -- confirm.
24536 MachineBasicBlock *
24537 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
24538 MachineBasicBlock *BB) const {
24539 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24540 DebugLoc DL = MI.getDebugLoc();
24542 // To "insert" a SELECT_CC instruction, we actually have to insert the
24543 // diamond control-flow pattern. The incoming instruction knows the
24544 // destination vreg to set, the condition code register to branch on, the
24545 // true/false values to select between, and a branch opcode to use.
24546 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24547 MachineFunction::iterator It = ++BB->getIterator();
24552 // cmpTY ccX, r1, r2
24554 // fallthrough --> copy0MBB
24555 MachineBasicBlock *thisMBB = BB;
24556 MachineFunction *F = BB->getParent();
24558 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
24559 // as described above, by inserting a BB, and then making a PHI at the join
24560 // point to select the true and false operands of the CMOV in the PHI.
24562 // The code also handles two different cases of multiple CMOV opcodes
24566 // In this case, there are multiple CMOVs in a row, all which are based on
24567 // the same condition setting (or the exact opposite condition setting).
24568 // In this case we can lower all the CMOVs using a single inserted BB, and
24569 // then make a number of PHIs at the join point to model the CMOVs. The only
24570 // trickiness here, is that in a case like:
24572 // t2 = CMOV cond1 t1, f1
24573 // t3 = CMOV cond1 t2, f2
24575 // when rewriting this into PHIs, we have to perform some renaming on the
24576 // temps since you cannot have a PHI operand refer to a PHI result earlier
24577 // in the same block. The "simple" but wrong lowering would be:
24579 // t2 = PHI t1(BB1), f1(BB2)
24580 // t3 = PHI t2(BB1), f2(BB2)
24582 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
24583 // renaming is to note that on the path through BB1, t2 is really just a
24584 // copy of t1, and do that renaming, properly generating:
24586 // t2 = PHI t1(BB1), f1(BB2)
24587 // t3 = PHI t1(BB1), f2(BB2)
24589 // Case 2, we lower cascaded CMOVs such as
24591 // (CMOV (CMOV F, T, cc1), T, cc2)
24593 // to two successive branches. For that, we look for another CMOV as the
24594 // following instruction.
24596 // Without this, we would add a PHI between the two jumps, which ends up
24597 // creating a few copies all around. For instance, for
24599 // (sitofp (zext (fcmp une)))
24601 // we would generate:
24603 // ucomiss %xmm1, %xmm0
24604 // movss <1.0f>, %xmm0
24605 // movaps %xmm0, %xmm1
24607 // xorps %xmm1, %xmm1
24610 // movaps %xmm1, %xmm0
24614 // because this custom-inserter would have generated:
24626 // A: X = ...; Y = ...
24628 // C: Z = PHI [X, A], [Y, B]
24630 // E: PHI [X, C], [Z, D]
24632 // If we lower both CMOVs in a single step, we can instead generate:
24644 // A: X = ...; Y = ...
24646 // E: PHI [X, A], [X, C], [Y, D]
24648 // Which, in our sitofp/fcmp example, gives us something like:
24650 // ucomiss %xmm1, %xmm0
24651 // movss <1.0f>, %xmm0
24654 // xorps %xmm0, %xmm0
24658 MachineInstr *CascadedCMOV = nullptr;
24659 MachineInstr *LastCMOV = &MI;
// CC is the condition carried by MI (operand 3); OppCC is its inverse. A CMOV
// on OppCC can share the same branch with its PHI inputs swapped.
24660 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
24661 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
24662 MachineBasicBlock::iterator NextMIIt =
24663 std::next(MachineBasicBlock::iterator(MI));
24665 // Check for case 1, where there are multiple CMOVs with the same condition
24666 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
24667 // number of jumps the most.
24669 if (isCMOVPseudo(MI)) {
24670 // See if we have a string of CMOVS with the same condition.
24671 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
24672 (NextMIIt->getOperand(3).getImm() == CC ||
24673 NextMIIt->getOperand(3).getImm() == OppCC)) {
24674 LastCMOV = &*NextMIIt;
24679 // This checks for case 2, but only do this if we didn't already find
24680 // case 1, as indicated by LastCMOV == MI.
// Requirements checked: same opcode as MI, same operand 2, the next CMOV's
// operand 1 is MI's result, and that use is marked as a kill.
24681 if (LastCMOV == &MI && NextMIIt != BB->end() &&
24682 NextMIIt->getOpcode() == MI.getOpcode() &&
24683 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
24684 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
24685 NextMIIt->getOperand(1).isKill()) {
24686 CascadedCMOV = &*NextMIIt;
24689 MachineBasicBlock *jcc1MBB = nullptr;
24691 // If we have a cascaded CMOV, we lower it to two successive branches to
24692 // the same block. EFLAGS is used by both, so mark it as live in the second.
24693 if (CascadedCMOV) {
24694 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
24695 F->insert(It, jcc1MBB);
24696 jcc1MBB->addLiveIn(X86::EFLAGS);
24699 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
24700 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
24701 F->insert(It, copy0MBB);
24702 F->insert(It, sinkMBB);
24704 // If the EFLAGS register isn't dead in the terminator, then claim that it's
24705 // live into the sink and copy blocks.
24706 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// The kill check runs on the last instruction that will read EFLAGS: the
// cascaded CMOV if there is one, otherwise the last CMOV of the run.
24708 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
24709 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
24710 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
24711 copy0MBB->addLiveIn(X86::EFLAGS);
24712 sinkMBB->addLiveIn(X86::EFLAGS);
24715 // Transfer the remainder of BB and its successor edges to sinkMBB.
// In the cascaded case LastCMOV is still MI, so the cascaded CMOV moves into
// sinkMBB along with the rest; it is erased further down.
24716 sinkMBB->splice(sinkMBB->begin(), BB,
24717 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
24718 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
24720 // Add the true and fallthrough blocks as its successors.
24721 if (CascadedCMOV) {
24722 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
24723 BB->addSuccessor(jcc1MBB);
24725 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
24726 // jump to the sinkMBB.
24727 jcc1MBB->addSuccessor(copy0MBB);
24728 jcc1MBB->addSuccessor(sinkMBB);
// NOTE(review): presumably the `else` arm of the test above; the `} else {`
// line is not visible in this excerpt.
24730 BB->addSuccessor(copy0MBB);
24733 // The true block target of the first (or only) branch is always sinkMBB.
24734 BB->addSuccessor(sinkMBB);
24736 // Create the conditional branch instruction.
24737 unsigned Opc = X86::GetCondBranchFromCond(CC);
24738 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
// The second branch, placed in jcc1MBB, uses the cascaded CMOV's own
// condition code.
24740 if (CascadedCMOV) {
24741 unsigned Opc2 = X86::GetCondBranchFromCond(
24742 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
24743 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
24747 // %FalseValue = ...
24748 // # fallthrough to sinkMBB
24749 copy0MBB->addSuccessor(sinkMBB);
24752 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
24754 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
24755 MachineBasicBlock::iterator MIItEnd =
24756 std::next(MachineBasicBlock::iterator(LastCMOV));
24757 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
// Maps each already-emitted PHI's result register to the pair
// (value from copy0MBB, value from thisMBB) that fed it.
24758 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
24759 MachineInstrBuilder MIB;
24761 // As we are creating the PHIs, we have to be careful if there is more than
24762 // one. Later CMOVs may reference the results of earlier CMOVs, but later
24763 // PHIs have to reference the individual true/false inputs from earlier PHIs.
24764 // That also means that PHI construction must work forward from earlier to
24765 // later, and that the code must maintain a mapping from earlier PHI's
24766 // destination registers, and the registers that went into the PHI.
24768 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
24769 unsigned DestReg = MIIt->getOperand(0).getReg();
24770 unsigned Op1Reg = MIIt->getOperand(1).getReg();
24771 unsigned Op2Reg = MIIt->getOperand(2).getReg();
24773 // If this CMOV we are generating is the opposite condition from
24774 // the jump we generated, then we have to swap the operands for the
24775 // PHI that is going to be generated.
24776 if (MIIt->getOperand(3).getImm() == OppCC)
24777 std::swap(Op1Reg, Op2Reg);
// If an input is itself the result of an earlier PHI in this run, substitute
// the value that earlier PHI takes on the corresponding incoming edge.
24779 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
24780 Op1Reg = RegRewriteTable[Op1Reg].first;
24782 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
24783 Op2Reg = RegRewriteTable[Op2Reg].second;
24785 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
24786 TII->get(X86::PHI), DestReg)
24787 .addReg(Op1Reg).addMBB(copy0MBB)
24788 .addReg(Op2Reg).addMBB(thisMBB);
24790 // Add this PHI to the rewrite table.
24791 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
24794 // If we have a cascaded CMOV, the second Jcc provides the same incoming
24795 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
// MIB still refers to the last PHI built by the loop above; in the cascaded
// case that is the only PHI, and it gains a third incoming edge from jcc1MBB.
24796 if (CascadedCMOV) {
24797 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
24798 // Copy the PHI result to the register defined by the second CMOV.
24799 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
24800 DL, TII->get(TargetOpcode::COPY),
24801 CascadedCMOV->getOperand(0).getReg())
24802 .addReg(MI.getOperand(0).getReg());
24803 CascadedCMOV->eraseFromParent();
24806 // Now remove the CMOV(s).
// Post-increment keeps the iterator valid while the current instruction is
// erased.
24807 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
24808 (MIIt++)->eraseFromParent();
// Lowers the RELEASE_FADD32mr / RELEASE_FADD64mr pseudos. Two real
// instructions are inserted before MI: a floating-point add whose memory
// operand is the pseudo's address, and a store of the sum back to that same
// address. The pseudo is then erased.
//
// MI: the pseudo; its first X86::AddrNumOperands operands form the address
//     and the operand after them is the register value.
// BB: the block containing MI.
24813 MachineBasicBlock *
24814 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
24815 MachineBasicBlock *BB) const {
24816 // Combine the following atomic floating-point modification pattern:
24817 // a.store(reg OP a.load(acquire), release)
24818 // Transform them into:
24819 // OPss (%gpr), %xmm
24820 // movss %xmm, (%gpr)
24821 // Or sd equivalent for 64-bit operations.
// Select the arithmetic opcode (FOp) and the store opcode (MOp) that match
// the width of the pseudo.
24823 switch (MI.getOpcode()) {
24824 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
24825 case X86::RELEASE_FADD32mr:
24826 FOp = X86::ADDSSrm;
24827 MOp = X86::MOVSSmr;
24829 case X86::RELEASE_FADD64mr:
24830 FOp = X86::ADDSDrm;
24831 MOp = X86::MOVSDmr;
24834 const X86InstrInfo *TII = Subtarget.getInstrInfo();
24835 DebugLoc DL = MI.getDebugLoc();
24836 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
// The register value sits right after the address operands.
24837 unsigned ValOpIdx = X86::AddrNumOperands;
24838 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
// Build the FP operation; its result lands in a fresh virtual register of the
// same register class as the source value.
24839 MachineInstrBuilder MIB =
24840 BuildMI(*BB, MI, DL, TII->get(FOp),
24841 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
24843 for (int i = 0; i < X86::AddrNumOperands; ++i) {
24844 MachineOperand &Operand = MI.getOperand(i);
24845 // Clear any kill flags on register operands as we'll create a second
24846 // instruction using the same address operands.
24847 if (Operand.isReg())
24848 Operand.setIsKill(false);
24849 MIB.addOperand(Operand);
24851 MachineInstr *FOpMI = MIB;
// Build the store: same address operands, followed by the FP result, which is
// marked killed because this is its last use.
24852 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
24853 for (int i = 0; i < X86::AddrNumOperands; ++i)
24854 MIB.addOperand(MI.getOperand(i));
24855 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
24856 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Lowers a segmented-stack alloca pseudo (the function must be compiled with
// split stacks; see the assert below). The prospective stack pointer
// (SP - size) is compared with a limit word read through a segment register.
// Depending on the result the allocation is done either by lowering SP
// (bumpMBB) or by calling __morestack_allocate_stack_space (mallocMBB). Both
// results are merged by a PHI at the top of continueMBB.
//
// MI: the pseudo; operand 0 is the result register, operand 1 the size.
// BB: the block containing MI.
// Returns continueMBB, which receives everything that followed MI in BB.
24860 MachineBasicBlock *
24861 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
24862 MachineBasicBlock *BB) const {
24863 MachineFunction *MF = BB->getParent();
24864 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24865 DebugLoc DL = MI.getDebugLoc();
24866 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24868 assert(MF->shouldSplitStack());
24870 const bool Is64Bit = Subtarget.is64Bit();
24871 const bool IsLP64 = Subtarget.isTarget64BitLP64();
// The limit word is addressed through FS on 64-bit targets and GS otherwise;
// its offset differs between LP64, other 64-bit targets and 32-bit targets.
24873 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
24874 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// Resulting control flow:
// BB:
24877 // ... [Till the alloca]
24878 // If stacklet is not large enough, jump to mallocMBB
// bumpMBB:
24881 // Allocate by subtracting from RSP
24882 // Jump to continueMBB
// mallocMBB:
24885 // Allocate by call to runtime
// continueMBB:
24889 // [rest of original BB]
24892 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24893 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24894 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24896 MachineRegisterInfo &MRI = MF->getRegInfo();
24897 const TargetRegisterClass *AddrRegClass =
24898 getRegClassFor(getPointerTy(MF->getDataLayout()));
// Pointer-sized virtual registers for the two candidate results and for the
// intermediate stack-pointer values; sizeVReg is the pseudo's operand 1.
24900 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24901 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
24902 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
24903 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
24904 sizeVReg = MI.getOperand(1).getReg(),
24906 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
// Lay the three new blocks out directly after BB, in the order
// bumpMBB, mallocMBB, continueMBB.
24908 MachineFunction::iterator MBBIter = ++BB->getIterator();
24910 MF->insert(MBBIter, bumpMBB);
24911 MF->insert(MBBIter, mallocMBB);
24912 MF->insert(MBBIter, continueMBB);
// Everything after MI moves into continueMBB, together with BB's successor
// edges (PHIs in those successors are updated accordingly).
24914 continueMBB->splice(continueMBB->begin(), BB,
24915 std::next(MachineBasicBlock::iterator(MI)), BB->end());
24916 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
24918 // Add code to the main basic block to check if the stack limit has been hit,
24919 // and if so, jump to mallocMBB otherwise to bumpMBB.
// SPLimitVReg = SP - size; the limit word at TlsReg:TlsOffset is compared
// with it and JG branches to mallocMBB.
24920 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
24921 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
24922 .addReg(tmpSPVReg).addReg(sizeVReg);
24923 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
24924 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
24925 .addReg(SPLimitVReg);
24926 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
24928 // bumpMBB simply decreases the stack pointer, since we know the current
24929 // stacklet has enough space.
// The new SP value is also the allocation's address (bumpSPPtrVReg).
24930 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
24931 .addReg(SPLimitVReg);
24932 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
24933 .addReg(SPLimitVReg);
24934 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24936 // Calls into a routine in libgcc to allocate more space from the heap.
24937 const uint32_t *RegMask =
24938 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
// Variant using 64-bit registers: the argument is moved into RDI and the call
// implicitly defines RAX.
24940 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
24942 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24943 .addExternalSymbol("__morestack_allocate_stack_space")
24944 .addRegMask(RegMask)
24945 .addReg(X86::RDI, RegState::Implicit)
24946 .addReg(X86::RAX, RegState::ImplicitDefine);
24947 } else if (Is64Bit) {
// 64-bit target with 32-bit registers: argument in EDI, result in EAX.
24948 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
24950 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
24951 .addExternalSymbol("__morestack_allocate_stack_space")
24952 .addRegMask(RegMask)
24953 .addReg(X86::EDI, RegState::Implicit)
24954 .addReg(X86::EAX, RegState::ImplicitDefine);
// Remaining variant: the size is pushed on the stack. SP is adjusted with
// SUB32ri before the push and with ADD32ri after the call.
24956 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
24958 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
24959 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
24960 .addExternalSymbol("__morestack_allocate_stack_space")
24961 .addRegMask(RegMask)
24962 .addReg(X86::EAX, RegState::ImplicitDefine);
24966 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
// Capture the call's return register as the heap-allocated result.
24969 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
24970 .addReg(IsLP64 ? X86::RAX : X86::EAX);
24971 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
24973 // Set up the CFG correctly.
24974 BB->addSuccessor(bumpMBB);
24975 BB->addSuccessor(mallocMBB);
24976 mallocMBB->addSuccessor(continueMBB);
24977 bumpMBB->addSuccessor(continueMBB);
24979 // Take care of the PHI nodes.
// The pseudo's result register is defined by a PHI over the two allocation
// results.
24980 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
24981 MI.getOperand(0).getReg())
24982 .addReg(mallocPtrVReg)
24984 .addReg(bumpSPPtrVReg)
24987 // Delete the original pseudo instruction.
24988 MI.eraseFromParent();
24991 return continueMBB;
// Custom insertion for the CATCHRET pseudo. On 32-bit targets a new block
// holding EH_RESTORE followed by a JMP_4 to the original destination is
// inserted after BB, and the catchret is retargeted at that block. Other
// targets take the early exit guarded by the is32Bit() check.
//
// MI: the catchret; operand 0 is its destination block.
// BB: the block containing MI; it is asserted to have exactly one successor.
24994 MachineBasicBlock *
24995 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
24996 MachineBasicBlock *BB) const {
24997 MachineFunction *MF = BB->getParent();
24998 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
24999 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25000 DebugLoc DL = MI.getDebugLoc();
// catchret is not expected with asynchronous (SEH) personalities.
25002 assert(!isAsynchronousEHPersonality(
25003 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25004 "SEH does not use catchret!");
25006 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25007 if (!Subtarget.is32Bit())
25010 // C++ EH creates a new target block to hold the restore code, and wires up
25011 // the new block to the return destination with a normal JMP_4.
25012 MachineBasicBlock *RestoreMBB =
25013 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25014 assert(BB->succ_size() == 1);
// RestoreMBB goes right after BB, takes over BB's successor edges, and
// becomes BB's only successor as well as the catchret's new destination.
25015 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25016 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25017 BB->addSuccessor(RestoreMBB);
25018 MI.getOperand(0).setMBB(RestoreMBB);
// Fill RestoreMBB: restore first, then jump to the original destination.
25020 auto RestoreMBBI = RestoreMBB->begin();
25021 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25022 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
// Custom insertion for the CATCHPAD pseudo. When the function's personality
// is an asynchronous (SEH) one and the target is 32-bit, an EH_RESTORE is
// inserted in front of the pseudo. The pseudo itself is erased.
//
// MI: the catchpad pseudo.
// BB: the block containing MI.
25026 MachineBasicBlock *
25027 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25028 MachineBasicBlock *BB) const {
25029 MachineFunction *MF = BB->getParent();
25030 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25031 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25032 // Only 32-bit SEH requires special handling for catchpad.
25033 if (IsSEH && Subtarget.is32Bit()) {
25034 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25035 DebugLoc DL = MI.getDebugLoc();
25036 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
// The pseudo is no longer needed.
25038 MI.eraseFromParent();
// Brackets a TLS address pseudo with call-frame setup/destroy markers. The
// pseudo itself is kept in place; only the two markers (each with two zero
// immediates) are added around it.
//
// MI: the TLS address pseudo.
// BB: the block containing MI.
25042 MachineBasicBlock *
25043 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25044 MachineBasicBlock *BB) const {
25045 // Here we wrap TLSADDR in the sequence:
25046 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25047 // We need this because TLSADDR is lowered into calls
25048 // inside MC; without the two markers, shrink-wrapping
25049 // may move the prologue/epilogue past them.
25050 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25051 DebugLoc DL = MI.getDebugLoc();
25052 MachineFunction &MF = *BB->getParent();
25054 // Emit CALLSEQ_START right before the instruction.
25055 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25056 MachineInstrBuilder CallseqStart =
25057 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25058 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25060 // Emit CALLSEQ_END right after the instruction.
25061 // We don't call erase from parent because we want to keep the
25062 // original instruction around.
25063 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25064 MachineInstrBuilder CallseqEnd =
25065 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25066 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
// Lowers a Darwin TLS call pseudo into a load of the pseudo's global
// (operand 3) followed by an indirect call through the loaded register.
// Three variants are emitted: 64-bit (RDI / CALL64m, defines RAX), 32-bit
// non-PIC, and 32-bit PIC (both EAX / CALL32m, define EAX). The pseudo is
// erased afterwards.
//
// MI: the pseudo; operand 3 must be a global address.
// BB: the block containing MI.
25071 MachineBasicBlock *
25072 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25073 MachineBasicBlock *BB) const {
25074 // This is pretty easy. We're taking the value that we received from
25075 // our load from the relocation, sticking it in either RDI (x86-64)
25076 // or EAX and doing an indirect call. The return value will then
25077 // be in the normal return register.
25078 MachineFunction *F = BB->getParent();
25079 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25080 DebugLoc DL = MI.getDebugLoc();
25082 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25083 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25085 // Get a register mask for the lowered call.
25086 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25087 // proper register mask.
25088 const uint32_t *RegMask =
25089 Subtarget.is64Bit() ?
25090 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25091 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
// 64-bit: load into RDI, call through the memory reference that addDirectMem
// builds from RDI; RAX is implicitly defined by the call.
25092 if (Subtarget.is64Bit()) {
25093 MachineInstrBuilder MIB =
25094 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25098 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25099 MI.getOperand(3).getTargetFlags())
25101 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25102 addDirectMem(MIB, X86::RDI);
25103 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
// 32-bit, not position independent: same shape using EAX.
25104 } else if (!isPositionIndependent()) {
25105 MachineInstrBuilder MIB =
25106 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25110 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25111 MI.getOperand(3).getTargetFlags())
25113 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25114 addDirectMem(MIB, X86::EAX);
25115 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
// Remaining 32-bit case: the load is addressed relative to the global base
// register.
25117 MachineInstrBuilder MIB =
25118 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25119 .addReg(TII->getGlobalBaseReg(F))
25122 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25123 MI.getOperand(3).getTargetFlags())
25125 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25126 addDirectMem(MIB, X86::EAX);
25127 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25130 MI.eraseFromParent(); // The pseudo instruction is gone now.
// Expands the SjLj setjmp pseudo. The address of a new block (restoreMBB) is
// stored into the jump buffer at LabelOffset and an EH_SjLj_Setup is emitted.
// The pseudo's i32 result becomes a PHI in sinkMBB: the value from mainMBB
// (MOV32r0) on the normal path, or the value from restoreMBB (MOV32ri 1)
// when control arrives through restoreMBB.
//
// MI:  the pseudo; operand 0 is the result register and the jump buffer's
//      address operands follow it.
// MBB: the block containing MI.
25134 MachineBasicBlock *
25135 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25136 MachineBasicBlock *MBB) const {
25137 DebugLoc DL = MI.getDebugLoc();
25138 MachineFunction *MF = MBB->getParent();
25139 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25140 MachineRegisterInfo &MRI = MF->getRegInfo();
25142 const BasicBlock *BB = MBB->getBasicBlock();
// Insertion point for the new blocks: directly after MBB.
25143 MachineFunction::iterator I = ++MBB->getIterator();
25145 // Memory Reference
25146 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25147 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25150 unsigned MemOpndSlot = 0;
25152 unsigned CurOp = 0;
// Operand 0 is the destination register; two extra registers of the same
// class carry the per-path values that feed the PHI.
25154 DstReg = MI.getOperand(CurOp++).getReg();
25155 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25156 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25157 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25158 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
// The address operands start right after the destination operand.
25160 MemOpndSlot = CurOp;
25162 MVT PVT = getPointerTy(MF->getDataLayout());
25163 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25164 "Invalid Pointer Size!");
25166 // For v = setjmp(buf), we generate
// thisMBB:
25169 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25170 // SjLjSetup restoreMBB
// mainMBB:
//  v_main = 0
// sinkMBB:
25176 // v = phi(main, restore)
// restoreMBB:
25179 // if base pointer being used, load it from frame
//  v_restore = 1
25182 MachineBasicBlock *thisMBB = MBB;
25183 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25184 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25185 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
// mainMBB and sinkMBB follow MBB in layout; restoreMBB is appended at the end
// of the function and flagged as address-taken.
25186 MF->insert(I, mainMBB);
25187 MF->insert(I, sinkMBB);
25188 MF->push_back(restoreMBB);
25189 restoreMBB->setHasAddressTaken();
25191 MachineInstrBuilder MIB;
25193 // Transfer the remainder of BB and its successor edges to sinkMBB.
25194 sinkMBB->splice(sinkMBB->begin(), MBB,
25195 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25196 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25199 unsigned PtrStoreOpc = 0;
25200 unsigned LabelReg = 0;
// The label is stored one pointer-size slot into the buffer.
25201 const int64_t LabelOffset = 1 * PVT.getStoreSize();
// The label can be stored as an immediate only with the small code model and
// without position-independent code.
25202 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25203 !isPositionIndependent();
25205 // Prepare IP either in reg or imm.
25206 if (!UseImmLabel) {
25207 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25208 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25209 LabelReg = MRI.createVirtualRegister(PtrRC);
25210 if (Subtarget.is64Bit()) {
25211 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
25215 .addMBB(restoreMBB)
25218 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
25219 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
25220 .addReg(XII->getGlobalBaseReg(MF))
25223 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
25227 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Store the label into the buffer: the pseudo's address operands are reused
// with LabelOffset added to the displacement.
25229 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25230 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25231 if (i == X86::AddrDisp)
25232 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
25234 MIB.addOperand(MI.getOperand(MemOpndSlot + i));
25237 MIB.addReg(LabelReg);
25239 MIB.addMBB(restoreMBB);
25240 MIB.setMemRefs(MMOBegin, MMOEnd);
// Emit the setup marker; it carries a register mask that preserves nothing.
25242 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25243 .addMBB(restoreMBB);
25245 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25246 MIB.addRegMask(RegInfo->getNoPreservedMask());
25247 thisMBB->addSuccessor(mainMBB);
25248 thisMBB->addSuccessor(restoreMBB);
// mainMBB: the normal-path value is zero.
25252 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
25253 mainMBB->addSuccessor(sinkMBB);
// sinkMBB: merge the two per-path values into the pseudo's result.
25256 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25257 TII->get(X86::PHI), DstReg)
25258 .addReg(mainDstReg).addMBB(mainMBB)
25259 .addReg(restoreDstReg).addMBB(restoreMBB);
// restoreMBB: if the function uses a base pointer, reload it from its frame
// slot (relative to the frame register) before producing the value.
25262 if (RegInfo->hasBasePointer(*MF)) {
25263 const bool Uses64BitFramePtr =
25264 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25265 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25266 X86FI->setRestoreBasePointer(MF);
25267 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25268 unsigned BasePtr = RegInfo->getBaseRegister();
25269 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25270 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25271 FramePtr, true, X86FI->getRestoreBasePointerOffset())
25272 .setMIFlag(MachineInstr::FrameSetup);
// The restore-path value is one; control then rejoins sinkMBB.
25274 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
25275 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25276 restoreMBB->addSuccessor(sinkMBB);
// The pseudo has been fully expanded.
25278 MI.eraseFromParent();
// Expands the SjLj longjmp pseudo. Three pointer-sized loads from the buffer
// addressed by the pseudo's operands are emitted (into the frame pointer,
// into a temporary, and into the stack pointer), followed by an indirect
// jump through the temporary. The pseudo is then erased.
//
// MI:  the pseudo; its first X86::AddrNumOperands operands address the
//      buffer.
// MBB: the block containing MI.
25282 MachineBasicBlock *
25283 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
25284 MachineBasicBlock *MBB) const {
25285 DebugLoc DL = MI.getDebugLoc();
25286 MachineFunction *MF = MBB->getParent();
25287 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25288 MachineRegisterInfo &MRI = MF->getRegInfo();
25290 // Memory Reference
25291 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25292 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25294 MVT PVT = getPointerTy(MF->getDataLayout());
25295 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25296 "Invalid Pointer Size!");
// Temporary that receives the jump destination.
25298 const TargetRegisterClass *RC =
25299 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25300 unsigned Tmp = MRI.createVirtualRegister(RC);
25301 // Since FP is only updated here but NOT referenced, it's treated as GPR.
25302 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25303 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
25304 unsigned SP = RegInfo->getStackRegister();
25306 MachineInstrBuilder MIB;
// Buffer layout used here, in pointer-size slots: slot 0 is loaded into FP,
// slot 1 (LabelOffset) holds the destination, slot 2 (SPOffset) holds SP.
25308 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25309 const int64_t SPOffset = 2 * PVT.getStoreSize();
25311 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
25312 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
// Reload FP from the start of the buffer (address operands used unchanged).
25315 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
25316 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
25317 MIB.addOperand(MI.getOperand(i));
25318 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload the destination address: displacement advanced by LabelOffset.
25320 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
25321 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25322 if (i == X86::AddrDisp)
25323 MIB.addDisp(MI.getOperand(i), LabelOffset);
25325 MIB.addOperand(MI.getOperand(i));
25327 MIB.setMemRefs(MMOBegin, MMOEnd);
// Reload SP: displacement advanced by SPOffset.
25329 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
25330 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25331 if (i == X86::AddrDisp)
25332 MIB.addDisp(MI.getOperand(i), SPOffset);
25334 MIB.addOperand(MI.getOperand(i));
25336 MIB.setMemRefs(MMOBegin, MMOEnd);
// Jump to the reloaded destination.
25338 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
// The pseudo has been fully expanded.
25340 MI.eraseFromParent();
// Inserts, before MI in MBB, a store of DispatchBB's address into the frame
// object FI at a fixed byte offset. With the small code model and no PIC the
// address is stored as an immediate; otherwise it is first materialized into
// a virtual register with LEA and that register is stored.
//
// MI:         instruction in front of which the new code is inserted.
// MBB:        block receiving the new code.
// DispatchBB: block whose address is stored.
25344 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
25345 MachineBasicBlock *MBB,
25346 MachineBasicBlock *DispatchBB,
25348 DebugLoc DL = MI.getDebugLoc();
25349 MachineFunction *MF = MBB->getParent();
25350 MachineRegisterInfo *MRI = &MF->getRegInfo();
25351 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25353 MVT PVT = getPointerTy(MF->getDataLayout());
25354 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
// An immediate store of the block address is only used with the small code
// model and without position-independent code.
25359 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25360 !isPositionIndependent();
25363 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
// Register path: pick a pointer-sized register class and the matching
// register-to-memory store opcode, then compute the address with LEA.
25365 const TargetRegisterClass *TRC =
25366 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25367 VR = MRI->createVirtualRegister(TRC);
25368 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25370 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
25372 if (Subtarget.is64Bit())
25373 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
25377 .addMBB(DispatchBB)
// NOTE(review): the global base register use is commented out here and a
// zero base register is passed instead; emitEHSjLjSetJmp passes
// XII->getGlobalBaseReg(MF) in the equivalent LEA32r. Confirm this is
// intended for position-independent 32-bit code.
25380 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
25381 .addReg(0) /* XII->getGlobalBaseReg(MF) */
25384 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
// NOTE(review): the offset 36 is the same constant for 32-bit and 64-bit
// pointer sizes; confirm it matches the layout of the frame object FI on
// 64-bit targets.
25388 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
25389 addFrameReference(MIB, FI, 36);
25391 MIB.addMBB(DispatchBB);
// Builds the SjLj exception dispatch machinery for the function containing
// BB:
//   1. collects every EH pad together with its call-site numbers;
//   2. creates DispatchBB (the new EH pad), DispContBB and TrapBB at the end
//      of the function;
//   3. stores DispatchBB's address via SetupEntryBlockForSjLj;
//   4. emits a bounds check (out-of-range goes to TrapBB) and an indirect
//      jump through a jump table of the collected landing pads;
//   5. rewires every block that branched to a landing pad so that it targets
//      DispatchBB instead, and adds implicit dead defs of callee-saved
//      registers to instructions in those blocks;
//   6. clears the EH-pad flag on the former landing pads and erases MI.
//
// MI: the pseudo that triggered the expansion.
// BB: the block containing MI.
25396 MachineBasicBlock *
25397 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
25398 MachineBasicBlock *BB) const {
25399 DebugLoc DL = MI.getDebugLoc();
25400 MachineFunction *MF = BB->getParent();
25401 MachineFrameInfo &MFI = MF->getFrameInfo();
25402 MachineRegisterInfo *MRI = &MF->getRegInfo();
25403 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
// Frame index of the function context object.
25404 int FI = MFI.getFunctionContextIndex();
25406 // Get a mapping of the call site numbers to all of the landing pads they're
25407 // associated with.
25408 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
25409 unsigned MaxCSNum = 0;
25410 for (auto &MBB : *MF) {
25411 if (!MBB.isEHPad())
// Look for the EH label in the pad, ignoring debug values. Note that this
// loop variable shadows the MI parameter.
25414 MCSymbol *Sym = nullptr;
25415 for (const auto &MI : MBB) {
25416 if (MI.isDebugValue())
25419 assert(MI.isEHLabel() && "expected EH_LABEL");
25420 Sym = MI.getOperand(0).getMCSymbol();
25424 if (!MF->hasCallSiteLandingPad(Sym))
// Record this pad under each of its call-site numbers and track the largest.
25427 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
25428 CallSiteNumToLPad[CSI].push_back(&MBB);
25429 MaxCSNum = std::max(MaxCSNum, CSI);
25433 // Get an ordered list of the machine basic blocks for the jump table.
25434 std::vector<MachineBasicBlock *> LPadList;
25435 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
25436 LPadList.reserve(CallSiteNumToLPad.size());
// Call-site numbers are walked from 1 upwards; the predecessors of each
// landing pad are remembered as invoke blocks.
25438 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
25439 for (auto &LP : CallSiteNumToLPad[CSI]) {
25440 LPadList.push_back(LP);
25441 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
25445 assert(!LPadList.empty() &&
25446 "No landing pad destinations for the dispatch jump table!");
25448 // Create the MBBs for the dispatch code.
25450 // Shove the dispatch's address into the return slot in the function context.
25451 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
25452 DispatchBB->setIsEHPad(true);
// TrapBB holds a single TRAP, reached when the bounds check below fails.
25454 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
25455 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
25456 DispatchBB->addSuccessor(TrapBB);
// DispContBB performs the actual jump-table dispatch.
25458 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
25459 DispatchBB->addSuccessor(DispContBB);
// All three blocks are appended at the end of the function.
25462 MF->push_back(DispatchBB);
25463 MF->push_back(DispContBB);
25464 MF->push_back(TrapBB);
25466 // Insert code into the entry block that creates and registers the function
25468 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
25470 // Create the jump table and associated information
25471 MachineJumpTableInfo *JTI =
25472 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
25473 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
25475 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
25476 const X86RegisterInfo &RI = XII->getRegisterInfo();
25478 // Add a register mask with no preserved registers. This results in all
25479 // registers being marked as clobbered.
// With a base pointer the mask is attached to the instruction that reloads
// the base pointer from the frame; otherwise it is attached to a NOOP.
// Note that the MFI declared in this branch shadows the MachineFrameInfo
// reference of the same name declared at the top of the function.
25480 if (RI.hasBasePointer(*MF)) {
25481 const bool FPIs64Bit =
25482 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25483 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
25484 MFI->setRestoreBasePointer(MF);
25486 unsigned FP = RI.getFrameRegister(*MF);
25487 unsigned BP = RI.getBaseRegister();
25488 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
25489 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
25490 MFI->getRestoreBasePointerOffset())
25491 .addRegMask(RI.getNoPreservedMask());
25493 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
25494 .addRegMask(RI.getNoPreservedMask());
// Load a 32-bit value from the function context into IReg, compare against
// the number of landing pads, and send out-of-range values (JA) to TrapBB.
25497 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25498 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
25500 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
25502 .addImm(LPadList.size());
25503 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
// DispContBB: compute the table index in JReg and jump through the jump
// table; the scale is the entry size (8 on 64-bit targets, 4 otherwise).
25505 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25506 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
25509 BuildMI(DispContBB, DL,
25510 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
25512 .addImm(Subtarget.is64Bit() ? 8 : 4)
25514 .addJumpTableIndex(MJTI)
25517 // Add the jump table entries as successors to the MBB.
// Each landing pad is added once even if it appears several times in the
// table.
25518 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
25519 for (auto &LP : LPadList)
25520 if (SeenMBBs.insert(LP).second)
25521 DispContBB->addSuccessor(LP);
25523 // N.B. the order the invoke BBs are processed in doesn't matter here.
25524 SmallVector<MachineBasicBlock *, 64> MBBLPads;
25525 const MCPhysReg *SavedRegs =
25526 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
25527 for (MachineBasicBlock *MBB : InvokeBBs) {
25528 // Remove the landing pad successor from the invoke block and replace it
25529 // with the new dispatch block.
25530 // Keep a copy of Successors since it's modified inside the loop.
25531 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
25533 // FIXME: Avoid quadratic complexity.
25534 for (auto MBBS : Successors) {
25535 if (MBBS->isEHPad()) {
25536 MBB->removeSuccessor(MBBS);
25537 MBBLPads.push_back(MBBS);
25541 MBB->addSuccessor(DispatchBB);
25543 // Find the invoke call and mark all of the callee-saved registers as
25544 // 'implicit defined' so that they're spilled. This prevents code from
25545 // moving instructions to before the EH block, where they will never be
// The block is scanned from its last instruction backwards.
25547 for (auto &II : reverse(*MBB)) {
// Registers recorded from II's operands.
25551 DenseMap<unsigned, bool> DefRegs;
25552 for (auto &MOp : II.operands())
25554 DefRegs[MOp.getReg()] = true;
// SavedRegs is a zero-terminated list. Note that the loop index RI shadows
// the X86RegisterInfo reference of the same name declared above.
25556 MachineInstrBuilder MIB(*MF, &II);
25557 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
25558 unsigned Reg = SavedRegs[RI];
25560 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
25567 // Mark all former landing pads as non-landing pads. The dispatch is the only
25568 // landing pad now.
25569 for (auto &LP : MBBLPads)
25570 LP->setIsEHPad(false);
25572 // The instruction is gone now.
25573 MI.eraseFromParent();
25577 MachineBasicBlock *
25578 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
25579 MachineBasicBlock *BB) const {
25580 MachineFunction *MF = BB->getParent();
25581 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25582 DebugLoc DL = MI.getDebugLoc();
25584 switch (MI.getOpcode()) {
25585 default: llvm_unreachable("Unexpected instr type to insert");
25586 case X86::TAILJMPd64:
25587 case X86::TAILJMPr64:
25588 case X86::TAILJMPm64:
25589 case X86::TAILJMPr64_REX:
25590 case X86::TAILJMPm64_REX:
25591 llvm_unreachable("TAILJMP64 would not be touched here.");
25592 case X86::TCRETURNdi64:
25593 case X86::TCRETURNri64:
25594 case X86::TCRETURNmi64:
25596 case X86::TLS_addr32:
25597 case X86::TLS_addr64:
25598 case X86::TLS_base_addr32:
25599 case X86::TLS_base_addr64:
25600 return EmitLoweredTLSAddr(MI, BB);
25601 case X86::CATCHRET:
25602 return EmitLoweredCatchRet(MI, BB);
25603 case X86::CATCHPAD:
25604 return EmitLoweredCatchPad(MI, BB);
25605 case X86::SEG_ALLOCA_32:
25606 case X86::SEG_ALLOCA_64:
25607 return EmitLoweredSegAlloca(MI, BB);
25608 case X86::TLSCall_32:
25609 case X86::TLSCall_64:
25610 return EmitLoweredTLSCall(MI, BB);
25611 case X86::CMOV_FR32:
25612 case X86::CMOV_FR64:
25613 case X86::CMOV_FR128:
25614 case X86::CMOV_GR8:
25615 case X86::CMOV_GR16:
25616 case X86::CMOV_GR32:
25617 case X86::CMOV_RFP32:
25618 case X86::CMOV_RFP64:
25619 case X86::CMOV_RFP80:
25620 case X86::CMOV_V2F64:
25621 case X86::CMOV_V2I64:
25622 case X86::CMOV_V4F32:
25623 case X86::CMOV_V4F64:
25624 case X86::CMOV_V4I64:
25625 case X86::CMOV_V16F32:
25626 case X86::CMOV_V8F32:
25627 case X86::CMOV_V8F64:
25628 case X86::CMOV_V8I64:
25629 case X86::CMOV_V8I1:
25630 case X86::CMOV_V16I1:
25631 case X86::CMOV_V32I1:
25632 case X86::CMOV_V64I1:
25633 return EmitLoweredSelect(MI, BB);
25635 case X86::RDFLAGS32:
25636 case X86::RDFLAGS64: {
25638 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
25639 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
25640 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
25641 // Permit reads of the FLAGS register without it being defined.
25642 // This intrinsic exists to read external processor state in flags, such as
25643 // the trap flag, interrupt flag, and direction flag, none of which are
25644 // modeled by the backend.
25645 Push->getOperand(2).setIsUndef();
25646 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
25648 MI.eraseFromParent(); // The pseudo is gone now.
25652 case X86::WRFLAGS32:
25653 case X86::WRFLAGS64: {
25655 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
25657 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
25658 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
25659 BuildMI(*BB, MI, DL, TII->get(PopF));
25661 MI.eraseFromParent(); // The pseudo is gone now.
25665 case X86::RELEASE_FADD32mr:
25666 case X86::RELEASE_FADD64mr:
25667 return EmitLoweredAtomicFP(MI, BB);
25669 case X86::FP32_TO_INT16_IN_MEM:
25670 case X86::FP32_TO_INT32_IN_MEM:
25671 case X86::FP32_TO_INT64_IN_MEM:
25672 case X86::FP64_TO_INT16_IN_MEM:
25673 case X86::FP64_TO_INT32_IN_MEM:
25674 case X86::FP64_TO_INT64_IN_MEM:
25675 case X86::FP80_TO_INT16_IN_MEM:
25676 case X86::FP80_TO_INT32_IN_MEM:
25677 case X86::FP80_TO_INT64_IN_MEM: {
25678 // Change the floating point control register to use "round towards zero"
25679 // mode when truncating to an integer value.
25680 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
25681 addFrameReference(BuildMI(*BB, MI, DL,
25682 TII->get(X86::FNSTCW16m)), CWFrameIdx);
25684 // Load the old value of the high byte of the control word...
25686 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
25687 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
25690 // Set the high part to be round to zero...
25691 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
25694 // Reload the modified control word now...
25695 addFrameReference(BuildMI(*BB, MI, DL,
25696 TII->get(X86::FLDCW16m)), CWFrameIdx);
25698 // Restore the memory image of control word to original value
25699 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
25702 // Get the X86 opcode to use.
25704 switch (MI.getOpcode()) {
25705 default: llvm_unreachable("illegal opcode!");
25706 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
25707 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
25708 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
25709 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
25710 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
25711 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
25712 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
25713 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
25714 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
25717 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25718 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
25719 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
25721 // Reload the original control word now.
25722 addFrameReference(BuildMI(*BB, MI, DL,
25723 TII->get(X86::FLDCW16m)), CWFrameIdx);
25725 MI.eraseFromParent(); // The pseudo instruction is gone now.
25728 // String/text processing lowering.
25729 case X86::PCMPISTRM128REG:
25730 case X86::VPCMPISTRM128REG:
25731 case X86::PCMPISTRM128MEM:
25732 case X86::VPCMPISTRM128MEM:
25733 case X86::PCMPESTRM128REG:
25734 case X86::VPCMPESTRM128REG:
25735 case X86::PCMPESTRM128MEM:
25736 case X86::VPCMPESTRM128MEM:
25737 assert(Subtarget.hasSSE42() &&
25738 "Target must have SSE4.2 or AVX features enabled");
25739 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
25741 // String/text processing lowering.
25742 case X86::PCMPISTRIREG:
25743 case X86::VPCMPISTRIREG:
25744 case X86::PCMPISTRIMEM:
25745 case X86::VPCMPISTRIMEM:
25746 case X86::PCMPESTRIREG:
25747 case X86::VPCMPESTRIREG:
25748 case X86::PCMPESTRIMEM:
25749 case X86::VPCMPESTRIMEM:
25750 assert(Subtarget.hasSSE42() &&
25751 "Target must have SSE4.2 or AVX features enabled");
25752 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
25754 // Thread synchronization.
25756 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
25757 case X86::MONITORX:
25758 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
25761 return emitWRPKRU(MI, BB, Subtarget);
25763 return emitRDPKRU(MI, BB, Subtarget);
25766 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
25768 case X86::VASTART_SAVE_XMM_REGS:
25769 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
25771 case X86::VAARG_64:
25772 return EmitVAARG64WithCustomInserter(MI, BB);
25774 case X86::EH_SjLj_SetJmp32:
25775 case X86::EH_SjLj_SetJmp64:
25776 return emitEHSjLjSetJmp(MI, BB);
25778 case X86::EH_SjLj_LongJmp32:
25779 case X86::EH_SjLj_LongJmp64:
25780 return emitEHSjLjLongJmp(MI, BB);
25782 case X86::Int_eh_sjlj_setup_dispatch:
25783 return EmitSjLjDispatchBlock(MI, BB);
25785 case TargetOpcode::STATEPOINT:
25786 // As an implementation detail, STATEPOINT shares the STACKMAP format at
25787 // this point in the process. We diverge later.
25788 return emitPatchPoint(MI, BB);
25790 case TargetOpcode::STACKMAP:
25791 case TargetOpcode::PATCHPOINT:
25792 return emitPatchPoint(MI, BB);
25794 case X86::LCMPXCHG8B: {
25795 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
25796 // In addition to 4 E[ABCD] registers implied by encoding, CMPXCHG8B
25797 // requires a memory operand. If it happens that current architecture is
25798 // i686 and for current function we need a base pointer
25799 // - which is ESI for i686 - register allocator would not be able to
25800 // allocate registers for an address in form of X(%reg, %reg, Y)
25801 // - there never would be enough unreserved registers during regalloc
25802 // (without the need for base ptr the only option would be X(%edi, %esi, Y).
25803 // We are giving a hand to register allocator by precomputing the address in
25804 // a new vreg using LEA.
25806 // If it is not i686 or there is no base pointer - nothing to do here.
25807 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
25810 // Even though this code does not necessarily needs the base pointer to
25811 // be ESI, we check for that. The reason: if this assert fails, there are
25812 // some changes happened in the compiler base pointer handling, which most
25813 // probably have to be addressed somehow here.
25814 assert(TRI->getBaseRegister() == X86::ESI &&
25815 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
25816 "base pointer in mind");
25818 MachineRegisterInfo &MRI = MF->getRegInfo();
25819 MVT SPTy = getPointerTy(MF->getDataLayout());
25820 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
25821 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
25823 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25824 // Regalloc does not need any help when the memory operand of CMPXCHG8B
25825 // does not use index register.
25826 if (AM.IndexReg == X86::NoRegister)
25829 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
25830 // four operand definitions that are E[ABCD] registers. We skip them and
25831 // then insert the LEA.
25832 MachineBasicBlock::iterator MBBI(MI);
25833 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
25834 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
25837 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
25839 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
25843 case X86::LCMPXCHG16B:
25845 case X86::LCMPXCHG8B_SAVE_EBX:
25846 case X86::LCMPXCHG16B_SAVE_RBX: {
25848 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
25849 if (!BB->isLiveIn(BasePtr))
25850 BB->addLiveIn(BasePtr);
25856 //===----------------------------------------------------------------------===//
25857 // X86 Optimization Hooks
25858 //===----------------------------------------------------------------------===//
/// Computes which bits of the target-specific node \p Op are known to be zero
/// or known to be one, storing the result in KnownZero / KnownOne.
///
/// \p Op must be a target node or an intrinsic node (asserted below). Both
/// masks are first reset to "nothing known" and then refined per opcode.
/// \p DAG is used for recursive known-bits queries on operands, and \p Depth
/// is passed on, incremented by one, to those queries.
///
/// Opcode handling visible here:
///  - X86ISD::SETCC:  every bit above bit 0 is known zero.
///  - X86ISD::MOVMSK: every bit above the operand's vector element count is
///                    known zero.
///  - X86ISD::VZEXT:  known bits of the narrower source elements are
///                    zero-extended; the extension bits are known zero.
25860 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
25863 const SelectionDAG &DAG,
25864 unsigned Depth) const {
// The width of the incoming mask defines the width of the analysed value.
25865 unsigned BitWidth = KnownZero.getBitWidth();
25866 unsigned Opc = Op.getOpcode();
25867 assert((Opc >= ISD::BUILTIN_OP_END ||
25868 Opc == ISD::INTRINSIC_WO_CHAIN ||
25869 Opc == ISD::INTRINSIC_W_CHAIN ||
25870 Opc == ISD::INTRINSIC_VOID) &&
25871 "Should use MaskedValueIsZero if you don't know whether Op"
25872 " is a target node!");
25874 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
25888 // These nodes' second result is a boolean.
// Result number 0 is therefore not the boolean result of such nodes.
25889 if (Op.getResNo() == 0)
25892 case X86ISD::SETCC:
// Only the lowest bit can be set: mark the upper BitWidth - 1 bits as zero.
25893 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
25895 case X86ISD::MOVMSK: {
// Only the low NumLoBits bits (one per element of the vector operand) are
// left unknown; everything above them is known zero.
25896 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
25897 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
25900 case X86ISD::VZEXT: {
25901 SDValue N0 = Op.getOperand(0);
25902 unsigned NumElts = Op.getValueType().getVectorNumElements();
25903 unsigned InNumElts = N0.getValueType().getVectorNumElements();
25904 unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
// Re-size the masks to the source element width before querying the operand.
25906 KnownZero = KnownOne = APInt(InBitWidth, 0);
// Only the low NumElts elements of the (wider-count) input are demanded.
25907 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
25908 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
// Widen the masks back to the result width; the bits introduced by the
// extension (above InBitWidth) are known to be zero.
25909 KnownOne = KnownOne.zext(BitWidth);
25910 KnownZero = KnownZero.zext(BitWidth);
25911 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
/// Computes the number of sign bits of the target-specific node \p Op.
///
/// Two opcodes are handled in the code visible here:
///  - X86ISD::SETCC_CARRY: the whole scalar width is reported.
///  - X86ISD::VSEXT: the operand's sign-bit count (queried through \p DAG at
///    \p Depth + 1) is increased by the number of bits the extension adds.
25917 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
25918 SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
25919 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
25920 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
25921 return Op.getScalarValueSizeInBits();
25923 if (Op.getOpcode() == X86ISD::VSEXT) {
25924 EVT VT = Op.getValueType();
25925 EVT SrcVT = Op.getOperand(0).getValueType();
// Start from the sign bits of the source element ...
25926 unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
// ... and add the difference between destination and source element widths.
25927 Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
25935 /// Returns true (and the GlobalValue and the offset) if the node is a
25936 /// GlobalAddress + offset.
///
/// An X86ISD::Wrapper node whose operand is a GlobalAddressSDNode is handled
/// here by reading the global and its offset straight from that operand into
/// \p GA and \p Offset. Any other node is forwarded to the generic
/// TargetLowering::isGAPlusOffset implementation.
25937 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
25938 const GlobalValue* &GA,
25939 int64_t &Offset) const {
25940 if (N->getOpcode() == X86ISD::Wrapper) {
25941 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
25942 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
25943 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
// Not a wrapped global address: defer to the target-independent matcher.
25947 return TargetLowering::isGAPlusOffset(N, GA, Offset);
25950 // Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
//
// MaskVT describes the element type/count the mask is expressed in and Mask
// holds the combined shuffle indices. When a pattern matches, the chosen
// X86ISD opcode is written to Shuffle and the operand/result types to SrcVT
// and DstVT.
// NOTE(review): presumably returns true on a match and false otherwise - the
// return statements are not visible in this excerpt; confirm.
25952 // TODO: Investigate sharing more of this with shuffle lowering.
25953 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
25954 const X86Subtarget &Subtarget,
25955 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
25956 unsigned NumMaskElts = Mask.size();
25957 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
// Treat the shuffle as floating point if the mask type is FP, or if it is a
// 256-bit vector on a subtarget without AVX2.
25958 bool FloatDomain = MaskVT.isFloatingPoint() ||
25959 (!Subtarget.hasAVX2() && MaskVT.is256BitVector());
25961 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
// Pattern: element 0 stays in place (or is undef) and every other element is
// undef or zero.
25962 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
25963 isUndefOrEqual(Mask[0], 0) &&
25964 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
25965 Shuffle = X86ISD::VZEXT_MOVL;
// Without SSE2 the operation is always expressed on v4f32.
25966 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
25970 // Match against a VZEXT instruction.
25971 // TODO: Add 256/512-bit vector support.
25972 if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
// Try extension factors 2, 4, ... while the widened element is <= 64 bits.
25973 unsigned MaxScale = 64 / MaskEltSize;
25974 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
25976 unsigned NumDstElts = NumMaskElts / Scale;
// Each destination element i must be source element i followed by
// Scale - 1 undef-or-zero mask entries.
25977 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
25978 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
25979 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
// Destination type: NumDstElts integer elements, each Scale times wider.
25983 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
25984 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
25985 Shuffle = X86ISD::VZEXT;
25991 // Check if we have SSE3 which will let us use MOVDDUP etc. The
25992 // instructions are no slower than UNPCKLPD but have the option to
25993 // fold the input operand into even an unaligned memory load.
25994 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
25995 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
25996 Shuffle = X86ISD::MOVDDUP;
25997 SrcVT = DstVT = MVT::v2f64;
26000 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26001 Shuffle = X86ISD::MOVSLDUP;
26002 SrcVT = DstVT = MVT::v4f32;
26005 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26006 Shuffle = X86ISD::MOVSHDUP;
26007 SrcVT = DstVT = MVT::v4f32;
// Same three duplicate patterns for 256-bit floating-point-domain vectors.
26012 if (MaskVT.is256BitVector() && FloatDomain) {
26013 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26014 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26015 Shuffle = X86ISD::MOVDDUP;
26016 SrcVT = DstVT = MVT::v4f64;
26019 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26020 Shuffle = X86ISD::MOVSLDUP;
26021 SrcVT = DstVT = MVT::v8f32;
26024 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26025 Shuffle = X86ISD::MOVSHDUP;
26026 SrcVT = DstVT = MVT::v8f32;
// ... and again for 512-bit floating-point-domain vectors.
26031 if (MaskVT.is512BitVector() && FloatDomain) {
26032 assert(Subtarget.hasAVX512() &&
26033 "AVX512 required for 512-bit vector shuffles");
26034 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26035 Shuffle = X86ISD::MOVDDUP;
26036 SrcVT = DstVT = MVT::v8f64;
26039 if (isTargetShuffleEquivalent(
26040 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26041 Shuffle = X86ISD::MOVSLDUP;
26042 SrcVT = DstVT = MVT::v16f32;
26045 if (isTargetShuffleEquivalent(
26046 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26047 Shuffle = X86ISD::MOVSHDUP;
26048 SrcVT = DstVT = MVT::v16f32;
26053 // Attempt to match against broadcast-from-vector.
26054 if (Subtarget.hasAVX2()) {
// A broadcast mask selects element 0 for every result element.
26055 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26056 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26057 SrcVT = DstVT = MaskVT;
26058 Shuffle = X86ISD::VBROADCAST;
26066 // Attempt to match a combined shuffle mask against supported unary immediate
26067 // permute instructions.
//
// On a match the chosen X86ISD opcode is written to Shuffle, the vector type
// the instruction should operate on to ShuffleVT, and the 8-bit immediate
// operand to PermuteImm.
// NOTE(review): presumably returns true on a match and false otherwise - the
// return statements are not visible in this excerpt; confirm.
26068 // TODO: Investigate sharing more of this with shuffle lowering.
26069 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26070 const X86Subtarget &Subtarget,
26071 unsigned &Shuffle, MVT &ShuffleVT,
26072 unsigned &PermuteImm) {
26073 unsigned NumMaskElts = Mask.size();
26074 bool FloatDomain = MaskVT.isFloatingPoint();
// Record, per element, whether it may be treated as zero (undef or zero), and
// whether the mask contains any explicit zero sentinel at all.
26076 bool ContainsZeros = false;
26077 SmallBitVector Zeroable(NumMaskElts, false);
26078 for (unsigned i = 0; i != NumMaskElts; ++i) {
26080 Zeroable[i] = isUndefOrZero(M);
26081 ContainsZeros |= (M == SM_SentinelZero);
26084 // Attempt to match against byte/bit shifts.
26085 // FIXME: Add 512-bit support.
26086 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26087 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
// matchVectorShuffleAsShift fills in ShuffleVT and Shuffle; only a positive
// result is accepted, and it becomes the shift immediate.
26088 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26089 MaskVT.getScalarSizeInBits(), Mask,
26090 0, Zeroable, Subtarget);
26091 if (0 < ShiftAmt) {
26092 PermuteImm = (unsigned)ShiftAmt;
26097 // Ensure we don't contain any zero elements.
// From here on every mask entry must be undef or an index into the single
// input vector.
26101 assert(llvm::all_of(Mask, [&](int M) {
26102 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26103 }) && "Expected unary shuffle");
26105 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26106 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26107 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26109 // Handle PSHUFLW/PSHUFHW repeated patterns.
26110 if (MaskScalarSizeInBits == 16) {
26111 SmallVector<int, 4> RepeatedMask;
26112 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
// Split the first eight mask entries into a low and a high group of four.
26113 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26114 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26116 // PSHUFLW: permute lower 4 elements only.
// Requires the high four entries to be the identity (4,5,6,7) or undef.
26117 if (isUndefOrInRange(LoMask, 0, 4) &&
26118 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26119 Shuffle = X86ISD::PSHUFLW;
26120 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26121 PermuteImm = getV4X86ShuffleImm(LoMask);
26125 // PSHUFHW: permute upper 4 elements only.
// Requires the low four entries to be the identity (0,1,2,3) or undef.
26126 if (isUndefOrInRange(HiMask, 4, 8) &&
26127 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26128 // Offset the HiMask so that we can create the shuffle immediate.
26129 int OffsetHiMask[4];
// Negative (sentinel) entries are kept as-is; real indices are rebased to 0..3.
26130 for (int i = 0; i != 4; ++i)
26131 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
26133 Shuffle = X86ISD::PSHUFHW;
26134 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26135 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26144 // We only support permutation of 32/64 bit elements after this.
26145 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26148 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
26149 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
26150 if (FloatDomain && !Subtarget.hasAVX())
26153 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
26154 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
26155 FloatDomain = true;
26157 // Check for lane crossing permutes.
26158 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
26159 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
26160 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
26161 Shuffle = X86ISD::VPERMI;
26162 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26163 PermuteImm = getV4X86ShuffleImm(Mask);
// 512-bit case: usable only if the mask repeats in each 256-bit half, in which
// case the repeated 4-element mask supplies the immediate.
26166 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
26167 SmallVector<int, 4> RepeatedMask;
26168 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
26169 Shuffle = X86ISD::VPERMI;
26170 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
26171 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
26178 // VPERMILPD can permute with a non-repeating shuffle.
26179 if (FloatDomain && MaskScalarSizeInBits == 64) {
26180 Shuffle = X86ISD::VPERMILPI;
26181 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
// Build the immediate one bit per element: bit i is the low bit of the source
// index, and the assert checks the source lies in the same pair of elements.
26183 for (int i = 0, e = Mask.size(); i != e; ++i) {
26185 if (M == SM_SentinelUndef)
26187 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
26188 PermuteImm |= (M & 1) << i;
26193 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
26194 SmallVector<int, 4> RepeatedMask;
26195 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
26198 // Narrow the repeated mask for 32-bit element permutes.
26199 SmallVector<int, 4> WordMask = RepeatedMask;
26200 if (MaskScalarSizeInBits == 64)
26201 scaleShuffleMask(2, RepeatedMask, WordMask);
// Final fallback: a 32-bit element permute, VPERMILPI in the float domain and
// PSHUFD otherwise, over InputSizeInBits / 32 elements.
26203 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
26204 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
26205 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
26206 PermuteImm = getV4X86ShuffleImm(WordMask);
26210 // Attempt to match a combined unary shuffle mask against supported binary
26211 // shuffle instructions.
//
// On a match the chosen X86ISD opcode is written to Shuffle and the vector
// type to operate on to ShuffleVT. V1 and V2 are taken by reference.
// NOTE(review): the statements that re-assign V1/V2 and the return statements
// are not visible in this excerpt - confirm against the full source.
26212 // TODO: Investigate sharing more of this with shuffle lowering.
26213 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26214 SDValue &V1, SDValue &V2,
26215 const X86Subtarget &Subtarget,
26216 unsigned &Shuffle, MVT &ShuffleVT,
26218 bool FloatDomain = MaskVT.isFloatingPoint();
26219 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
// 128-bit only patterns: MOVLHPS / MOVHLPS / MOVSD / MOVSS.
26221 if (MaskVT.is128BitVector()) {
26222 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
26224 Shuffle = X86ISD::MOVLHPS;
26225 ShuffleVT = MVT::v4f32;
26228 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
26230 Shuffle = X86ISD::MOVHLPS;
26231 ShuffleVT = MVT::v4f32;
// MOVSD / MOVSS are only chosen for the float domain or when SSE4.1 is not
// available.
26234 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
26235 (FloatDomain || !Subtarget.hasSSE41())) {
26237 Shuffle = X86ISD::MOVSD;
26238 ShuffleVT = MaskVT;
26241 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
26242 (FloatDomain || !Subtarget.hasSSE41())) {
26243 Shuffle = X86ISD::MOVSS;
26244 ShuffleVT = MaskVT;
26249 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
26250 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26251 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26252 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
26253 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
26254 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
// 256-bit vectors without AVX2 are unpacked as v8f32 / v4f64.
26255 MVT LegalVT = MaskVT;
26256 if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
26257 LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
26259 SmallVector<int, 64> Unpckl, Unpckh;
// NOTE(review): the two boolean arguments of createUnpackShuffleMask
// presumably select the low/high half and the unary/binary form - confirm
// against its definition. The first pair of checks uses (.., true) as the last
// argument, the second pair uses (.., false).
26261 createUnpackShuffleMask(MaskVT, Unpckl, true, true);
26262 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26264 Shuffle = X86ISD::UNPCKL;
26265 ShuffleVT = LegalVT;
26269 createUnpackShuffleMask(MaskVT, Unpckh, false, true);
26270 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26272 Shuffle = X86ISD::UNPCKH;
26273 ShuffleVT = LegalVT;
26277 createUnpackShuffleMask(MaskVT, Unpckl, true, false);
26278 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26279 Shuffle = X86ISD::UNPCKL;
26280 ShuffleVT = LegalVT;
26284 createUnpackShuffleMask(MaskVT, Unpckh, false, false);
26285 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26286 Shuffle = X86ISD::UNPCKH;
26287 ShuffleVT = LegalVT;
// Finally retry with the commuted forms of the two masks built just above.
26291 ShuffleVectorSDNode::commuteMask(Unpckl);
26292 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26294 Shuffle = X86ISD::UNPCKL;
26295 ShuffleVT = LegalVT;
26299 ShuffleVectorSDNode::commuteMask(Unpckh);
26300 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26302 Shuffle = X86ISD::UNPCKH;
26303 ShuffleVT = LegalVT;
// Attempt to match a combined shuffle mask against two-input shuffle
// instructions that take an immediate: PALIGNR, BLENDI, INSERTPS and SHUFP
// (SHUFPD / SHUFPS).
//
// On a match the chosen X86ISD opcode is written to Shuffle, the vector type
// to operate on to ShuffleVT and the immediate to PermuteImm. V1 and V2 are
// passed by reference and may be replaced (e.g. V2 becomes a zero vector when
// blending with zero); DL and DAG are used to create such nodes.
// NOTE(review): presumably returns true on a match and false otherwise - the
// return statements are not visible in this excerpt; confirm.
26312 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26313 SDValue &V1, SDValue &V2,
26314 SDLoc &DL, SelectionDAG &DAG,
26315 const X86Subtarget &Subtarget,
26316 unsigned &Shuffle, MVT &ShuffleVT,
26317 unsigned &PermuteImm) {
26318 unsigned NumMaskElts = Mask.size();
26319 bool FloatDomain = MaskVT.isFloatingPoint();
26321 // Attempt to match against PALIGNR byte rotate.
26322 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26323 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
// Only a positive rotation is accepted; it is used directly as the immediate
// on a byte vector of the same total size.
26324 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
26325 if (0 < ByteRotation) {
26326 Shuffle = X86ISD::PALIGNR;
26327 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
26328 PermuteImm = ByteRotation;
26333 // Attempt to combine to X86ISD::BLENDI.
26334 if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
26335 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
26336 // Determine a type compatible with X86ISD::BLENDI.
26337 // TODO - add 16i16 support (requires lane duplication).
26338 MVT BlendVT = MaskVT;
26339 if (Subtarget.hasAVX2()) {
26340 if (BlendVT == MVT::v4i64)
26341 BlendVT = MVT::v8i32;
26342 else if (BlendVT == MVT::v2i64)
26343 BlendVT = MVT::v4i32;
26345 if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
26346 BlendVT = MVT::v8i16;
26347 else if (BlendVT == MVT::v4i64)
26348 BlendVT = MVT::v4f64;
26349 else if (BlendVT == MVT::v8i32)
26350 BlendVT = MVT::v8f32;
// BlendVT may have more (narrower) elements than the mask; MaskRatio is how
// many blend elements correspond to one mask element.
26353 unsigned BlendSize = BlendVT.getVectorNumElements();
26354 unsigned MaskRatio = BlendSize / NumMaskElts;
26356 // Can we blend with zero?
26357 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
26359 NumMaskElts <= BlendVT.getVectorNumElements()) {
// Set immediate bit i for every blend element whose mask entry is negative
// (i.e. not a real element index); V2 is then replaced by a zero vector.
26361 for (unsigned i = 0; i != BlendSize; ++i)
26362 if (Mask[i / MaskRatio] < 0)
26363 PermuteImm |= 1u << i;
26365 V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
26366 Shuffle = X86ISD::BLENDI;
26367 ShuffleVT = BlendVT;
26371 // Attempt to match as a binary blend.
26372 if (NumMaskElts <= BlendVT.getVectorNumElements()) {
// A blend keeps every element in its own position: entry i must be undef, i
// (from the first input) or i + NumMaskElts (from the second input). Explicit
// zero sentinels are rejected.
26373 bool MatchBlend = true;
26374 for (int i = 0; i != (int)NumMaskElts; ++i) {
26376 if (M == SM_SentinelUndef)
26378 else if (M == SM_SentinelZero)
26379 MatchBlend = false;
26380 else if ((M != i) && (M != (i + (int)NumMaskElts)))
26381 MatchBlend = false;
// Set immediate bit i for every blend element taken from the second input.
26386 for (unsigned i = 0; i != BlendSize; ++i)
26387 if ((int)NumMaskElts <= Mask[i / MaskRatio])
26388 PermuteImm |= 1u << i;
26390 Shuffle = X86ISD::BLENDI;
26391 ShuffleVT = BlendVT;
26397 // Attempt to combine to INSERTPS.
26398 if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
26399 SmallBitVector Zeroable(4, false);
26400 for (unsigned i = 0; i != NumMaskElts; ++i)
26402 Zeroable[i] = true;
// Only attempted when at least one element was marked zeroable above.
26404 if (Zeroable.any() &&
26405 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
26406 Shuffle = X86ISD::INSERTPS;
26407 ShuffleVT = MVT::v4f32;
26412 // Attempt to combine to SHUFPD.
26413 if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
26414 (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
26415 (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
26416 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
26417 Shuffle = X86ISD::SHUFP;
26418 ShuffleVT = MaskVT;
26423 // Attempt to combine to SHUFPS.
26424 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26425 (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26426 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
// SHUFPS applies the same 4-element pattern in every 128-bit lane, so the
// mask must repeat per 128 bits.
26427 SmallVector<int, 4> RepeatedMask;
26428 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
// MatchHalf examines two consecutive repeated-mask entries starting at Offset
// and writes the per-element selectors to S0/S1. The visible branches return
// an undef value (both entries undef) or a zero vector (both undef-or-zero).
// NOTE(review): the return statements of the last two branches (both entries
// in 0..3, or both in 4..7) are not visible in this excerpt - presumably they
// yield the first and second input respectively; confirm.
26429 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
26430 int M0 = RepeatedMask[Offset];
26431 int M1 = RepeatedMask[Offset + 1];
26433 if (isUndefInRange(RepeatedMask, Offset, 2)) {
26434 return DAG.getUNDEF(MaskVT);
26435 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
26436 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
26437 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
26438 return getZeroVector(MaskVT, Subtarget, DAG, DL);
26439 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
26440 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26441 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26443 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
26444 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26445 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
// The low two result elements come from Lo, the high two from Hi; ShufMask
// collects the four selectors used to build the immediate.
26452 int ShufMask[4] = {-1, -1, -1, -1};
26453 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
26454 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
26459 Shuffle = X86ISD::SHUFP;
26460 ShuffleVT = MaskVT;
26461 PermuteImm = getV4X86ShuffleImm(ShufMask);
26470 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
///
26473 /// This is the leaf of the recursive combine below. When we have found some
26474 /// chain of single-use x86 shuffle instructions and accumulated the combined
26475 /// shuffle mask represented by them, this will try to pattern match that mask
26476 /// into either a single instruction if there is a special purpose instruction
26477 /// for this operation, or into a PSHUFB instruction which is a fully general
26478 /// instruction but should only be used to replace chains over a certain depth.
26479 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
26480 ArrayRef<int> BaseMask, int Depth,
26481 bool HasVariableMask, SelectionDAG &DAG,
26482 TargetLowering::DAGCombinerInfo &DCI,
26483 const X86Subtarget &Subtarget) {
26484 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
26485 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
26486 "Unexpected number of shuffle inputs!");
26488 // Find the inputs that enter the chain. Note that multiple uses are OK
26489 // here, we're not going to remove the operands we find.
26490 bool UnaryShuffle = (Inputs.size() == 1);
26491 SDValue V1 = peekThroughBitcasts(Inputs[0]);
26492 SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
26494 MVT VT1 = V1.getSimpleValueType();
26495 MVT VT2 = V2.getSimpleValueType();
26496 MVT RootVT = Root.getSimpleValueType();
26497 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
26498 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
26499 "Vector size mismatch");
26504 unsigned NumBaseMaskElts = BaseMask.size();
26505 if (NumBaseMaskElts == 1) {
26506 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
26507 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26512 unsigned RootSizeInBits = RootVT.getSizeInBits();
26513 unsigned NumRootElts = RootVT.getVectorNumElements();
26514 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
26515 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
26516 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
26518 // Don't combine if we are a AVX512/EVEX target and the mask element size
26519 // is different from the root element size - this would prevent writemasks
26520 // from being reused.
26521 // TODO - this currently prevents all lane shuffles from occurring.
26522 // TODO - check for writemasks usage instead of always preventing combining.
26523 // TODO - attempt to narrow Mask back to writemask size.
26524 bool IsEVEXShuffle =
26525 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
26526 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
26529 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
26531 // Handle 128-bit lane shuffles of 256-bit vectors.
26532 // TODO - this should support binary shuffles.
26533 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
26534 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
26535 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
26536 return false; // Nothing to do!
26537 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26538 unsigned PermMask = 0;
26539 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
26540 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
26542 Res = DAG.getBitcast(ShuffleVT, V1);
26543 DCI.AddToWorklist(Res.getNode());
26544 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
26545 DAG.getUNDEF(ShuffleVT),
26546 DAG.getConstant(PermMask, DL, MVT::i8));
26547 DCI.AddToWorklist(Res.getNode());
26548 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26553 // For masks that have been widened to 128-bit elements or more,
26554 // narrow back down to 64-bit elements.
26555 SmallVector<int, 64> Mask;
26556 if (BaseMaskEltSizeInBits > 64) {
26557 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
26558 int MaskScale = BaseMaskEltSizeInBits / 64;
26559 scaleShuffleMask(MaskScale, BaseMask, Mask);
26561 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
26564 unsigned NumMaskElts = Mask.size();
26565 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
26567 // Determine the effective mask value type.
26568 FloatDomain &= (32 <= MaskEltSizeInBits);
26569 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
26570 : MVT::getIntegerVT(MaskEltSizeInBits);
26571 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
26573 // Only allow legal mask types.
26574 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
26577 // Attempt to match the mask against known shuffle patterns.
26578 MVT ShuffleSrcVT, ShuffleVT;
26579 unsigned Shuffle, PermuteImm;
26581 if (UnaryShuffle) {
26582 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
26583 // directly if we don't shuffle the lower element and we shuffle the upper
26584 // (zero) elements within themselves.
26585 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
26586 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
26587 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
26588 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
26589 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
26590 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
26591 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26597 if (matchUnaryVectorShuffle(MaskVT, Mask, Subtarget, Shuffle, ShuffleSrcVT,
26599 if (Depth == 1 && Root.getOpcode() == Shuffle)
26600 return false; // Nothing to do!
26601 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26602 return false; // AVX512 Writemask clash.
26603 Res = DAG.getBitcast(ShuffleSrcVT, V1);
26604 DCI.AddToWorklist(Res.getNode());
26605 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
26606 DCI.AddToWorklist(Res.getNode());
26607 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26612 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Subtarget, Shuffle,
26613 ShuffleVT, PermuteImm)) {
26614 if (Depth == 1 && Root.getOpcode() == Shuffle)
26615 return false; // Nothing to do!
26616 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26617 return false; // AVX512 Writemask clash.
26618 Res = DAG.getBitcast(ShuffleVT, V1);
26619 DCI.AddToWorklist(Res.getNode());
26620 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
26621 DAG.getConstant(PermuteImm, DL, MVT::i8));
26622 DCI.AddToWorklist(Res.getNode());
26623 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26629 if (matchBinaryVectorShuffle(MaskVT, Mask, V1, V2, Subtarget, Shuffle,
26630 ShuffleVT, UnaryShuffle)) {
26631 if (Depth == 1 && Root.getOpcode() == Shuffle)
26632 return false; // Nothing to do!
26633 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26634 return false; // AVX512 Writemask clash.
26635 V1 = DAG.getBitcast(ShuffleVT, V1);
26636 DCI.AddToWorklist(V1.getNode());
26637 V2 = DAG.getBitcast(ShuffleVT, V2);
26638 DCI.AddToWorklist(V2.getNode());
26639 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
26640 DCI.AddToWorklist(Res.getNode());
26641 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26646 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, V1, V2, DL, DAG, Subtarget,
26647 Shuffle, ShuffleVT, PermuteImm)) {
26648 if (Depth == 1 && Root.getOpcode() == Shuffle)
26649 return false; // Nothing to do!
26650 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26651 return false; // AVX512 Writemask clash.
26652 V1 = DAG.getBitcast(ShuffleVT, V1);
26653 DCI.AddToWorklist(V1.getNode());
26654 V2 = DAG.getBitcast(ShuffleVT, V2);
26655 DCI.AddToWorklist(V2.getNode());
26656 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
26657 DAG.getConstant(PermuteImm, DL, MVT::i8));
26658 DCI.AddToWorklist(Res.getNode());
26659 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26664 // Don't try to re-form single instruction chains under any circumstances now
26665 // that we've done encoding canonicalization for them.
26669 bool MaskContainsZeros =
26670 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
26672 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
26673 // If we have a single input lane-crossing shuffle then lower to VPERMV.
26674 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26675 ((Subtarget.hasAVX2() &&
26676 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26677 (Subtarget.hasAVX512() &&
26678 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26679 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26680 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26681 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26682 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26683 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26684 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26685 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26686 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26687 DCI.AddToWorklist(VPermMask.getNode());
26688 Res = DAG.getBitcast(MaskVT, V1);
26689 DCI.AddToWorklist(Res.getNode());
26690 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
26691 DCI.AddToWorklist(Res.getNode());
26692 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26697 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
26698 // vector as the second source.
26699 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26700 ((Subtarget.hasAVX512() &&
26701 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26702 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26703 (Subtarget.hasVLX() &&
26704 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26705 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26706 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26707 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26708 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26709 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26710 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
26711 for (unsigned i = 0; i != NumMaskElts; ++i)
26712 if (Mask[i] == SM_SentinelZero)
26713 Mask[i] = NumMaskElts + i;
26715 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26716 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26717 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26718 DCI.AddToWorklist(VPermMask.getNode());
26719 Res = DAG.getBitcast(MaskVT, V1);
26720 DCI.AddToWorklist(Res.getNode());
26721 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
26722 DCI.AddToWorklist(Zero.getNode());
26723 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
26724 DCI.AddToWorklist(Res.getNode());
26725 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26730 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
26731 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26732 ((Subtarget.hasAVX512() &&
26733 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26734 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26735 (Subtarget.hasVLX() &&
26736 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26737 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26738 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26739 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26740 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26741 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26742 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26743 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26744 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26745 DCI.AddToWorklist(VPermMask.getNode());
26746 V1 = DAG.getBitcast(MaskVT, V1);
26747 DCI.AddToWorklist(V1.getNode());
26748 V2 = DAG.getBitcast(MaskVT, V2);
26749 DCI.AddToWorklist(V2.getNode());
26750 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
26751 DCI.AddToWorklist(Res.getNode());
26752 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26759 // See if we can combine a single input shuffle with zeros to a bit-mask,
26760 // which is much simpler than any shuffle.
26761 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
26762 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
26763 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
26764 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
26765 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
26766 SmallBitVector UndefElts(NumMaskElts, false);
26767 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
26768 for (unsigned i = 0; i != NumMaskElts; ++i) {
26770 if (M == SM_SentinelUndef) {
26771 UndefElts[i] = true;
26774 if (M == SM_SentinelZero)
26776 EltBits[i] = AllOnes;
26778 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
26779 DCI.AddToWorklist(BitMask.getNode());
26780 Res = DAG.getBitcast(MaskVT, V1);
26781 DCI.AddToWorklist(Res.getNode());
26782 unsigned AndOpcode =
26783 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
26784 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
26785 DCI.AddToWorklist(Res.getNode());
26786 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26791 // If we have a single input shuffle with different shuffle patterns in the
26792 // 128-bit lanes then lower to a variable mask VPERMILPS.
26793 // TODO Combine other mask types at higher depths.
26794 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
26795 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26796 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
26797 SmallVector<SDValue, 16> VPermIdx;
26798 for (int M : Mask) {
26800 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
26801 VPermIdx.push_back(Idx);
26803 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
26804 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
26805 DCI.AddToWorklist(VPermMask.getNode());
26806 Res = DAG.getBitcast(MaskVT, V1);
26807 DCI.AddToWorklist(Res.getNode());
26808 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
26809 DCI.AddToWorklist(Res.getNode());
26810 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26815 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
26816 // to VPERMIL2PD/VPERMIL2PS.
26817 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
26818 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
26819 MaskVT == MVT::v8f32)) {
26820 // VPERMIL2 Operation.
26821 // Bits[3] - Match Bit.
26822 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
26823 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
26824 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
26825 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
26826 SmallVector<int, 8> VPerm2Idx;
26827 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
26828 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
26829 unsigned M2ZImm = 0;
26830 for (int M : Mask) {
26831 if (M == SM_SentinelUndef) {
26832 VPerm2Idx.push_back(-1);
26835 if (M == SM_SentinelZero) {
26837 VPerm2Idx.push_back(8);
26840 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
26841 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
26842 VPerm2Idx.push_back(Index);
26844 V1 = DAG.getBitcast(MaskVT, V1);
26845 DCI.AddToWorklist(V1.getNode());
26846 V2 = DAG.getBitcast(MaskVT, V2);
26847 DCI.AddToWorklist(V2.getNode());
26848 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
26849 DCI.AddToWorklist(VPerm2MaskOp.getNode());
26850 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
26851 DAG.getConstant(M2ZImm, DL, MVT::i8));
26852 DCI.AddToWorklist(Res.getNode());
26853 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26858 // If we have 3 or more shuffle instructions or a chain involving a variable
26859 // mask, we can replace them with a single PSHUFB instruction profitably.
26860 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
26861 // instructions, but in practice PSHUFB tends to be *very* fast so we're
26862 // more aggressive.
26863 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26864 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26865 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
26866 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
26867 SmallVector<SDValue, 16> PSHUFBMask;
26868 int NumBytes = RootVT.getSizeInBits() / 8;
26869 int Ratio = NumBytes / NumMaskElts;
26870 for (int i = 0; i < NumBytes; ++i) {
26871 int M = Mask[i / Ratio];
26872 if (M == SM_SentinelUndef) {
26873 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
26876 if (M == SM_SentinelZero) {
26877 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
26880 M = Ratio * M + i % Ratio;
26881 assert ((M / 16) == (i / 16) && "Lane crossing detected");
26882 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
26884 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
26885 Res = DAG.getBitcast(ByteVT, V1);
26886 DCI.AddToWorklist(Res.getNode());
26887 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
26888 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
26889 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
26890 DCI.AddToWorklist(Res.getNode());
26891 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26896 // With XOP, if we have a 128-bit binary input shuffle we can always combine
26897 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
26898 // slower than PSHUFB on targets that support both.
26899 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
26900 Subtarget.hasXOP()) {
26901 // VPPERM Mask Operation
26902 // Bits[4:0] - Byte Index (0 - 31)
26903 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
26904 SmallVector<SDValue, 16> VPPERMMask;
26906 int Ratio = NumBytes / NumMaskElts;
26907 for (int i = 0; i < NumBytes; ++i) {
26908 int M = Mask[i / Ratio];
26909 if (M == SM_SentinelUndef) {
26910 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
26913 if (M == SM_SentinelZero) {
26914 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
26917 M = Ratio * M + i % Ratio;
26918 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
26920 MVT ByteVT = MVT::v16i8;
26921 V1 = DAG.getBitcast(ByteVT, V1);
26922 DCI.AddToWorklist(V1.getNode());
26923 V2 = DAG.getBitcast(ByteVT, V2);
26924 DCI.AddToWorklist(V2.getNode());
26925 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
26926 DCI.AddToWorklist(VPPERMMaskOp.getNode());
26927 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
26928 DCI.AddToWorklist(Res.getNode());
26929 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26934 // Failed to find any combines.
26938 // Attempt to constant fold all of the constant source ops.
26939 // Returns true if the entire shuffle is folded to a constant.
26940 // TODO: Extend this to merge multiple constant Ops and update the mask.
//
// Ops             - shuffle source operands; each one is queried for constant
//                   bits via getTargetConstantBitsFromNode.
// Mask            - shuffle mask over the concatenation of Ops. A non-negative
//                   entry M selects element (M % Mask.size()) of
//                   Ops[M / Mask.size()]; SM_SentinelUndef and SM_SentinelZero
//                   mark undef and zero result elements.
// Root            - node whose type determines the result type and which is
//                   replaced (DCI.CombineTo) by the folded constant.
// HasVariableMask - allows the fold even when no source op has a single use.
26941 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
26942 ArrayRef<int> Mask, SDValue Root,
26943 bool HasVariableMask, SelectionDAG &DAG,
26944 TargetLowering::DAGCombinerInfo &DCI,
26945 const X86Subtarget &Subtarget) {
26946 MVT VT = Root.getSimpleValueType();
26948 unsigned SizeInBits = VT.getSizeInBits();
26949 unsigned NumMaskElts = Mask.size();
// Width in bits of the value selected by one mask element.
26950 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
26951 unsigned NumOps = Ops.size();
26953 // Extract constant bits from each source op.
// UndefEltsOps[i] is handed to getTargetConstantBitsFromNode for Ops[i];
// RawBitsOps[i] is read further down as that op's per-element bits.
26954 bool OneUseConstantOp = false;
26955 SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
26956 SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
26957 for (unsigned i = 0; i != NumOps; ++i) {
26958 SDValue SrcOp = Ops[i];
// Track whether any source op has exactly one use (see the bloat check below).
26959 OneUseConstantOp |= SrcOp.hasOneUse();
26960 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
26965 // Only fold if at least one of the constants is only used once or
26966 // the combined shuffle has included a variable mask shuffle, this
26967 // is to avoid constant pool bloat.
26968 if (!OneUseConstantOp && !HasVariableMask)
26971 // Shuffle the constant bits according to the mask.
// Each result element ends up flagged in one of the three bit vectors below;
// ConstantBitData starts as all-zero values of MaskSizeInBits bits.
26972 SmallBitVector UndefElts(NumMaskElts, false);
26973 SmallBitVector ZeroElts(NumMaskElts, false);
26974 SmallBitVector ConstantElts(NumMaskElts, false);
26975 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
26976 APInt::getNullValue(MaskSizeInBits));
26977 for (unsigned i = 0; i != NumMaskElts; ++i) {
// Sentinel mask values map directly to undef / zero result elements.
26979 if (M == SM_SentinelUndef) {
26980 UndefElts[i] = true;
26982 } else if (M == SM_SentinelZero) {
26983 ZeroElts[i] = true;
26986 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
// Split the mask index into (source operand, element within that operand).
26988 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
26989 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
// An undef source element produces an undef result element.
26991 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
26992 if (SrcUndefElts[SrcMaskIdx]) {
26993 UndefElts[i] = true;
26997 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
26998 APInt &Bits = SrcEltBits[SrcMaskIdx];
27000 ZeroElts[i] = true;
27004 ConstantElts[i] = true;
27005 ConstantBitData[i] = Bits;
// Sanity check: every result element was classified as undef, zero or constant.
27007 assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
27009 // Create the constant data.
// Scalar type of the constant: a floating point type when VT is floating
// point with 32- or 64-bit elements; MVT::getIntegerVT(MaskSizeInBits) is the
// integer alternative.
27011 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27012 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27014 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27016 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
// Build the constant vector and replace Root with it, bitcast to Root's type.
27019 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27020 DCI.AddToWorklist(CstOp.getNode());
27021 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27025 /// \brief Fully generic combining of x86 shuffle instructions.
27027 /// This should be the last combine run over the x86 shuffle instructions. Once
27028 /// they have been fully optimized, this will recursively consider all chains
27029 /// of single-use shuffle instructions, build a generic model of the cumulative
27030 /// shuffle operation, and check for simpler instructions which implement this
27031 /// operation. We use this primarily for two purposes:
27033 /// 1) Collapse generic shuffles to specialized single instructions when
27034 /// equivalent. In most cases, this is just an encoding size win, but
27035 /// sometimes we will collapse multiple generic shuffles into a single
27036 /// special-purpose shuffle.
27037 /// 2) Look for sequences of shuffle instructions with 3 or more total
27038 /// instructions, and replace them with the slightly more expensive SSSE3
27039 /// PSHUFB instruction if available. We do this as the last combining step
27040 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27041 /// a suitable short sequence of other instructions. The PSHUFB will either
27042 /// use a register or have to read from memory and so is slightly (but only
27043 /// slightly) more expensive than the other shuffle instructions.
27045 /// Because this is inherently a quadratic operation (for each shuffle in
27046 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27047 /// This should never be an issue in practice as the shuffle lowering doesn't
27048 /// produce sequences of more than 8 instructions.
27050 /// FIXME: We will currently miss some cases where the redundant shuffling
27051 /// would simplify under the threshold for PSHUFB formation because of
27052 /// combine-ordering. To fix this, we should do the redundant instruction
27053 /// combining in this recursive walk.
///
/// \param SrcOps          Current list of shuffle source operands.
/// \param SrcOpIndex      Index into SrcOps of the operand that is looked
///                        through (as a target shuffle) on this step.
/// \param Root            Shuffle node being combined; it is replaced via
///                        DCI.CombineTo when a simplification is found.
/// \param RootMask        Mask accumulated so far, indexing the concatenation
///                        of SrcOps; negative entries are undef/zero sentinels.
/// \param Depth           Recursion depth; each recursive call passes Depth + 1.
/// \param HasVariableMask Becomes true once isTargetShuffleVariableMask reports
///                        a variable-mask shuffle on the chain.
27054 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27055 int SrcOpIndex, SDValue Root,
27056 ArrayRef<int> RootMask,
27057 int Depth, bool HasVariableMask,
27059 TargetLowering::DAGCombinerInfo &DCI,
27060 const X86Subtarget &Subtarget) {
27061 // Bound the depth of our recursive combine because this is ultimately
27062 // quadratic in nature.
27066 // Directly rip through bitcasts to find the underlying operand.
27067 SDValue Op = SrcOps[SrcOpIndex];
27068 Op = peekThroughOneUseBitcasts(Op);
27070 MVT VT = Op.getSimpleValueType();
27071 if (!VT.isVector())
27072 return false; // Bail if we hit a non-vector.
27074 assert(Root.getSimpleValueType().isVector() &&
27075 "Shuffles operate on vector types!");
27076 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27077 "Can only combine shuffles of the same vector register size.");
27079 // Extract target shuffle mask and resolve sentinels and inputs.
27080 SDValue Input0, Input1;
27081 SmallVector<int, 16> OpMask;
27082 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
27085 // Add the inputs to the Ops list, avoiding duplicates.
27086 SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
// InputIdx0 / InputIdx1 hold the position of Input0 / Input1 inside Ops, or -1
// while that position is still unknown. Inputs are matched against existing
// entries after looking through bitcasts on both sides.
27088 int InputIdx0 = -1, InputIdx1 = -1;
27089 for (int i = 0, e = Ops.size(); i < e; ++i) {
27090 SDValue BC = peekThroughBitcasts(Ops[i]);
27091 if (Input0 && BC == peekThroughBitcasts(Input0))
27093 if (Input1 && BC == peekThroughBitcasts(Input1))
// An Input0 that was not found takes over the slot of the op being looked
// through; an Input1 that was not found is appended to the end of Ops.
27097 if (Input0 && InputIdx0 < 0) {
27098 InputIdx0 = SrcOpIndex;
27099 Ops[SrcOpIndex] = Input0;
27101 if (Input1 && InputIdx1 < 0) {
27102 InputIdx1 = Ops.size();
27103 Ops.push_back(Input1);
27106 assert(((RootMask.size() > OpMask.size() &&
27107 RootMask.size() % OpMask.size() == 0) ||
27108 (OpMask.size() > RootMask.size() &&
27109 OpMask.size() % RootMask.size() == 0) ||
27110 OpMask.size() == RootMask.size()) &&
27111 "The smaller number of elements must divide the larger.");
// MaskWidth is the larger of the two mask sizes. RootRatio / OpRatio give how
// many merged-mask elements one RootMask / OpMask element expands to; at most
// one of them is greater than 1 (asserted below).
27112 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27113 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27114 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27115 assert(((RootRatio == 1 && OpRatio == 1) ||
27116 (RootRatio == 1) != (OpRatio == 1)) &&
27117 "Must not have a ratio for both incoming and op masks!");
27119 SmallVector<int, 16> Mask;
27120 Mask.reserve(MaskWidth);
27122 // Merge this shuffle operation's mask into our accumulated mask. Note that
27123 // this shuffle's mask will be the first applied to the input, followed by the
27124 // root mask to get us all the way to the root value arrangement. The reason
27125 // for this order is that we are recursing up the operation chain.
27126 for (int i = 0; i < MaskWidth; ++i) {
27127 int RootIdx = i / RootRatio;
27128 if (RootMask[RootIdx] < 0) {
27129 // This is a zero or undef lane, we're done.
27130 Mask.push_back(RootMask[RootIdx]);
// Scale the root mask entry up to MaskWidth granularity.
27134 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27136 // Just insert the scaled root mask value if it references an input other
27137 // than the SrcOp we're currently inserting.
27138 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27139 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27140 Mask.push_back(RootMaskedIdx);
// Reduce to an index within the op being looked through.
27144 RootMaskedIdx %= MaskWidth;
27146 int OpIdx = RootMaskedIdx / OpRatio;
27147 if (OpMask[OpIdx] < 0) {
27148 // The incoming lanes are zero or undef, it doesn't matter which ones we
27150 Mask.push_back(OpMask[OpIdx]);
27154 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27155 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27156 OpMaskedIdx %= MaskWidth;
// OpMask entries below OpMask.size() select from the op's first input; the
// remaining entries select from its second input. Rebase the index onto the
// position of that input within Ops.
27158 if (OpMask[OpIdx] < (int)OpMask.size()) {
27159 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27160 OpMaskedIdx += InputIdx0 * MaskWidth;
27162 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27163 OpMaskedIdx += InputIdx1 * MaskWidth;
27166 Mask.push_back(OpMaskedIdx);
27169 // Handle the all undef/zero cases early.
27170 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27171 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27174 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27175 // TODO - should we handle the mixed zero/undef case as well? Just returning
27176 // a zero mask will lose information on undef elements possibly reducing
27177 // future combine possibilities.
27178 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27179 Subtarget, DAG, SDLoc(Root)));
27183 // Remove unused shuffle source ops.
// [lo, hi) is the range of mask indices that selects Ops[i], given the number
// of ops kept so far; an op is kept when any mask entry falls in that range.
27184 SmallVector<SDValue, 8> UsedOps;
27185 for (int i = 0, e = Ops.size(); i < e; ++i) {
27186 int lo = UsedOps.size() * MaskWidth;
27187 int hi = lo + MaskWidth;
27188 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
27189 UsedOps.push_back(Ops[i]);
27192 for (int &M : Mask)
27196 assert(!UsedOps.empty() && "Shuffle with no inputs detected");
27199 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27201 // See if we can recurse into each shuffle source op (if it's a target shuffle).
// Recursion is limited to operands that have a single use or whose only user
// is Op.
27202 for (int i = 0, e = Ops.size(); i < e; ++i)
27203 if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
27204 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
27205 HasVariableMask, DAG, DCI, Subtarget))
27208 // Attempt to constant fold all of the constant source ops.
27209 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
27213 // We can only combine unary and binary shuffle mask cases.
27214 if (Ops.size() > 2)
27217 // Minor canonicalization of the accumulated shuffle mask to make it easier
27218 // to match below. All this does is detect masks with sequential pairs of
27219 // elements, and shrink them to the half-width mask. It does this in a loop
27220 // so it will reduce the size of the mask to the minimal width mask which
27221 // performs an equivalent shuffle.
27222 SmallVector<int, 16> WidenedMask;
27223 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
27224 Mask = std::move(WidenedMask);
27227 // Canonicalization of binary shuffle masks to improve pattern matching by
27228 // commuting the inputs.
27229 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
27230 ShuffleVectorSDNode::commuteMask(Mask);
27231 std::swap(Ops[0], Ops[1]);
// Finally try to match the accumulated mask against a single instruction.
27234 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
27238 /// \brief Get the PSHUF-style mask from PSHUF node.
27240 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
27241 /// PSHUF-style masks that can be reused with such instructions.
///
/// \param N A PSHUFD, PSHUFLW or PSHUFHW node; any other opcode reaches
///          llvm_unreachable.
/// For vector types wider than 128 bits the decoded mask is truncated to the
/// elements of the first 128-bit lane.
27242 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
27243 MVT VT = N.getSimpleValueType();
27244 SmallVector<int, 4> Mask;
27245 SmallVector<SDValue, 2> Ops;
// Decode the node's full shuffle mask into Mask.
27248 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
27252 // If we have more than 128-bits, only the low 128-bits of shuffle mask
27253 // matter. Check that the upper masks are repeats and remove them.
27254 if (VT.getSizeInBits() > 128) {
27255 int LaneElts = 128 / VT.getScalarSizeInBits();
// Lane i must hold the same pattern as lane 0, offset by LaneElts * i.
27257 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
27258 for (int j = 0; j < LaneElts; ++j)
27259 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
27260 "Mask doesn't repeat in high 128-bit lanes!");
27262 Mask.resize(LaneElts);
27265 switch (N.getOpcode()) {
27266 case X86ISD::PSHUFD:
27268 case X86ISD::PSHUFLW:
27271 case X86ISD::PSHUFHW:
// Drop the first four mask entries so that only the last four remain.
27272 Mask.erase(Mask.begin(), Mask.begin() + 4);
27273 for (int &M : Mask)
27277 llvm_unreachable("No valid shuffle instruction found!");
27281 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
27283 /// We walk up the chain and look for a combinable shuffle, skipping over
27284 /// shuffles that we could hoist this shuffle's transformation past without
27285 /// altering anything.
///
/// \param N    The node ending the chain; asserted to be X86ISD::PSHUFD.
/// \param Mask Four-element dword mask; Mask[0..3] are inspected to decide
///             whether a PSHUFLW / PSHUFHW on the chain can be skipped, and the
///             mask is used to build the immediate of the combined shuffle.
/// \returns An empty SDValue when nothing is combined; otherwise the rebuilt
///          chain, bitcast to N's type if the types differ.
27287 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
27289 TargetLowering::DAGCombinerInfo &DCI) {
// NOTE(review): the assert message mentions a "half shuffle" although the
// condition checks for PSHUFD - confirm whether the text is intentional.
27290 assert(N.getOpcode() == X86ISD::PSHUFD &&
27291 "Called with something other than an x86 128-bit half shuffle!");
27294 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
27295 // of the shuffles in the chain so that we can form a fresh chain to replace
// the chain ending at N.
27297 SmallVector<SDValue, 8> Chain;
27298 SDValue V = N.getOperand(0);
27299 for (; V.hasOneUse(); V = V.getOperand(0)) {
27300 switch (V.getOpcode()) {
27302 return SDValue(); // Nothing combined!
27305 // Skip bitcasts as we always know the type for the target specific
// shuffle instructions.
27309 case X86ISD::PSHUFD:
27310 // Found another dword shuffle.
27313 case X86ISD::PSHUFLW:
27314 // Check that the low words (being shuffled) are the identity in the
27315 // dword shuffle, and the high words are self-contained.
27316 if (Mask[0] != 0 || Mask[1] != 1 ||
27317 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
27320 Chain.push_back(V);
27323 case X86ISD::PSHUFHW:
27324 // Check that the high words (being shuffled) are the identity in the
27325 // dword shuffle, and the low words are self-contained.
27326 if (Mask[2] != 2 || Mask[3] != 3 ||
27327 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
27330 Chain.push_back(V);
27333 case X86ISD::UNPCKL:
27334 case X86ISD::UNPCKH:
27335 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
27336 // shuffle into a preceding word shuffle.
27337 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
27338 V.getSimpleValueType().getVectorElementType() != MVT::i16)
27341 // Search for a half-shuffle which we can combine with.
// UNPCKL pairs with PSHUFLW and UNPCKH with PSHUFHW. Both unpack operands
// must be the same value and the unpack must be that value's only user.
27342 unsigned CombineOp =
27343 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
27344 if (V.getOperand(0) != V.getOperand(1) ||
27345 !V->isOnlyUserOf(V.getOperand(0).getNode()))
27347 Chain.push_back(V);
27348 V = V.getOperand(0);
27350 switch (V.getOpcode()) {
27352 return SDValue(); // Nothing to combine.
27354 case X86ISD::PSHUFLW:
27355 case X86ISD::PSHUFHW:
27356 if (V.getOpcode() == CombineOp)
27359 Chain.push_back(V);
27363 V = V.getOperand(0);
27367 } while (V.hasOneUse());
27370 // Break out of the loop if we break out of the switch.
27374 if (!V.hasOneUse())
27375 // We fell out of the loop without finding a viable combining instruction.
27378 // Merge this node's mask and our incoming mask.
27379 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27380 for (int &M : Mask)
// Re-create V with the same opcode, type and input but with an immediate built
// from Mask.
27382 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
27383 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27385 // Rebuild the chain around this new shuffle.
// Chain is consumed from its back; a bitcast is inserted whenever the value
// type does not match the type the recorded node expects for its operand.
27386 while (!Chain.empty()) {
27387 SDValue W = Chain.pop_back_val();
27389 if (V.getValueType() != W.getOperand(0).getValueType())
27390 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
27392 switch (W.getOpcode()) {
27394 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
27396 case X86ISD::UNPCKL:
27397 case X86ISD::UNPCKH:
// Unpacks were recorded only with identical operands, so V feeds both.
27398 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
27401 case X86ISD::PSHUFD:
27402 case X86ISD::PSHUFLW:
27403 case X86ISD::PSHUFHW:
// Re-create the shuffle on the new input, reusing its original immediate.
27404 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
27408 if (V.getValueType() != N.getValueType())
27409 V = DAG.getBitcast(N.getValueType(), V);
27411 // Return the new chain to replace N.
27415 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
/// pshufhw.
27418 /// We walk up the chain, skipping shuffles of the other half and looking
27419 /// through shuffles which switch halves trying to find a shuffle of the same
27420 /// pair of dwords.
///
/// \param N    The node ending the chain; asserted to be X86ISD::PSHUFLW or
///             X86ISD::PSHUFHW.
/// \param Mask Incoming four-element mask that is merged with the mask of the
///             shuffle found further up the chain.
/// \returns false when nothing is combined.
27421 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
27423 TargetLowering::DAGCombinerInfo &DCI) {
27425 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
27426 "Called with something other than an x86 128-bit half shuffle!");
// Only a shuffle with the same opcode as N is a candidate for merging.
27428 unsigned CombineOpcode = N.getOpcode();
27430 // Walk up a single-use chain looking for a combinable shuffle.
27431 SDValue V = N.getOperand(0);
27432 for (; V.hasOneUse(); V = V.getOperand(0)) {
27433 switch (V.getOpcode()) {
27435 return false; // Nothing combined!
27438 // Skip bitcasts as we always know the type for the target specific
// shuffle instructions.
27442 case X86ISD::PSHUFLW:
27443 case X86ISD::PSHUFHW:
27444 if (V.getOpcode() == CombineOpcode)
27447 // Other-half shuffles are no-ops.
27450 // Break out of the loop if we break out of the switch.
27454 if (!V.hasOneUse())
27455 // We fell out of the loop without finding a viable combining instruction.
27458 // Combine away the bottom node as its shuffle will be accumulated into
27459 // a preceding shuffle.
// N is replaced by its own input operand here.
27460 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27462 // Record the old value.
27465 // Merge this node's mask and our incoming mask (adjusted to account for all
27466 // the pshufd instructions encountered).
27467 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27468 for (int &M : Mask)
// Re-create the found shuffle as a v8i16 node on the same input with an
// immediate built from Mask.
27470 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
27471 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27473 // Check that the shuffles didn't cancel each other out. If not, we need to
27474 // combine to the new one.
27476 // Replace the combinable shuffle with the combined one, updating all users
27477 // so that we re-evaluate the chain here.
27478 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
27483 /// \brief Try to combine x86 target specific shuffles.
27484 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
27485 TargetLowering::DAGCombinerInfo &DCI,
27486 const X86Subtarget &Subtarget) {
27488 MVT VT = N.getSimpleValueType();
27489 SmallVector<int, 4> Mask;
27491 unsigned Opcode = N.getOpcode();
27493 case X86ISD::PSHUFD:
27494 case X86ISD::PSHUFLW:
27495 case X86ISD::PSHUFHW:
27496 Mask = getPSHUFShuffleMask(N);
27497 assert(Mask.size() == 4);
27499 case X86ISD::UNPCKL: {
27500 auto Op0 = N.getOperand(0);
27501 auto Op1 = N.getOperand(1);
27502 unsigned Opcode0 = Op0.getOpcode();
27503 unsigned Opcode1 = Op1.getOpcode();
27505 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
27506 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
27507 // TODO: Add other horizontal operations as required.
27508 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
27509 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
27511 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
27512 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
27513 // moves upper half elements into the lower half part. For example:
27515 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
27517 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
27519 // will be combined to:
27521 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
27523 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
27524 // happen due to advanced instructions.
27525 if (!VT.is128BitVector())
27528 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
27529 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
27531 unsigned NumElts = VT.getVectorNumElements();
27532 SmallVector<int, 8> ExpectedMask(NumElts, -1);
27533 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
27536 auto ShufOp = Op1.getOperand(0);
27537 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
27538 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
27542 case X86ISD::BLENDI: {
27543 SDValue V0 = N->getOperand(0);
27544 SDValue V1 = N->getOperand(1);
27545 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
27546 "Unexpected input vector types");
27548 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
27549 // operands and changing the mask to 1. This saves us a bunch of
27550 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
27551 // x86InstrInfo knows how to commute this back after instruction selection
27552 // if it would help register allocation.
27554 // TODO: If optimizing for size or a processor that doesn't suffer from
27555 // partial register update stalls, this should be transformed into a MOVSD
27556 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
27558 if (VT == MVT::v2f64)
27559 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
27560 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
27561 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
27562 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
27567 case X86ISD::MOVSD:
27568 case X86ISD::MOVSS: {
27569 bool isFloat = VT.isFloatingPoint();
27570 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
27571 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
27572 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
27573 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
27574 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
27575 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
27576 assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
27578 // We often lower to MOVSD/MOVSS from integer as well as native float
27579 // types; remove unnecessary domain-crossing bitcasts if we can to make it
27580 // easier to combine shuffles later on. We've already accounted for the
27581 // domain switching cost when we decided to lower with it.
27582 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
27583 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
27584 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
27585 V0 = DAG.getBitcast(NewVT, V0);
27586 V1 = DAG.getBitcast(NewVT, V1);
27587 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
27592 case X86ISD::INSERTPS: {
27593 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
27594 SDValue Op0 = N.getOperand(0);
27595 SDValue Op1 = N.getOperand(1);
27596 SDValue Op2 = N.getOperand(2);
27597 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
27598 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
27599 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
27600 unsigned ZeroMask = InsertPSMask & 0xF;
27602 // If we zero out all elements from Op0 then we don't need to reference it.
27603 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
27604 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
27605 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27607 // If we zero out the element from Op1 then we don't need to reference it.
27608 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
27609 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27610 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27612 // Attempt to merge insertps Op1 with an inner target shuffle node.
27613 SmallVector<int, 8> TargetMask1;
27614 SmallVector<SDValue, 2> Ops1;
27615 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
27616 int M = TargetMask1[SrcIdx];
27617 if (isUndefOrZero(M)) {
27618 // Zero/UNDEF insertion - zero out element and remove dependency.
27619 InsertPSMask |= (1u << DstIdx);
27620 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27621 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27623 // Update insertps mask srcidx and reference the source input directly.
27624 assert(0 <= M && M < 8 && "Shuffle index out of range");
27625 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
27626 Op1 = Ops1[M < 4 ? 0 : 1];
27627 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27628 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27631 // Attempt to merge insertps Op0 with an inner target shuffle node.
27632 SmallVector<int, 8> TargetMask0;
27633 SmallVector<SDValue, 2> Ops0;
27634 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
27637 bool Updated = false;
27638 bool UseInput00 = false;
27639 bool UseInput01 = false;
27640 for (int i = 0; i != 4; ++i) {
27641 int M = TargetMask0[i];
27642 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
27643 // No change if element is already zero or the inserted element.
27645 } else if (isUndefOrZero(M)) {
27646 // If the target mask is undef/zero then we must zero the element.
27647 InsertPSMask |= (1u << i);
27652 // The input vector element must be inline.
27653 if (M != i && M != (i + 4))
27656 // Determine which inputs of the target shuffle we're using.
27657 UseInput00 |= (0 <= M && M < 4);
27658 UseInput01 |= (4 <= M);
27661 // If we're not using both inputs of the target shuffle then use the
27662 // referenced input directly.
27663 if (UseInput00 && !UseInput01) {
27666 } else if (!UseInput00 && UseInput01) {
27672 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27673 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27681 // Nuke no-op shuffles that show up after combining.
27682 if (isNoopShuffleMask(Mask))
27683 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27685 // Look for simplifications involving one or two shuffle instructions.
27686 SDValue V = N.getOperand(0);
27687 switch (N.getOpcode()) {
27690 case X86ISD::PSHUFLW:
27691 case X86ISD::PSHUFHW:
27692 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
27694 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
27695 return SDValue(); // We combined away this shuffle, so we're done.
27697 // See if this reduces to a PSHUFD which is no more expensive and can
27698 // combine with more operations. Note that it has to at least flip the
27699 // dwords as otherwise it would have been removed as a no-op.
27700 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
27701 int DMask[] = {0, 1, 2, 3};
27702 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
27703 DMask[DOffset + 0] = DOffset + 1;
27704 DMask[DOffset + 1] = DOffset + 0;
27705 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27706 V = DAG.getBitcast(DVT, V);
27707 DCI.AddToWorklist(V.getNode());
27708 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
27709 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
27710 DCI.AddToWorklist(V.getNode());
27711 return DAG.getBitcast(VT, V);
27714 // Look for shuffle patterns which can be implemented as a single unpack.
27715 // FIXME: This doesn't handle the location of the PSHUFD generically, and
27716 // only works when we have a PSHUFD followed by two half-shuffles.
27717 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
27718 (V.getOpcode() == X86ISD::PSHUFLW ||
27719 V.getOpcode() == X86ISD::PSHUFHW) &&
27720 V.getOpcode() != N.getOpcode() &&
27722 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
27723 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
27724 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27725 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
27726 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27727 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
27729 for (int i = 0; i < 4; ++i) {
27730 WordMask[i + NOffset] = Mask[i] + NOffset;
27731 WordMask[i + VOffset] = VMask[i] + VOffset;
27733 // Map the word mask through the DWord mask.
27735 for (int i = 0; i < 8; ++i)
27736 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
27737 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
27738 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
27739 // We can replace all three shuffles with an unpack.
27740 V = DAG.getBitcast(VT, D.getOperand(0));
27741 DCI.AddToWorklist(V.getNode());
27742 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
27751 case X86ISD::PSHUFD:
27752 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
27761 /// \brief Try to combine a shuffle into a target-specific add-sub node.
27763 /// We combine this directly on the abstract vector shuffle nodes so it is
27764 /// easier to generically match. We also insert dummy vector shuffle nodes for
27765 /// the operands which explicitly discard the lanes which are unused by this
27766 /// operation to try to flow through the rest of the combiner the fact that
27767 /// they're unused.
27768 static SDValue combineShuffleToAddSub(SDNode *N, const X86Subtarget &Subtarget,
27769 SelectionDAG &DAG) {
27771 EVT VT = N->getValueType(0);
27772 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
27773 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)))
27776 // We only handle target-independent shuffles.
27777 // FIXME: It would be easy and harmless to use the target shuffle mask
27778 // extraction tool to support more.
27779 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
27782 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
27783 SmallVector<int, 8> Mask(OrigMask.begin(), OrigMask.end());
27785 SDValue V1 = N->getOperand(0);
27786 SDValue V2 = N->getOperand(1);
27788 // We require the first shuffle operand to be the FSUB node, and the second to
27789 // be the FADD node.
27790 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
27791 ShuffleVectorSDNode::commuteMask(Mask);
27793 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
27796 // If there are other uses of these operations we can't fold them.
27797 if (!V1->hasOneUse() || !V2->hasOneUse())
27800 // Ensure that both operations have the same operands. Note that we can
27801 // commute the FADD operands.
27802 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
27803 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
27804 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
27807 // We're looking for blends between FADD and FSUB nodes. We insist on these
27808 // nodes being lined up in a specific expected pattern.
27809 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
27810 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
27811 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15})))
27814 return DAG.getNode(X86ISD::ADDSUB, DL, VT, LHS, RHS);
27817 // We are looking for a shuffle where both sources are concatenated with undef
27818 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
27819 // if we can express this as a single-source shuffle, that's preferable.
27820 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
27821 const X86Subtarget &Subtarget) {
27822 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
27825 EVT VT = N->getValueType(0);
27827 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
27828 if (!VT.is128BitVector() && !VT.is256BitVector())
27831 if (VT.getVectorElementType() != MVT::i32 &&
27832 VT.getVectorElementType() != MVT::i64 &&
27833 VT.getVectorElementType() != MVT::f32 &&
27834 VT.getVectorElementType() != MVT::f64)
27837 SDValue N0 = N->getOperand(0);
27838 SDValue N1 = N->getOperand(1);
27840 // Check that both sources are concats with undef.
27841 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
27842 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
27843 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
27844 !N1.getOperand(1).isUndef())
27847 // Construct the new shuffle mask. Elements from the first source retain their
27848 // index, but elements from the second source no longer need to skip an undef.
27849 SmallVector<int, 8> Mask;
27850 int NumElts = VT.getVectorNumElements();
27852 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27853 for (int Elt : SVOp->getMask())
27854 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
27857 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
27859 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
27862 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
27863 TargetLowering::DAGCombinerInfo &DCI,
27864 const X86Subtarget &Subtarget) {
27866 EVT VT = N->getValueType(0);
27868 // Don't create instructions with illegal types after legalize types has run.
27869 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
27870 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
27873 // If we have legalized the vector types, look for blends of FADD and FSUB
27874 // nodes that we can fuse into an ADDSUB node.
27875 if (TLI.isTypeLegal(VT))
27876 if (SDValue AddSub = combineShuffleToAddSub(N, Subtarget, DAG))
27879 // During Type Legalization, when promoting illegal vector types,
27880 // the backend might introduce new shuffle dag nodes and bitcasts.
27882 // This code performs the following transformation:
27883 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
27884 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
27886 // We do this only if both the bitcast and the BINOP dag nodes have
27887 // one use. Also, perform this transformation only if the new binary
27888 // operation is legal. This is to avoid introducing dag nodes that
27889 // potentially need to be further expanded (or custom lowered) into a
27890 // less optimal sequence of dag nodes.
27891 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
27892 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
27893 N->getOperand(0).getOpcode() == ISD::BITCAST &&
27894 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
27895 SDValue N0 = N->getOperand(0);
27896 SDValue N1 = N->getOperand(1);
27898 SDValue BC0 = N0.getOperand(0);
27899 EVT SVT = BC0.getValueType();
27900 unsigned Opcode = BC0.getOpcode();
27901 unsigned NumElts = VT.getVectorNumElements();
27903 if (BC0.hasOneUse() && SVT.isVector() &&
27904 SVT.getVectorNumElements() * 2 == NumElts &&
27905 TLI.isOperationLegal(Opcode, VT)) {
27906 bool CanFold = false;
27912 // isOperationLegal lies for integer ops on floating point types.
27913 CanFold = VT.isInteger();
27918 // isOperationLegal lies for floating point ops on integer types.
27919 CanFold = VT.isFloatingPoint();
27923 unsigned SVTNumElts = SVT.getVectorNumElements();
27924 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
27925 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
27926 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
27927 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
27928 CanFold = SVOp->getMaskElt(i) < 0;
27931 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
27932 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
27933 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
27934 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
27939 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
27940 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
27941 // consecutive, non-overlapping, and in the right order.
27942 SmallVector<SDValue, 16> Elts;
27943 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
27944 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
27946 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
27949 // For AVX2, we sometimes want to combine
27950 // (vector_shuffle <mask> (concat_vectors t1, undef)
27951 // (concat_vectors t2, undef))
27953 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
27954 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
27955 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
27958 if (isTargetShuffle(N->getOpcode())) {
27960 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
27963 // Try recursively combining arbitrary sequences of x86 shuffle
27964 // instructions into higher-order shuffles. We do this after combining
27965 // specific PSHUF instruction sequences into their minimal form so that we
27966 // can evaluate how many specialized shuffle instructions are involved in
27967 // a particular chain.
27968 SmallVector<int, 1> NonceMask; // Just a placeholder.
27969 NonceMask.push_back(0);
27970 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
27971 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
27973 return SDValue(); // This routine will use CombineTo to replace N.
27979 /// Check if a vector extract from a target-specific shuffle of a load can be
27980 /// folded into a single element load.
27981 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
27982 /// shuffles have been custom lowered so we need to handle those here.
27983 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
27984 TargetLowering::DAGCombinerInfo &DCI) {
27985 if (DCI.isBeforeLegalizeOps())
27988 SDValue InVec = N->getOperand(0);
27989 SDValue EltNo = N->getOperand(1);
27990 EVT EltVT = N->getValueType(0);
27992 if (!isa<ConstantSDNode>(EltNo))
27995 EVT OriginalVT = InVec.getValueType();
27997 if (InVec.getOpcode() == ISD::BITCAST) {
27998 // Don't duplicate a load with other uses.
27999 if (!InVec.hasOneUse())
28001 EVT BCVT = InVec.getOperand(0).getValueType();
28002 if (!BCVT.isVector() ||
28003 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28005 InVec = InVec.getOperand(0);
28008 EVT CurrentVT = InVec.getValueType();
28010 if (!isTargetShuffle(InVec.getOpcode()))
28013 // Don't duplicate a load with other uses.
28014 if (!InVec.hasOneUse())
28017 SmallVector<int, 16> ShuffleMask;
28018 SmallVector<SDValue, 2> ShuffleOps;
28020 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28021 ShuffleOps, ShuffleMask, UnaryShuffle))
28024 // Select the input vector, guarding against out of range extract vector.
28025 unsigned NumElems = CurrentVT.getVectorNumElements();
28026 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28027 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28029 if (Idx == SM_SentinelZero)
28030 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28031 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28032 if (Idx == SM_SentinelUndef)
28033 return DAG.getUNDEF(EltVT);
28035 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28036 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28039 // If inputs to shuffle are the same for both ops, then allow 2 uses
28040 unsigned AllowedUses =
28041 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28043 if (LdNode.getOpcode() == ISD::BITCAST) {
28044 // Don't duplicate a load with other uses.
28045 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28048 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28049 LdNode = LdNode.getOperand(0);
28052 if (!ISD::isNormalLoad(LdNode.getNode()))
28055 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28057 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28060 // If there's a bitcast before the shuffle, check if the load type and
28061 // alignment is valid.
28062 unsigned Align = LN0->getAlignment();
28063 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28064 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28065 EltVT.getTypeForEVT(*DAG.getContext()));
28067 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28070 // All checks match so transform back to vector_shuffle so that DAG combiner
28071 // can finish the job
28074 // Create shuffle node taking into account the case that its a unary shuffle
28075 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28076 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28078 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28079 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28083 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28084 const X86Subtarget &Subtarget) {
28085 SDValue N0 = N->getOperand(0);
28086 EVT VT = N->getValueType(0);
28088 // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
28089 // special and don't usually play with other vector types, it's better to
28090 // handle them early to be sure we emit efficient code by avoiding
28091 // store-load conversions.
28092 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28093 N0.getValueType() == MVT::v2i32 &&
28094 isNullConstant(N0.getOperand(1))) {
28095 SDValue N00 = N0->getOperand(0);
28096 if (N00.getValueType() == MVT::i32)
28097 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28100 // Convert a bitcasted integer logic operation that has one bitcasted
28101 // floating-point operand into a floating-point logic operation. This may
28102 // create a load of a constant, but that is cheaper than materializing the
28103 // constant in an integer register and transferring it to an SSE register or
28104 // transferring the SSE operand to integer register and back.
28106 switch (N0.getOpcode()) {
28107 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28108 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28109 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28110 default: return SDValue();
28113 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28114 (Subtarget.hasSSE2() && VT == MVT::f64)))
28117 SDValue LogicOp0 = N0.getOperand(0);
28118 SDValue LogicOp1 = N0.getOperand(1);
28121 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28122 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28123 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28124 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28125 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28126 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28128 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28129 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28130 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28131 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28132 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28133 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28139 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28140 // the elements of a vector.
28141 // Returns the vector that is being reduced on, or SDValue() if a reduction
28142 // was not matched.
28143 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28144 // The pattern must end in an extract from index 0.
28145 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28146 !isNullConstant(Extract->getOperand(1)))
28150 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28152 SDValue Op = Extract->getOperand(0);
28153 // At each stage, we're looking for something that looks like:
28154 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28155 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28156 // i32 undef, i32 undef, i32 undef, i32 undef>
28157 // %a = binop <8 x i32> %op, %s
28158 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28159 // we expect something like:
28160 // <4,5,6,7,u,u,u,u>
28161 // <2,3,u,u,u,u,u,u>
28162 // <1,u,u,u,u,u,u,u>
28163 for (unsigned i = 0; i < Stages; ++i) {
28164 if (Op.getOpcode() != BinOp)
28167 ShuffleVectorSDNode *Shuffle =
28168 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28170 Op = Op.getOperand(1);
28172 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28173 Op = Op.getOperand(0);
28176 // The first operand of the shuffle should be the same as the other operand
28178 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28181 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28182 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28183 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
28190 // Given a select, detect the following pattern:
28191 // 1: %2 = zext <N x i8> %0 to <N x i32>
28192 // 2: %3 = zext <N x i8> %1 to <N x i32>
28193 // 3: %4 = sub nsw <N x i32> %2, %3
28194 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
28195 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
28196 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
28197 // This is useful as it is the input into a SAD pattern.
28198 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
28200 // Check the condition of the select instruction is greater-than.
28201 SDValue SetCC = Select->getOperand(0);
28202 if (SetCC.getOpcode() != ISD::SETCC)
28204 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
28205 if (CC != ISD::SETGT)
28208 SDValue SelectOp1 = Select->getOperand(1);
28209 SDValue SelectOp2 = Select->getOperand(2);
28211 // The second operand of the select should be the negation of the first
28212 // operand, which is implemented as 0 - SelectOp1.
28213 if (!(SelectOp2.getOpcode() == ISD::SUB &&
28214 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
28215 SelectOp2.getOperand(1) == SelectOp1))
28218 // The first operand of SetCC is the first operand of the select, which is the
28219 // difference between the two input vectors.
28220 if (SetCC.getOperand(0) != SelectOp1)
28223 // The second operand of the comparison can be either -1 or 0.
28224 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
28225 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
28228 // The first operand of the select is the difference between the two input
28230 if (SelectOp1.getOpcode() != ISD::SUB)
28233 Op0 = SelectOp1.getOperand(0);
28234 Op1 = SelectOp1.getOperand(1);
28236 // Check if the operands of the sub are zero-extended from vectors of i8.
28237 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
28238 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
28239 Op1.getOpcode() != ISD::ZERO_EXTEND ||
28240 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
28246 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
28248 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
28249 const SDValue &Zext1, const SDLoc &DL) {
28251 // Find the appropriate width for the PSADBW.
28252 EVT InVT = Zext0.getOperand(0).getValueType();
28253 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
28255 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
28256 // fill in the missing vector elements with 0.
28257 unsigned NumConcat = RegSize / InVT.getSizeInBits();
28258 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
28259 Ops[0] = Zext0.getOperand(0);
28260 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
28261 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28262 Ops[0] = Zext1.getOperand(0);
28263 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28265 // Actually build the SAD
28266 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
28267 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
28270 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
28271 const X86Subtarget &Subtarget) {
28272 // PSADBW is only supported on SSE2 and up.
28273 if (!Subtarget.hasSSE2())
28276 // Verify the type we're extracting from is appropriate
28277 // TODO: There's nothing special about i32, any integer type above i16 should
28278 // work just as well.
28279 EVT VT = Extract->getOperand(0).getValueType();
28280 if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
28283 unsigned RegSize = 128;
28284 if (Subtarget.hasBWI())
28286 else if (Subtarget.hasAVX2())
28289 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
28290 // TODO: We should be able to handle larger vectors by splitting them before
28291 // feeding them into several SADs, and then reducing over those.
28292 if (VT.getSizeInBits() / 4 > RegSize)
28295 // Match shuffle + add pyramid.
28296 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
28298 // If there was a match, we want Root to be a select that is the root of an
28299 // abs-diff pattern.
28300 if (!Root || (Root.getOpcode() != ISD::VSELECT))
28303 // Check whether we have an abs-diff pattern feeding into the select.
28304 SDValue Zext0, Zext1;
28305 if (!detectZextAbsDiff(Root, Zext0, Zext1))
28308 // Create the SAD instruction
28310 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
28312 // If the original vector was wider than 8 elements, sum over the results
28313 // in the SAD vector.
28314 unsigned Stages = Log2_32(VT.getVectorNumElements());
28315 MVT SadVT = SAD.getSimpleValueType();
28317 unsigned SadElems = SadVT.getVectorNumElements();
28319 for(unsigned i = Stages - 3; i > 0; --i) {
28320 SmallVector<int, 16> Mask(SadElems, -1);
28321 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
28322 Mask[j] = MaskEnd + j;
28325 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
28326 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
28330 // Return the lowest i32.
28331 MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
28332 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
28333 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
28334 Extract->getOperand(1));
28337 /// Detect vector gather/scatter index generation and convert it from being a
28338 /// bunch of shuffles and extracts into a somewhat faster sequence.
28339 /// For i686, the best sequence is apparently storing the value and loading
28340 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
28341 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
28342 TargetLowering::DAGCombinerInfo &DCI,
28343 const X86Subtarget &Subtarget) {
28344 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
28347 SDValue InputVector = N->getOperand(0);
28348 SDLoc dl(InputVector);
28349 // Detect mmx to i32 conversion through a v2i32 elt extract.
28350 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
28351 N->getValueType(0) == MVT::i32 &&
28352 InputVector.getValueType() == MVT::v2i32 &&
28353 isa<ConstantSDNode>(N->getOperand(1)) &&
28354 N->getConstantOperandVal(1) == 0) {
28355 SDValue MMXSrc = InputVector.getOperand(0);
28357 // The bitcast source is a direct mmx result.
28358 if (MMXSrc.getValueType() == MVT::x86mmx)
28359 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
28362 EVT VT = N->getValueType(0);
28364 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
28365 InputVector.getOpcode() == ISD::BITCAST &&
28366 isa<ConstantSDNode>(InputVector.getOperand(0))) {
28367 uint64_t ExtractedElt =
28368 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
28369 uint64_t InputValue =
28370 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
28371 uint64_t Res = (InputValue >> ExtractedElt) & 1;
28372 return DAG.getConstant(Res, dl, MVT::i1);
28375 // Check whether this extract is the root of a sum of absolute differences
28376 // pattern. This has to be done here because we really want it to happen
28377 // pre-legalization,
28378 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
28381 // Only operate on vectors of 4 elements, where the alternative shuffling
28382 // gets to be more expensive.
28383 if (InputVector.getValueType() != MVT::v4i32)
28386 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
28387 // single use which is a sign-extend or zero-extend, and all elements are
28389 SmallVector<SDNode *, 4> Uses;
28390 unsigned ExtractedElements = 0;
28391 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
28392 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
28393 if (UI.getUse().getResNo() != InputVector.getResNo())
28396 SDNode *Extract = *UI;
28397 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28400 if (Extract->getValueType(0) != MVT::i32)
28402 if (!Extract->hasOneUse())
28404 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
28405 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
28407 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
28410 // Record which element was extracted.
28411 ExtractedElements |=
28412 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
28414 Uses.push_back(Extract);
28417 // If not all the elements were used, this may not be worthwhile.
28418 if (ExtractedElements != 15)
28421 // Ok, we've now decided to do the transformation.
28422 // If 64-bit shifts are legal, use the extract-shift sequence,
28423 // otherwise bounce the vector off the cache.
28424 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28427 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
28428 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
28429 auto &DL = DAG.getDataLayout();
28430 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
28431 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28432 DAG.getConstant(0, dl, VecIdxTy));
28433 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28434 DAG.getConstant(1, dl, VecIdxTy));
28436 SDValue ShAmt = DAG.getConstant(
28437 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
28438 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
28439 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28440 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
28441 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
28442 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28443 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
28445 // Store the value to a temporary stack slot.
28446 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
28447 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
28448 MachinePointerInfo());
28450 EVT ElementType = InputVector.getValueType().getVectorElementType();
28451 unsigned EltSize = ElementType.getSizeInBits() / 8;
28453 // Replace each use (extract) with a load of the appropriate element.
28454 for (unsigned i = 0; i < 4; ++i) {
28455 uint64_t Offset = EltSize * i;
28456 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28457 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
28459 SDValue ScalarAddr =
28460 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
28462 // Load the scalar.
28464 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
28468 // Replace the extracts
28469 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
28470 UE = Uses.end(); UI != UE; ++UI) {
28471 SDNode *Extract = *UI;
28473 SDValue Idx = Extract->getOperand(1);
28474 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
28475 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
28478 // The replacement was made in place; don't return anything.
28482 /// If a vector select has an operand that is -1 or 0, simplify the select to a
28483 /// bitwise logic operation.
28484 static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
28485 const X86Subtarget &Subtarget) {
28486 SDValue Cond = N->getOperand(0);
28487 SDValue LHS = N->getOperand(1);
28488 SDValue RHS = N->getOperand(2);
28489 EVT VT = LHS.getValueType();
28490 EVT CondVT = Cond.getValueType();
28492 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28494 if (N->getOpcode() != ISD::VSELECT)
28497 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28498 // Check if the first operand is all zeros.This situation only
28499 // applies to avx512.
28500 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse()) {
28501 //Invert the cond to not(cond) : xor(op,allones)=not(op)
28502 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28503 DAG.getConstant(1, DL, Cond.getValueType()));
28504 //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
28505 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
28507 assert(CondVT.isVector() && "Vector select expects a vector selector!");
28509 // To use the condition operand as a bitwise mask, it must have elements that
28510 // are the same size as the select elements. Ie, the condition operand must
28511 // have already been promoted from the IR select condition type <N x i1>.
28512 // Don't check if the types themselves are equal because that excludes
28513 // vector floating-point selects.
28514 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
28517 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
28518 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
28520 // Try to invert the condition if true value is not all 1s and false value is
28522 if (!TValIsAllOnes && !FValIsAllZeros &&
28523 // Check if the selector will be produced by CMPP*/PCMP*.
28524 Cond.getOpcode() == ISD::SETCC &&
28525 // Check if SETCC has already been promoted.
28526 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
28528 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28529 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
28531 if (TValIsAllZeros || FValIsAllOnes) {
28532 SDValue CC = Cond.getOperand(2);
28533 ISD::CondCode NewCC =
28534 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
28535 Cond.getOperand(0).getValueType().isInteger());
28536 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
28538 std::swap(LHS, RHS);
28539 TValIsAllOnes = FValIsAllOnes;
28540 FValIsAllZeros = TValIsAllZeros;
28544 if (!TValIsAllOnes && !FValIsAllZeros)
28548 if (TValIsAllOnes && FValIsAllZeros)
28550 else if (TValIsAllOnes)
28551 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
28552 else if (FValIsAllZeros)
28553 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
28555 return DAG.getBitcast(VT, Ret);
28558 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
28559 SDValue Cond = N->getOperand(0);
28560 SDValue LHS = N->getOperand(1);
28561 SDValue RHS = N->getOperand(2);
28564 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
28565 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
28566 if (!TrueC || !FalseC)
28569 // Don't do this for crazy integer types.
28570 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
28573 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
28574 // so that TrueC (the true value) is larger than FalseC.
28575 bool NeedsCondInvert = false;
28576 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
28577 // Efficiently invertible.
28578 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
28579 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
28580 isa<ConstantSDNode>(Cond.getOperand(1))))) {
28581 NeedsCondInvert = true;
28582 std::swap(TrueC, FalseC);
28585 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
28586 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
28587 if (NeedsCondInvert) // Invert the condition if needed.
28588 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28589 DAG.getConstant(1, DL, Cond.getValueType()));
28591 // Zero extend the condition if needed.
28592 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
28594 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
28595 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
28596 DAG.getConstant(ShAmt, DL, MVT::i8));
28599 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
28600 if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
28601 if (NeedsCondInvert) // Invert the condition if needed.
28602 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28603 DAG.getConstant(1, DL, Cond.getValueType()));
28605 // Zero extend the condition if needed.
28606 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28607 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28608 SDValue(FalseC, 0));
28611 // Optimize cases that will turn into an LEA instruction. This requires
28612 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
28613 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
28614 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
28615 if (N->getValueType(0) == MVT::i32)
28616 Diff = (unsigned)Diff;
28618 bool isFastMultiplier = false;
28620 switch ((unsigned char)Diff) {
28623 case 1: // result = add base, cond
28624 case 2: // result = lea base( , cond*2)
28625 case 3: // result = lea base(cond, cond*2)
28626 case 4: // result = lea base( , cond*4)
28627 case 5: // result = lea base(cond, cond*4)
28628 case 8: // result = lea base( , cond*8)
28629 case 9: // result = lea base(cond, cond*8)
28630 isFastMultiplier = true;
28635 if (isFastMultiplier) {
28636 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
28637 if (NeedsCondInvert) // Invert the condition if needed.
28638 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28639 DAG.getConstant(1, DL, Cond.getValueType()));
28641 // Zero extend the condition if needed.
28642 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28643 // Scale the condition by the difference.
28645 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
28646 DAG.getConstant(Diff, DL, Cond.getValueType()));
28648 // Add the base if non-zero.
28649 if (FalseC->getAPIntValue() != 0)
28650 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28651 SDValue(FalseC, 0));
28659 // If this is a bitcasted op that can be represented as another type, push the
28660 // the bitcast to the inputs. This allows more opportunities for pattern
28661 // matching masked instructions. This is called when we know that the operation
28662 // is used as one of the inputs of a vselect.
28663 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
28664 TargetLowering::DAGCombinerInfo &DCI) {
28665 // Make sure we have a bitcast.
28666 if (OrigOp.getOpcode() != ISD::BITCAST)
28669 SDValue Op = OrigOp.getOperand(0);
28671 // If the operation is used by anything other than the bitcast, we shouldn't
28672 // do this combine as that would replicate the operation.
28673 if (!Op.hasOneUse())
28676 MVT VT = OrigOp.getSimpleValueType();
28677 MVT EltVT = VT.getVectorElementType();
28678 SDLoc DL(Op.getNode());
28680 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
28682 Op0 = DAG.getBitcast(VT, Op0);
28683 DCI.AddToWorklist(Op0.getNode());
28684 Op1 = DAG.getBitcast(VT, Op1);
28685 DCI.AddToWorklist(Op1.getNode());
28686 DCI.CombineTo(OrigOp.getNode(),
28687 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
28691 unsigned Opcode = Op.getOpcode();
28693 case X86ISD::PALIGNR:
28694 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
28695 if (!VT.is128BitVector())
28697 Opcode = X86ISD::VALIGN;
28699 case X86ISD::VALIGN: {
28700 if (EltVT != MVT::i32 && EltVT != MVT::i64)
28702 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28703 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28704 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
28705 unsigned EltSize = EltVT.getSizeInBits();
28706 // Make sure we can represent the same shift with the new VT.
28707 if ((ShiftAmt % EltSize) != 0)
28709 Imm = ShiftAmt / EltSize;
28710 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28711 DAG.getConstant(Imm, DL, MVT::i8));
28713 case X86ISD::SHUF128: {
28714 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
28716 // Only change element size, not type.
28717 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
28719 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
28722 case ISD::INSERT_SUBVECTOR: {
28723 unsigned EltSize = EltVT.getSizeInBits();
28724 if (EltSize != 32 && EltSize != 64)
28726 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
28727 // Only change element size, not type.
28728 if (VT.isInteger() != OpEltVT.isInteger())
28730 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
28731 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
28732 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
28733 DCI.AddToWorklist(Op0.getNode());
28734 // Op1 needs to be bitcasted to a smaller vector with the same element type.
28735 SDValue Op1 = Op.getOperand(1);
28736 MVT Op1VT = MVT::getVectorVT(EltVT,
28737 Op1.getSimpleValueType().getSizeInBits() / EltSize);
28738 Op1 = DAG.getBitcast(Op1VT, Op1);
28739 DCI.AddToWorklist(Op1.getNode());
28740 DCI.CombineTo(OrigOp.getNode(),
28741 DAG.getNode(Opcode, DL, VT, Op0, Op1,
28742 DAG.getConstant(Imm, DL, MVT::i8)));
28750 /// Do target-specific dag combines on SELECT and VSELECT nodes.
28751 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
28752 TargetLowering::DAGCombinerInfo &DCI,
28753 const X86Subtarget &Subtarget) {
28755 SDValue Cond = N->getOperand(0);
28756 // Get the LHS/RHS of the select.
28757 SDValue LHS = N->getOperand(1);
28758 SDValue RHS = N->getOperand(2);
28759 EVT VT = LHS.getValueType();
28760 EVT CondVT = Cond.getValueType();
28761 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28763 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
28764 // instructions match the semantics of the common C idiom x<y?x:y but not
28765 // x<=y?x:y, because of how they handle negative zero (which can be
28766 // ignored in unsafe-math mode).
28767 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
28768 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
28769 VT != MVT::f80 && VT != MVT::f128 &&
28770 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
28771 (Subtarget.hasSSE2() ||
28772 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
28773 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28775 unsigned Opcode = 0;
28776 // Check for x CC y ? x : y.
28777 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28778 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28782 // Converting this to a min would handle NaNs incorrectly, and swapping
28783 // the operands would cause it to handle comparisons between positive
28784 // and negative zero incorrectly.
28785 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28786 if (!DAG.getTarget().Options.UnsafeFPMath &&
28787 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28789 std::swap(LHS, RHS);
28791 Opcode = X86ISD::FMIN;
28794 // Converting this to a min would handle comparisons between positive
28795 // and negative zero incorrectly.
28796 if (!DAG.getTarget().Options.UnsafeFPMath &&
28797 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28799 Opcode = X86ISD::FMIN;
28802 // Converting this to a min would handle both negative zeros and NaNs
28803 // incorrectly, but we can swap the operands to fix both.
28804 std::swap(LHS, RHS);
28808 Opcode = X86ISD::FMIN;
28812 // Converting this to a max would handle comparisons between positive
28813 // and negative zero incorrectly.
28814 if (!DAG.getTarget().Options.UnsafeFPMath &&
28815 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
28817 Opcode = X86ISD::FMAX;
28820 // Converting this to a max would handle NaNs incorrectly, and swapping
28821 // the operands would cause it to handle comparisons between positive
28822 // and negative zero incorrectly.
28823 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
28824 if (!DAG.getTarget().Options.UnsafeFPMath &&
28825 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
28827 std::swap(LHS, RHS);
28829 Opcode = X86ISD::FMAX;
28832 // Converting this to a max would handle both negative zeros and NaNs
28833 // incorrectly, but we can swap the operands to fix both.
28834 std::swap(LHS, RHS);
28838 Opcode = X86ISD::FMAX;
28841 // Check for x CC y ? y : x -- a min/max with reversed arms.
28842 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
28843 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
28847 // Converting this to a min would handle comparisons between positive
28848 // and negative zero incorrectly, and swapping the operands would
28849 // cause it to handle NaNs incorrectly.
28850 if (!DAG.getTarget().Options.UnsafeFPMath &&
28851 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
28852 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28854 std::swap(LHS, RHS);
28856 Opcode = X86ISD::FMIN;
28859 // Converting this to a min would handle NaNs incorrectly.
28860 if (!DAG.getTarget().Options.UnsafeFPMath &&
28861 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
28863 Opcode = X86ISD::FMIN;
28866 // Converting this to a min would handle both negative zeros and NaNs
28867 // incorrectly, but we can swap the operands to fix both.
28868 std::swap(LHS, RHS);
28872 Opcode = X86ISD::FMIN;
28876 // Converting this to a max would handle NaNs incorrectly.
28877 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28879 Opcode = X86ISD::FMAX;
28882 // Converting this to a max would handle comparisons between positive
28883 // and negative zero incorrectly, and swapping the operands would
28884 // cause it to handle NaNs incorrectly.
28885 if (!DAG.getTarget().Options.UnsafeFPMath &&
28886 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
28887 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
28889 std::swap(LHS, RHS);
28891 Opcode = X86ISD::FMAX;
28894 // Converting this to a max would handle both negative zeros and NaNs
28895 // incorrectly, but we can swap the operands to fix both.
28896 std::swap(LHS, RHS);
28900 Opcode = X86ISD::FMAX;
28906 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
28909 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
28910 // lowering on KNL. In this case we convert it to
28911 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
28912 // The same situation for all 128 and 256-bit vectors of i8 and i16.
28913 // Since SKX these selects have a proper lowering.
28914 if (Subtarget.hasAVX512() && CondVT.isVector() &&
28915 CondVT.getVectorElementType() == MVT::i1 &&
28916 (VT.is128BitVector() || VT.is256BitVector()) &&
28917 (VT.getVectorElementType() == MVT::i8 ||
28918 VT.getVectorElementType() == MVT::i16) &&
28919 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
28920 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
28921 DCI.AddToWorklist(Cond.getNode());
28922 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
28925 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
28928 // Canonicalize max and min:
28929 // (x > y) ? x : y -> (x >= y) ? x : y
28930 // (x < y) ? x : y -> (x <= y) ? x : y
28931 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
28932 // the need for an extra compare
28933 // against zero. e.g.
28934 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
28936 // testl %edi, %edi
28938 // cmovgl %edi, %eax
28942 // cmovsl %eax, %edi
28943 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
28944 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
28945 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
28946 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28951 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
28952 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
28953 Cond.getOperand(0), Cond.getOperand(1), NewCC);
28954 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
28959 // Early exit check
28960 if (!TLI.isTypeLegal(VT))
28963 // Match VSELECTs into subs with unsigned saturation.
28964 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
28965 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
28966 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
28967 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
28968 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
28970 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
28971 // left side invert the predicate to simplify logic below.
28973 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
28975 CC = ISD::getSetCCInverse(CC, true);
28976 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
28980 if (Other.getNode() && Other->getNumOperands() == 2 &&
28981 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
28982 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
28983 SDValue CondRHS = Cond->getOperand(1);
28985 // Look for a general sub with unsigned saturation first.
28986 // x >= y ? x-y : 0 --> subus x, y
28987 // x > y ? x-y : 0 --> subus x, y
28988 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
28989 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
28990 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
28992 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
28993 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
28994 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
28995 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
28996 // If the RHS is a constant we have to reverse the const
28997 // canonicalization.
28998 // x > C-1 ? x+-C : 0 --> subus x, C
28999 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
29000 CondRHSConst->getAPIntValue() ==
29001 (-OpRHSConst->getAPIntValue() - 1))
29002 return DAG.getNode(
29003 X86ISD::SUBUS, DL, VT, OpLHS,
29004 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
29006 // Another special case: If C was a sign bit, the sub has been
29007 // canonicalized into a xor.
29008 // FIXME: Would it be better to use computeKnownBits to determine
29009 // whether it's safe to decanonicalize the xor?
29010 // x s< 0 ? x^C : 0 --> subus x, C
29011 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
29012 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
29013 OpRHSConst->getAPIntValue().isSignBit())
29014 // Note that we have to rebuild the RHS constant here to ensure we
29015 // don't rely on particular values of undef lanes.
29016 return DAG.getNode(
29017 X86ISD::SUBUS, DL, VT, OpLHS,
29018 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
29023 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
29026 // If this is a *dynamic* select (non-constant condition) and we can match
29027 // this node with one of the variable blend instructions, restructure the
29028 // condition so that the blends can use the high bit of each element and use
29029 // SimplifyDemandedBits to simplify the condition operand.
29030 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
29031 !DCI.isBeforeLegalize() &&
29032 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
29033 unsigned BitWidth = Cond.getScalarValueSizeInBits();
29035 // Don't optimize vector selects that map to mask-registers.
29039 // We can only handle the cases where VSELECT is directly legal on the
29040 // subtarget. We custom lower VSELECT nodes with constant conditions and
29041 // this makes it hard to see whether a dynamic VSELECT will correctly
29042 // lower, so we both check the operation's status and explicitly handle the
29043 // cases where a *dynamic* blend will fail even though a constant-condition
29044 // blend could be custom lowered.
29045 // FIXME: We should find a better way to handle this class of problems.
29046 // Potentially, we should combine constant-condition vselect nodes
29047 // pre-legalization into shuffles and not mark as many types as custom
29049 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
29051 // FIXME: We don't support i16-element blends currently. We could and
29052 // should support them by making *all* the bits in the condition be set
29053 // rather than just the high bit and using an i8-element blend.
29054 if (VT.getVectorElementType() == MVT::i16)
29056 // Dynamic blending was only available from SSE4.1 onward.
29057 if (VT.is128BitVector() && !Subtarget.hasSSE41())
29059 // Byte blends are only available in AVX2
29060 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
29063 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
29064 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
29066 APInt KnownZero, KnownOne;
29067 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
29068 DCI.isBeforeLegalizeOps());
29069 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
29070 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
29072 // If we changed the computation somewhere in the DAG, this change
29073 // will affect all users of Cond.
29074 // Make sure it is fine and update all the nodes so that we do not
29075 // use the generic VSELECT anymore. Otherwise, we may perform
29076 // wrong optimizations as we messed up with the actual expectation
29077 // for the vector boolean values.
29078 if (Cond != TLO.Old) {
29079 // Check all uses of that condition operand to check whether it will be
29080 // consumed by non-BLEND instructions, which may depend on all bits are
29082 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29084 if (I->getOpcode() != ISD::VSELECT)
29085 // TODO: Add other opcodes eventually lowered into BLEND.
29088 // Update all the users of the condition, before committing the change,
29089 // so that the VSELECT optimizations that expect the correct vector
29090 // boolean value will not be triggered.
29091 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29093 DAG.ReplaceAllUsesOfValueWith(
29095 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
29096 Cond, I->getOperand(1), I->getOperand(2)));
29097 DCI.CommitTargetLoweringOpt(TLO);
29100 // At this point, only Cond is changed. Change the condition
29101 // just for N to keep the opportunity to optimize all other
29102 // users their own way.
29103 DAG.ReplaceAllUsesOfValueWith(
29105 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
29106 TLO.New, N->getOperand(1), N->getOperand(2)));
29111 // Look for vselects with LHS/RHS being bitcasted from an operation that
29112 // can be executed on another type. Push the bitcast to the inputs of
29113 // the operation. This exposes opportunities for using masking instructions.
29114 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
29115 CondVT.getVectorElementType() == MVT::i1) {
29116 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
29117 return SDValue(N, 0);
29118 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
29119 return SDValue(N, 0);
29126 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
29128 /// (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
29129 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
29130 /// Note that this is only legal for some op/cc combinations.
29131 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
29132 SelectionDAG &DAG) {
29133 // This combine only operates on CMP-like nodes.
29134 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29135 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29138 // This only applies to variations of the common case:
29139 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
29140 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
29141 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
29142 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
29143 // Using the proper condcodes (see below), overflow is checked for.
29145 // FIXME: We can generalize both constraints:
29146 // - XOR/OR/AND (if they were made to survive AtomicExpand)
29148 // if the result is compared.
29150 SDValue CmpLHS = Cmp.getOperand(0);
29151 SDValue CmpRHS = Cmp.getOperand(1);
29153 if (!CmpLHS.hasOneUse())
29156 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
29157 if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
29160 const unsigned Opc = CmpLHS.getOpcode();
29162 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
29165 SDValue OpRHS = CmpLHS.getOperand(2);
29166 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
29170 APInt Addend = OpRHSC->getAPIntValue();
29171 if (Opc == ISD::ATOMIC_LOAD_SUB)
29174 if (CC == X86::COND_S && Addend == 1)
29176 else if (CC == X86::COND_NS && Addend == 1)
29178 else if (CC == X86::COND_G && Addend == -1)
29180 else if (CC == X86::COND_LE && Addend == -1)
29185 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
29186 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
29187 DAG.getUNDEF(CmpLHS.getValueType()));
29188 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
29192 // Check whether a boolean test is testing a boolean value generated by
29193 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
29196 // Simplify the following patterns:
29197 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
29198 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
29199 // to (Op EFLAGS Cond)
29201 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
29202 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
29203 // to (Op EFLAGS !Cond)
29205 // where Op could be BRCOND or CMOV.
29207 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
29208 // This combine only operates on CMP-like nodes.
29209 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29210 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29213 // Quit if not used as a boolean value.
29214 if (CC != X86::COND_E && CC != X86::COND_NE)
29217 // Check CMP operands. One of them should be 0 or 1 and the other should be
29218 // an SetCC or extended from it.
29219 SDValue Op1 = Cmp.getOperand(0);
29220 SDValue Op2 = Cmp.getOperand(1);
29223 const ConstantSDNode* C = nullptr;
29224 bool needOppositeCond = (CC == X86::COND_E);
29225 bool checkAgainstTrue = false; // Is it a comparison against 1?
29227 if ((C = dyn_cast<ConstantSDNode>(Op1)))
29229 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
29231 else // Quit if all operands are not constants.
29234 if (C->getZExtValue() == 1) {
29235 needOppositeCond = !needOppositeCond;
29236 checkAgainstTrue = true;
29237 } else if (C->getZExtValue() != 0)
29238 // Quit if the constant is neither 0 or 1.
29241 bool truncatedToBoolWithAnd = false;
29242 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
29243 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
29244 SetCC.getOpcode() == ISD::TRUNCATE ||
29245 SetCC.getOpcode() == ISD::AND) {
29246 if (SetCC.getOpcode() == ISD::AND) {
29248 if (isOneConstant(SetCC.getOperand(0)))
29250 if (isOneConstant(SetCC.getOperand(1)))
29254 SetCC = SetCC.getOperand(OpIdx);
29255 truncatedToBoolWithAnd = true;
29257 SetCC = SetCC.getOperand(0);
29260 switch (SetCC.getOpcode()) {
29261 case X86ISD::SETCC_CARRY:
29262 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
29263 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
29264 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
29265 // truncated to i1 using 'and'.
29266 if (checkAgainstTrue && !truncatedToBoolWithAnd)
29268 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
29269 "Invalid use of SETCC_CARRY!");
29271 case X86ISD::SETCC:
29272 // Set the condition code or opposite one if necessary.
29273 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
29274 if (needOppositeCond)
29275 CC = X86::GetOppositeBranchCondition(CC);
29276 return SetCC.getOperand(1);
29277 case X86ISD::CMOV: {
29278 // Check whether false/true value has canonical one, i.e. 0 or 1.
29279 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
29280 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
29281 // Quit if true value is not a constant.
29284 // Quit if false value is not a constant.
29286 SDValue Op = SetCC.getOperand(0);
29287 // Skip 'zext' or 'trunc' node.
29288 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
29289 Op.getOpcode() == ISD::TRUNCATE)
29290 Op = Op.getOperand(0);
29291 // A special case for rdrand/rdseed, where 0 is set if false cond is
29293 if ((Op.getOpcode() != X86ISD::RDRAND &&
29294 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
29297 // Quit if false value is not the constant 0 or 1.
29298 bool FValIsFalse = true;
29299 if (FVal && FVal->getZExtValue() != 0) {
29300 if (FVal->getZExtValue() != 1)
29302 // If FVal is 1, opposite cond is needed.
29303 needOppositeCond = !needOppositeCond;
29304 FValIsFalse = false;
29306 // Quit if TVal is not the constant opposite of FVal.
29307 if (FValIsFalse && TVal->getZExtValue() != 1)
29309 if (!FValIsFalse && TVal->getZExtValue() != 0)
29311 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
29312 if (needOppositeCond)
29313 CC = X86::GetOppositeBranchCondition(CC);
29314 return SetCC.getOperand(3);
29321 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
29323 /// (X86or (X86setcc) (X86setcc))
29324 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
///
/// If \p Cond is an X86ISD::CMP, its second operand is tested for being a null
/// constant and the match continues on its first operand. The two operands of
/// the matched node must both be X86ISD::SETCC nodes whose flags operand
/// (operand 1) is the same value. On a match, \p CC0 and \p CC1 receive the
/// condition codes (operand 0) of the two SETCCs and \p Flags receives the
/// flags value taken from the first SETCC.
29325 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
29326 X86::CondCode &CC1, SDValue &Flags,
29328 if (Cond->getOpcode() == X86ISD::CMP) {
// The compare's right-hand side is checked against the null constant before
// the compare is looked through.
29329 if (!isNullConstant(Cond->getOperand(1)))
29332 Cond = Cond->getOperand(0);
29337 SDValue SetCC0, SetCC1;
29338 switch (Cond->getOpcode()) {
29339 default: return false;
// Both inputs of the matched binary node are candidate SETCCs.
29346 SetCC0 = Cond->getOperand(0);
29347 SetCC1 = Cond->getOperand(1);
29351 // Make sure we have SETCC nodes, using the same flags value.
29352 if (SetCC0.getOpcode() != X86ISD::SETCC ||
29353 SetCC1.getOpcode() != X86ISD::SETCC ||
29354 SetCC0->getOperand(1) != SetCC1->getOperand(1))
// Report both condition codes and the common flags producer to the caller.
29357 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
29358 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
29359 Flags = SetCC0->getOperand(1);
29363 /// Optimize an EFLAGS definition used according to the condition code \p CC
29364 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
29365 /// uses of chain values.
///
/// \p CC is passed by reference to both helpers below, so either of them may
/// rewrite it. checkBoolTestSetCCCombine() is consulted first;
/// combineSetCCAtomicArith() supplies the final return value.
29366 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
29367 SelectionDAG &DAG) {
// First strategy: the boolean-test combine on the flags producer.
29368 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
// Second strategy: whatever combineSetCCAtomicArith() yields is returned
// unchanged.
29370 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
29373 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
///
/// Operand layout as read below: operand 0 is the value used when the
/// condition is false, operand 1 the value used when it is true, operand 2 the
/// X86 condition code constant and operand 3 the condition (flags) value.
///
/// The combines attempted, in order, are:
///  - folding an equality test on BSR/BSF flags when the input is known to be
///    non-zero;
///  - simplifying the flags/condition-code pair via combineSetCCEFLAGS();
///  - turning a select between two integer constants into arithmetic on a
///    zero-extended SETCC (shift, add, or multiply-and-add);
///  - replacing a constant arm that equals the compared-against constant by
///    the compared value itself;
///  - splitting an AND/OR of two SETCCs on the same flags into two CMOVs.
29374 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
29375 TargetLowering::DAGCombinerInfo &DCI,
29376 const X86Subtarget &Subtarget) {
29379 // If the flag operand isn't dead, don't touch this CMOV.
29380 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
// Unpack the CMOV operands: false value, true value, condition code, flags.
29383 SDValue FalseOp = N->getOperand(0);
29384 SDValue TrueOp = N->getOperand(1);
29385 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
29386 SDValue Cond = N->getOperand(3);
29388 if (CC == X86::COND_E || CC == X86::COND_NE) {
29389 switch (Cond.getOpcode()) {
29393 // If the operand of BSR / BSF is proven never zero, then ZF cannot be set.
29394 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
// With ZF known clear, COND_E is never taken and COND_NE always is.
29395 return (CC == X86::COND_E) ? FalseOp : TrueOp;
29399 // Try to simplify the EFLAGS and condition code operands.
29400 // We can't always do this as FCMOV only supports a subset of X86 cond.
29401 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
// f80 selects are only rebuilt when the (possibly updated) condition code is
// accepted by hasFPCMov().
29402 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
29403 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
29405 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29409 // If this is a select between two integer constants, try to do some
29410 // optimizations. Note that the operands are ordered the opposite of SELECT
29412 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
29413 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
29414 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
29415 // larger than FalseC (the false value).
// The comparison is unsigned (APInt::ult); swapping the arms requires
// inverting the condition code as well.
29416 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
29417 CC = X86::GetOppositeBranchCondition(CC);
29418 std::swap(TrueC, FalseC);
29419 std::swap(TrueOp, FalseOp);
29422 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
29423 // This is efficient for any integer data type (including i8/i16) and
29425 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29426 Cond = getSETCC(CC, Cond, DL, DAG);
29428 // Zero extend the condition if needed.
29429 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
// The shift amount is log2 of the power-of-two true value, emitted as an i8
// constant.
29431 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29432 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
29433 DAG.getConstant(ShAmt, DL, MVT::i8));
29434 if (N->getNumValues() == 2) // Dead flag value?
29435 return DCI.CombineTo(N, Cond, SDValue());
29439 // Optimize Cond ? cst+1 : cst -> zext(setcc(C))+cst. This is efficient
29440 // for any integer data type, including i8/i16.
29441 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
29442 Cond = getSETCC(CC, Cond, DL, DAG);
29444 // Zero extend the condition if needed.
29445 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
29446 FalseC->getValueType(0), Cond);
29447 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29448 SDValue(FalseC, 0));
29450 if (N->getNumValues() == 2) // Dead flag value?
29451 return DCI.CombineTo(N, Cond, SDValue());
29455 // Optimize cases that will turn into an LEA instruction. This requires
29456 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29457 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
// The difference is computed in 64 bits; for an i32 result it is truncated
// to its low 32 bits.
29458 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
29459 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
29461 bool isFastMultiplier = false;
29463 switch ((unsigned char)Diff) {
29465 case 1: // result = add base, cond
29466 case 2: // result = lea base( , cond*2)
29467 case 3: // result = lea base(cond, cond*2)
29468 case 4: // result = lea base( , cond*4)
29469 case 5: // result = lea base(cond, cond*4)
29470 case 8: // result = lea base( , cond*8)
29471 case 9: // result = lea base(cond, cond*8)
29472 isFastMultiplier = true;
29477 if (isFastMultiplier) {
// Recompute the difference as an APInt (shadowing the uint64_t above) so the
// multiplier constant is built at the width of the operands.
29478 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
29479 Cond = getSETCC(CC, Cond, DL ,DAG);
29480 // Zero extend the condition if needed.
29481 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
29483 // Scale the condition by the difference.
29485 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29486 DAG.getConstant(Diff, DL, Cond.getValueType()));
29488 // Add the base if non-zero.
29489 if (FalseC->getAPIntValue() != 0)
29490 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29491 SDValue(FalseC, 0));
29492 if (N->getNumValues() == 2) // Dead flag value?
29493 return DCI.CombineTo(N, Cond, SDValue());
29500 // Handle these cases:
29501 // (select (x != c), e, c) -> (select (x != c), e, x),
29502 // (select (x == c), c, e) -> (select (x == c), x, e)
29503 // where the c is an integer constant, and the "select" is the combination
29504 // of CMOV and CMP.
29506 // The rationale for this change is that the conditional-move from a constant
29507 // needs two instructions, however, conditional-move from a register needs
29508 // only one instruction.
29510 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
29511 // some instruction-combining opportunities. This opt needs to be
29512 // postponed as late as possible.
29514 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
29515 // the DCI.xxxx conditions are provided to postpone the optimization as
29516 // late as possible.
29518 ConstantSDNode *CmpAgainst = nullptr;
// Require a CMP/SUB whose right operand is a constant (captured in
// CmpAgainst) and whose left operand is not a constant.
29519 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
29520 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
29521 !isa<ConstantSDNode>(Cond.getOperand(0))) {
// Normalize the COND_NE form to COND_E by inverting the condition and
// swapping the arms, so only one pattern has to be handled below.
29523 if (CC == X86::COND_NE &&
29524 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
29525 CC = X86::GetOppositeBranchCondition(CC);
29526 std::swap(TrueOp, FalseOp);
29529 if (CC == X86::COND_E &&
29530 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
// The constant true arm is replaced by the compared value itself
// (Cond's operand 0).
29531 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
29532 DAG.getConstant(CC, DL, MVT::i8), Cond };
29533 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
29538 // Fold and/or of setcc's to double CMOV:
29539 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
29540 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
29542 // This combine lets us generate:
29543 // cmovcc1 (jcc1 if we don't have CMOV)
29549 // cmovne (jne if we don't have CMOV)
29550 // When we can't use the CMOV instruction, it might increase branch
29552 // When we can use CMOV, or when there is no mispredict, this improves
29553 // throughput and reduces register pressure.
29555 if (CC == X86::COND_NE) {
29557 X86::CondCode CC0, CC1;
29559 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
// Swapping the arms and inverting both condition codes corresponds to the
// AND form of the rewrite shown in the comment above.
29561 std::swap(FalseOp, TrueOp);
29562 CC0 = X86::GetOppositeBranchCondition(CC0);
29563 CC1 = X86::GetOppositeBranchCondition(CC1);
29566 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
29568 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
// The second CMOV selects between the first CMOV's result and TrueOp.
29569 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
29570 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
// Redirect users of the original node's second result (value 1) to the
// second result of the new outer CMOV.
29571 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
29579 /// Different mul shrinking modes.
///
/// The suffix gives the signedness and bit width of the value range that both
/// multiply operands are known to fit in:
///   MULS8  - signed 8-bit range (-128 to 127)
///   MULU8  - unsigned 8-bit range (0 to 255)
///   MULS16 - signed 16-bit range (-32768 to 32767)
///   MULU16 - unsigned 16-bit range (0 to 65535)
29580 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
/// Classify the operand value ranges of the two-operand vector multiply \p N
/// in order to pick a ShrinkMode for \p Mode.
///
/// For each operand the number of known sign bits is collected, together with
/// a flag telling whether the operand is known to be non-negative. The minimum
/// sign-bit count over both operands, combined with the "all non-negative"
/// flag, is then compared against the thresholds for the 8-bit and 16-bit
/// signed/unsigned modes.
29582 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
29583 EVT VT = N->getOperand(0).getValueType();
// The scalar width of the first operand's type is tested against 32 bits.
29584 if (VT.getScalarSizeInBits() != 32)
29587 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
// Per-operand facts, indexed by operand number.
29588 unsigned SignBits[2] = {1, 1};
29589 bool IsPositive[2] = {false, false};
29590 for (unsigned i = 0; i < 2; i++) {
29591 SDValue Opd = N->getOperand(i);
29593 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
29594 // compute signbits for it separately.
29595 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
29596 // For anyextend, it is safe to assume an appropriate number of leading
// The source element type (i8 or another width tested below) decides the
// sign-bit count for the any-extended operand.
29598 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
29600 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
29605 IsPositive[i] = true;
29606 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
29607 // All the operands of BUILD_VECTOR need to be int constant.
29608 // Find the smallest value range which all the operands belong to.
// Start optimistic; any negative element clears the flag below.
29610 IsPositive[i] = true;
29611 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
29612 if (SubOp.isUndef())
29614 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
29617 APInt IntVal = CN->getAPIntValue();
29618 if (IntVal.isNegative())
29619 IsPositive[i] = false;
// Keep the smallest sign-bit count seen over all constant elements.
29620 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
// Remaining operand kinds: ask the DAG for the sign-bit count; a
// ZERO_EXTEND is additionally marked as non-negative.
29623 SignBits[i] = DAG.ComputeNumSignBits(Opd);
29624 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
29625 IsPositive[i] = true;
// Combine the per-operand facts: both operands must satisfy a mode's
// requirements for it to be chosen.
29629 bool AllPositive = IsPositive[0] && IsPositive[1];
29630 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
29631 // When ranges are from -128 ~ 127, use MULS8 mode.
29632 if (MinSignBits >= 25)
29634 // When ranges are from 0 ~ 255, use MULU8 mode.
29635 else if (AllPositive && MinSignBits >= 24)
29637 // When ranges are from -32768 ~ 32767, use MULS16 mode.
29638 else if (MinSignBits >= 17)
29640 // When ranges are from 0 ~ 65535, use MULU16 mode.
29641 else if (AllPositive && MinSignBits >= 16)
29648 /// When the operands of vector mul are extended from smaller size values,
29649 /// like i8 and i16, the type of mul may be shrunk to generate more
29650 /// efficient code. Two typical patterns are handled:
29652 /// %2 = sext/zext <N x i8> %1 to <N x i32>
29653 /// %4 = sext/zext <N x i8> %3 to <N x i32>
29654 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29655 /// %5 = mul <N x i32> %2, %4
29658 /// %2 = zext/sext <N x i16> %1 to <N x i32>
29659 /// %4 = zext/sext <N x i16> %3 to <N x i32>
29660 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
29661 /// %5 = mul <N x i32> %2, %4
29663 /// There are four mul shrinking modes:
29664 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
29665 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
29666 /// generate pmullw+sext32 for it (MULS8 mode).
29667 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
29668 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
29669 /// generate pmullw+zext32 for it (MULU8 mode).
29670 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
29671 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
29672 /// generate pmullw+pmulhw for it (MULS16 mode).
29673 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
29674 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
29675 /// generate pmullw+pmulhuw for it (MULU16 mode).
29676 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
29677 const X86Subtarget &Subtarget) {
29678 // Check for legality
29679 // pmullw/pmulhw are not supported by SSE.
29680 if (!Subtarget.hasSSE2())
29683 // Check for profitability
29684 // pmulld is supported since SSE41. It is better to use pmulld
29685 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
29687 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
29688 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
// Ask canReduceVMulWidth() for the shrinking mode to use.
29692 if (!canReduceVMulWidth(N, DAG, Mode))
29696 SDValue N0 = N->getOperand(0);
29697 SDValue N1 = N->getOperand(1);
29698 EVT VT = N->getOperand(0).getValueType();
// OpsVT is the i16 vector type that fills one 128-bit register
// (RegSize / 16 lanes).
29699 unsigned RegSize = 128;
29700 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
29702 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
29703 // Shrink the operands of mul.
29704 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
29705 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
// Case 1: the narrowed vector has at least as many lanes as OpsVT.
29707 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
29708 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
29709 // lower part is needed.
29710 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
29711 if (Mode == MULU8 || Mode == MULS8) {
// The 8-bit modes extend the low product according to the mode's
// signedness.
29712 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
// Each interleaved half below is reinterpreted as an i32 vector with half as
// many lanes as VT.
29715 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
29716 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
29717 // the higher part is also needed.
29718 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29719 ReducedVT, NewN0, NewN1);
29721 // Repack the lower part and higher part result of mul into a wider
29723 // Generate shuffle functioning as punpcklwd.
// Mask indices below VT's element count pick lanes of MulLo; indices offset
// by the element count pick lanes of MulHi.
29724 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
29725 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29726 ShuffleMask[2 * i] = i;
29727 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
29730 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29731 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
29732 // Generate shuffle functioning as punpckhwd.
29733 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
29734 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
29735 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
29738 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
29739 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
// The two i32 halves are concatenated back into the original type.
29740 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
29743 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
29744 // to legalize the mul explicitly because implicit legalization for type
29745 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
29746 // instructions which will not exist when we explicitly legalize it by
29747 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
29748 // <4 x i16> undef).
29750 // Legalize the operands of mul.
29751 // FIXME: We may be able to handle non-concatenated vectors by insertion.
29752 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
// The narrowed type's size is tested for evenly dividing the register size.
29753 if ((RegSize % ReducedSizeInBits) != 0)
// Ops starts out as all-undef pieces of ReducedVT, enough to fill one
// register, and is used to build both widened operands.
29756 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
29757 DAG.getUNDEF(ReducedVT));
29759 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29761 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
29763 if (Mode == MULU8 || Mode == MULS8) {
29764 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
29766 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29768 // convert the type of mul result to VT.
29769 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29770 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
29771 : ISD::SIGN_EXTEND_VECTOR_INREG,
// Only the leading VT-sized subvector (index 0) of the register-wide result
// is returned.
29773 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29774 DAG.getIntPtrConstant(0, DL));
29776 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
29777 // MULU16/MULS16, both parts are needed.
29778 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
29779 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
29780 OpsVT, NewN0, NewN1);
29782 // Repack the lower part and higher part result of mul into a wider
29783 // result. Make sure the type of mul result is VT.
29784 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
29785 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
29786 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
29787 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
29788 DAG.getIntPtrConstant(0, DL));
29793 /// Optimize a single multiply with constant into two operations in order to
29794 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
///
/// Vector multiplies seen before legalization are handed to reduceVMULWidth()
/// instead. For scalars, the constant multiplier is first tried as a product
/// of 3, 5 or 9 with a power of two or another of 3, 5, 9; failing that, the
/// forms 2^N + 1 and 2^N - 1 are tried and expressed as shift plus add/sub.
29795 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
29796 TargetLowering::DAGCombinerInfo &DCI,
29797 const X86Subtarget &Subtarget) {
29798 EVT VT = N->getValueType(0);
29799 if (DCI.isBeforeLegalize() && VT.isVector())
29800 return reduceVMULWidth(N, DAG, Subtarget);
29802 // An imul is usually smaller than the alternative sequence.
29803 if (DAG.getMachineFunction().getFunction()->optForMinSize())
29806 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
// The scalar rewrite is restricted to i32 and i64.
29809 if (VT != MVT::i64 && VT != MVT::i32)
// The multiplier must be a constant on the right-hand side.
29812 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
29815 uint64_t MulAmt = C->getZExtValue();
// Powers of two and the multipliers 3, 5 and 9 are tested for up front,
// before any factorization is attempted.
29816 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
// Try to split MulAmt into two factors, testing divisibility by 9, then 5,
// then 3; MulAmt2 is the remaining cofactor.
29819 uint64_t MulAmt1 = 0;
29820 uint64_t MulAmt2 = 0;
29821 if ((MulAmt % 9) == 0) {
29823 MulAmt2 = MulAmt / 9;
29824 } else if ((MulAmt % 5) == 0) {
29826 MulAmt2 = MulAmt / 5;
29827 } else if ((MulAmt % 3) == 0) {
29829 MulAmt2 = MulAmt / 3;
29835 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
29837 if (isPowerOf2_64(MulAmt2) &&
29838 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
29839 // If second multiplier is pow2, issue it first. We want the multiply by
29840 // 3, 5, or 9 to be folded into the addressing mode unless the lone use
29842 std::swap(MulAmt1, MulAmt2);
// First factor: a shift for a power of two, otherwise an X86ISD::MUL_IMM.
29844 if (isPowerOf2_64(MulAmt1))
29845 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
29846 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
29848 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
29849 DAG.getConstant(MulAmt1, DL, VT));
// Second factor, applied to the result of the first.
29851 if (isPowerOf2_64(MulAmt2))
29852 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
29853 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
29855 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
29856 DAG.getConstant(MulAmt2, DL, VT));
// MulAmt - 1 and MulAmt + 1 are computed below; the assert documents that
// the wrap-around values (0 and the all-ones value) cannot reach this point.
29860 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
29861 && "Both cases that could cause potential overflows should have "
29862 "already been handled.");
29863 if (isPowerOf2_64(MulAmt - 1))
29864 // (mul x, 2^N + 1) => (add (shl x, N), x)
29865 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
29866 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
29867 DAG.getConstant(Log2_64(MulAmt - 1), DL,
29870 else if (isPowerOf2_64(MulAmt + 1))
29871 // (mul x, 2^N - 1) => (sub (shl x, N), x)
29872 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
29874 DAG.getConstant(Log2_64(MulAmt + 1),
29875 DL, MVT::i8)), N->getOperand(0));
29879 // Do not add new nodes to DAG combiner worklist.
29880 DCI.CombineTo(N, NewMul, false);
/// Combine an ISD::SHL node \p N.
///
/// Two rewrites are attempted: folding a constant shift of a masked
/// SETCC_CARRY value into the mask itself, and turning a vector shift by a
/// splat of one into an ADD of the value with itself.
29885 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
29886 SDValue N0 = N->getOperand(0);
29887 SDValue N1 = N->getOperand(1);
29888 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
29889 EVT VT = N0.getValueType();
29891 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
29892 // since the result of setcc_c is all zeros or all ones.
29893 if (VT.isInteger() && !VT.isVector() &&
29894 N1C && N0.getOpcode() == ISD::AND &&
29895 N0.getOperand(1).getOpcode() == ISD::Constant) {
29896 SDValue N00 = N0.getOperand(0);
// Mask becomes c1 << c2, the constant the folded AND would use.
29897 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
29898 const APInt &ShAmt = N1C->getAPIntValue();
29899 Mask = Mask.shl(ShAmt);
29900 bool MaskOK = false;
29901 // We can handle cases concerning bit-widening nodes containing setcc_c if
29902 // we carefully interrogate the mask to make sure we are semantics
29904 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
29905 // of the underlying setcc_c operation if the setcc_c was zero extended.
29906 // Consider the following example:
29907 // zext(setcc_c) -> i32 0x0000FFFF
29908 // c1 -> i32 0x0000FFFF
29909 // c2 -> i32 0x00000001
29910 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
29911 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
29912 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
29914 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
29915 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
29917 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
29918 N00.getOpcode() == ISD::ANY_EXTEND) &&
29919 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
// For zero/any-extended inputs the shifted mask must still fit within the
// bit width of the underlying SETCC_CARRY value.
29920 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
29922 if (MaskOK && Mask != 0) {
29924 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
29928 // Hardware support for vector shifts is sparse which makes us scalarize the
29929 // vector operations in many cases. Also, on sandybridge ADD is faster than
29931 // (shl V, 1) -> add V,V
29932 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
29933 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
29934 assert(N0.getValueType().isVector() && "Invalid vector shift type");
29935 // We shift all of the values by one. In many cases we do not have
29936 // hardware support for this operation. This is better expressed as an ADD
29938 if (N1SplatC->getAPIntValue() == 1)
29939 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
/// Combine an ISD::SRA node \p N whose input is a single-use constant SHL.
///
/// When the SHL amount equals the value width minus the width of some integer
/// type of 8 to 64 bits, the shift pair is replaced by a SIGN_EXTEND_INREG of
/// that type followed by a single residual shift (SHL or SRA, depending on the
/// sign of the remaining amount).
29945 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
29946 SDValue N0 = N->getOperand(0);
29947 SDValue N1 = N->getOperand(1);
29948 EVT VT = N0.getValueType();
29949 unsigned Size = VT.getSizeInBits();
29951 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
29952 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
29953 // into (sra, (sext (a), SarConst - [56,48,32,24,16]))
29954 // depending on sign of (SarConst - [56,48,32,24,16])
29956 // sexts in X86 are MOVs. The MOVs have the same code size
29957 // as above SHIFTs (only SHIFT on 1 has lower code size).
29958 // However the MOVs have 2 advantages to a SHIFT:
29959 // 1. MOVs can write to a register that differs from source
29960 // 2. MOVs accept memory operands
29962 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
29963 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
29964 N0.getOperand(1).getOpcode() != ISD::Constant)
29967 SDValue N00 = N0.getOperand(0);
29968 SDValue N01 = N0.getOperand(1);
29969 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
29970 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
// New shift-amount constants are created with the type of the original SRA
// amount.
29971 EVT CVT = N1.getValueType();
29973 if (SarConst.isNegative())
// Search the integer types for one whose width matches the SHL amount.
29976 for (MVT SVT : MVT::integer_valuetypes()) {
29977 unsigned ShiftSize = SVT.getSizeInBits();
29978 // skipping types without corresponding sext/zext and
29979 // ShlConst that is not one of [56,48,32,24,16]
29980 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
29984 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
// What remains of the SRA amount once the SHL amount has been absorbed by
// the sign extension; its sign selects the residual shift direction.
29985 SarConst = SarConst - (Size - ShiftSize);
29988 else if (SarConst.isNegative())
29989 return DAG.getNode(ISD::SHL, DL, VT, NN,
29990 DAG.getConstant(-SarConst, DL, CVT));
29992 return DAG.getNode(ISD::SRA, DL, VT, NN,
29993 DAG.getConstant(SarConst, DL, CVT));
29998 /// \brief Returns a vector of 0s if the node in input is a vector logical
29999 /// shift by a constant amount which is known to be bigger than or equal
30000 /// to the vector element size in bits.
///
/// The result type is tested against v2i64/v4i32/v8i16 and, when the subtarget
/// reports hasInt256(), against v4i64/v8i32/v16i16. The shift amount must be a
/// BUILD_VECTOR with a constant splat value.
30001 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
30002 const X86Subtarget &Subtarget) {
30003 EVT VT = N->getValueType(0);
30005 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
30006 (!Subtarget.hasInt256() ||
30007 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
30010 SDValue Amt = N->getOperand(1);
30012 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
30013 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
30014 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
// MaxAmount is the element width in bits; shifting by that much or more
// clears every lane.
30015 unsigned MaxAmount =
30016 VT.getSimpleVT().getScalarSizeInBits();
30018 // SSE2/AVX2 logical shifts always return a vector of 0s
30019 // if the shift amount is bigger than or equal to
30020 // the element size. The constant shift amount will be
30021 // encoded as a 8-bit immediate.
// Hence only the low 8 bits of the splat amount are compared.
30022 if (ShiftAmt.trunc(8).uge(MaxAmount))
30023 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
/// Dispatch the combines for a generic shift node \p N by opcode:
/// ISD::SHL goes to combineShiftLeft(), ISD::SRA to
/// combineShiftRightAlgebraic(), and every opcode other than ISD::SRA is also
/// offered to performShiftToAllZeros().
30029 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
30030 TargetLowering::DAGCombinerInfo &DCI,
30031 const X86Subtarget &Subtarget) {
30032 if (N->getOpcode() == ISD::SHL)
30033 if (SDValue V = combineShiftLeft(N, DAG))
30036 if (N->getOpcode() == ISD::SRA)
30037 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
30040 // Try to fold this logical shift into a zero vector.
30041 if (N->getOpcode() != ISD::SRA)
30042 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
/// Combine an X86ISD::VSHLI / X86ISD::VSRLI node \p N (logical vector shift
/// by an immediate, operand 1).
///
/// Handles out-of-range shift amounts (result is a zero vector), shifts of an
/// all-zeros build vector, and whole-byte shift amounts, which are offered to
/// the recursive shuffle combiner.
30048 static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
30049 TargetLowering::DAGCombinerInfo &DCI,
30050 const X86Subtarget &Subtarget) {
30051 assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
30052 "Unexpected opcode");
30053 EVT VT = N->getValueType(0);
30054 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
30056 // This fails for mask register (vXi1) shifts.
30057 if ((NumBitsPerElt % 8) != 0)
30060 // Out of range logical bit shifts are guaranteed to be zero.
// The amount is compared after being resized to 8 bits.
30061 APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
30062 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
30063 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30065 // Shift N0 by zero -> N0.
30067 return N->getOperand(0);
30069 // Shift zero -> zero.
30070 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
30071 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30073 // We can decode 'whole byte' logical bit shifts as shuffles.
30074 if ((ShiftVal.getZExtValue() % 8) == 0) {
30076 SmallVector<int, 1> NonceMask; // Just a placeholder.
30077 NonceMask.push_back(0);
30078 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30079 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30081 return SDValue(); // This routine will use CombineTo to replace N.
30087 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
30088 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
30089 /// OR -> CMPNEQSS.
///
/// The rewrite applies only when the shared X86ISD::CMP compares f32 or f64
/// values, no user of \p N is flagged as expecting flags, and the two
/// condition codes form the pair (COND_E, COND_NP) or (COND_NE, COND_P).
30090 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
30091 TargetLowering::DAGCombinerInfo &DCI,
30092 const X86Subtarget &Subtarget) {
30095 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
30096 // we're requiring SSE2 for both.
30097 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
30098 SDValue N0 = N->getOperand(0);
30099 SDValue N1 = N->getOperand(1);
// Operand 1 of each SETCC is its flags input.
30100 SDValue CMP0 = N0->getOperand(1);
30101 SDValue CMP1 = N1->getOperand(1);
30104 // The SETCCs should both refer to the same CMP.
30105 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
30108 SDValue CMP00 = CMP0->getOperand(0);
30109 SDValue CMP01 = CMP0->getOperand(1);
30110 EVT VT = CMP00.getValueType();
30112 if (VT == MVT::f32 || VT == MVT::f64) {
30113 bool ExpectingFlags = false;
30114 // Check for any users that want flags:
// The scan stops at the first user that sets ExpectingFlags.
30115 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
30116 !ExpectingFlags && UI != UE; ++UI)
30117 switch (UI->getOpcode()) {
30122 ExpectingFlags = true;
30124 case ISD::CopyToReg:
30125 case ISD::SIGN_EXTEND:
30126 case ISD::ZERO_EXTEND:
30127 case ISD::ANY_EXTEND:
30131 if (!ExpectingFlags) {
30132 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
30133 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
// When the second code is the equality code, the pair is reordered via a
// temporary so the check below only needs to handle one order.
30135 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
30136 X86::CondCode tmp = cc0;
30141 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
30142 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
30143 // FIXME: need symbolic constants for these magic numbers.
30144 // See X86ATTInstPrinter.cpp:printSSECC().
// 0 is used for the COND_E pairing and 4 for the COND_NE pairing.
30145 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
30146 if (Subtarget.hasAVX512()) {
// AVX-512 path: the compare yields an i1, zero-extended when the node's
// result type is wider.
30147 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
30149 DAG.getConstant(x86cc, DL, MVT::i8));
30150 if (N->getValueType(0) != MVT::i1)
30151 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
// Non-AVX-512 path: FSETCC produces its result in the floating-point type
// of the compared values.
30155 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
30156 CMP00.getValueType(), CMP00, CMP01,
30157 DAG.getConstant(x86cc, DL,
30160 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
30161 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
30163 if (is64BitFP && !Subtarget.is64Bit()) {
30164 // On a 32-bit target, we cannot bitcast the 64-bit float to a
30165 // 64-bit integer, since that's not a legal type. Since
30166 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
30167 // bits, but can do this little dance to extract the lowest 32 bits
30168 // and work with those going forward.
30169 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
30171 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
30172 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
30173 Vector32, DAG.getIntPtrConstant(0, DL));
// Reinterpret the FP mask as an integer, keep only bit 0, and narrow it to
// the i8 boolean that is returned.
30177 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
30178 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
30179 DAG.getConstant(1, DL, IntVT));
30180 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
30182 return OneBitOfTruth;
30190 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
///
/// The result type is tested against v2i64, v4i64 and v8i64. The XOR may
/// appear on either side of the AND. Its second operand, looked at through
/// bitcasts, must be an all-ones build vector or, for 256-bit types, an
/// INSERT_SUBVECTOR chain assembling all-ones from two all-ones halves.
30191 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
30192 assert(N->getOpcode() == ISD::AND);
30194 EVT VT = N->getValueType(0);
30195 SDValue N0 = N->getOperand(0);
30196 SDValue N1 = N->getOperand(1);
30199 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
30202 // Canonicalize XOR to the left.
30203 if (N1.getOpcode() == ISD::XOR)
30206 if (N0.getOpcode() != ISD::XOR)
30209 SDValue N00 = N0->getOperand(0);
30210 SDValue N01 = N0->getOperand(1);
// The all-ones operand may be hidden behind bitcasts.
30212 N01 = peekThroughBitcasts(N01);
30214 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
30215 // insert_subvector building a 256-bit AllOnes vector.
30216 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
30217 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
30220 SDValue V1 = N01->getOperand(0);
30221 SDValue V2 = N01->getOperand(1);
// Expected shape: the inner insert places an all-ones vector into undef and
// the outer insert adds a second all-ones vector.
30222 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
30223 !V1.getOperand(0).isUndef() ||
30224 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
30225 !ISD::isBuildVectorAllOnes(V2.getNode()))
// The ANDNP node is built from the XOR's first operand and the other AND
// operand.
30228 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
30231 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
30232 // register. In most cases we actually compare or select YMM-sized registers
30233 // and mixing the two types creates horrible code. This method optimizes
30234 // some of the transition sequences.
//
// N is an ANY_EXTEND/ZERO_EXTEND/SIGN_EXTEND (asserted below) whose operand is
// a 128-bit XOR/AND/OR. When the logic op's left input is a TRUNCATE and its
// right input is a TRUNCATE or a constant splat, the logic op is rebuilt in
// the wide (pre-truncation) type and the extension is re-expressed on that
// wide value.
30235 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30236 TargetLowering::DAGCombinerInfo &DCI,
30237 const X86Subtarget &Subtarget) {
30238 EVT VT = N->getValueType(0);
// Guard: the extended result type must be a 256-bit vector.
30239 if (!VT.is256BitVector())
30242 assert((N->getOpcode() == ISD::ANY_EXTEND ||
30243 N->getOpcode() == ISD::ZERO_EXTEND ||
30244 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
30246 SDValue Narrow = N->getOperand(0);
30247 EVT NarrowVT = Narrow->getValueType(0);
// Guard: the value being extended must be a 128-bit vector.
30248 if (!NarrowVT.is128BitVector())
// Guard: the narrow value must come from a bitwise logic operation.
30251 if (Narrow->getOpcode() != ISD::XOR &&
30252 Narrow->getOpcode() != ISD::AND &&
30253 Narrow->getOpcode() != ISD::OR)
30256 SDValue N0 = Narrow->getOperand(0);
30257 SDValue N1 = Narrow->getOperand(1);
30260 // The Left side has to be a trunc.
30261 if (N0.getOpcode() != ISD::TRUNCATE)
30264 // The type of the truncated inputs.
30265 EVT WideVT = N0->getOperand(0)->getValueType(0);
30269 // The right side has to be a 'trunc' or a constant vector.
30270 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
30271 ConstantSDNode *RHSConstSplat = nullptr;
30272 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
30273 RHSConstSplat = RHSBV->getConstantSplatNode();
30274 if (!RHSTrunc && !RHSConstSplat)
30277 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Guard: the logic opcode must be legal (or promotable) in the wide type.
30279 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
30282 // Set N0 and N1 to hold the inputs to the new wide operation.
30283 N0 = N0->getOperand(0);
30284 if (RHSConstSplat) {
// Zero-extend the splatted scalar to the wide element type, then splat it
// across the wide vector type.
30285 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
30286 SDValue(RHSConstSplat, 0));
30287 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
30288 } else if (RHSTrunc) {
30289 N1 = N1->getOperand(0);
30292 // Generate the wide operation.
30293 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
30294 unsigned Opcode = N->getOpcode();
// Re-apply the semantics of the original extension to the wide result.
30296 case ISD::ANY_EXTEND:
30298 case ISD::ZERO_EXTEND: {
// Zero extension: keep only the low InBits bits of every element.
30299 unsigned InBits = NarrowVT.getScalarSizeInBits();
30300 APInt Mask = APInt::getAllOnesValue(InBits);
30301 Mask = Mask.zext(VT.getScalarSizeInBits());
30302 return DAG.getNode(ISD::AND, DL, VT,
30303 Op, DAG.getConstant(Mask, DL, VT));
30305 case ISD::SIGN_EXTEND:
// Sign extension: sign-extend in place from the narrow element width.
30306 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
30307 Op, DAG.getValueType(NarrowVT));
30309 llvm_unreachable("Unexpected opcode");
30313 /// If both input operands of a logic op are being cast from floating point
30314 /// types, try to convert this into a floating point logic node to avoid
30315 /// unnecessary moves from SSE to integer registers.
///
/// N must be an ISD::AND, ISD::OR or ISD::XOR (asserted below). The transform
/// applies to i32 results with SSE1 and to i64 results with SSE2; on success
/// the FP logic node is bitcast back to N's integer type.
30316 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
30317 const X86Subtarget &Subtarget) {
// Map the integer logic opcode to its X86 floating-point counterpart;
// ISD::DELETED_NODE serves as the "no mapping" sentinel.
30318 unsigned FPOpcode = ISD::DELETED_NODE;
30319 if (N->getOpcode() == ISD::AND)
30320 FPOpcode = X86ISD::FAND;
30321 else if (N->getOpcode() == ISD::OR)
30322 FPOpcode = X86ISD::FOR;
30323 else if (N->getOpcode() == ISD::XOR)
30324 FPOpcode = X86ISD::FXOR;
30326 assert(FPOpcode != ISD::DELETED_NODE &&
30327 "Unexpected input node for FP logic conversion");
30329 EVT VT = N->getValueType(0);
30330 SDValue N0 = N->getOperand(0);
30331 SDValue N1 = N->getOperand(1);
// Both operands must be bitcasts, and the subtarget must provide the SSE
// level matching the integer width.
30333 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
30334 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
30335 (Subtarget.hasSSE2() && VT == MVT::i64))) {
30336 SDValue N00 = N0.getOperand(0);
30337 SDValue N10 = N1.getOperand(0);
30338 EVT N00Type = N00.getValueType();
30339 EVT N10Type = N10.getValueType();
// Only proceed when both bitcast sources are floating-point values; the FP
// logic node is created in the type of the first source.
30340 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
30341 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
30342 return DAG.getBitcast(VT, FPLogic);
30348 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
30349 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
30350 /// eliminate loading the vector constant mask value. This relies on the fact
30351 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
///
/// On a match the result is an X86ISD::VSRLI of the compare by
/// (element width - 1), bitcast to N's value type.
30352 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
// Look through bitcasts on both operands of N.
30353 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
30354 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
30356 // TODO: Use AssertSext to mark any nodes that have the property of producing
30357 // all-ones or all-zeros. Then check for that node rather than particular
30359 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
30362 // The existence of the PCMP node guarantees that we have the required SSE2 or
30363 // AVX2 for a shift of this vector type, but there is no vector shift by
30364 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
30365 // masked compare nodes, so they should not make it here.
30366 EVT VT0 = Op0.getValueType();
30367 EVT VT1 = Op1.getValueType();
30368 unsigned EltBitWidth = VT0.getScalarSizeInBits();
// Guard: after peeking through bitcasts both operands must have the same
// type, and byte elements are excluded.
30369 if (VT0 != VT1 || EltBitWidth == 8)
30372 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
// Guard: the second operand must be a constant splat of the value 1.
30375 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
// Shifting an all-ones/all-zeros element right by (width - 1) leaves 1 or 0,
// which is the same value the AND with 1 would produce.
30379 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
30380 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
30381 return DAG.getBitcast(N->getValueType(0), Shift);
/// DAG combine entry point for AND nodes. Tries, in order, the shared helper
/// combines (compare-equal, FP logic, ANDNP, PCMP-and-1), then a recursive
/// shuffle combine for vector masks, and finally BEXTR formation for scalar
/// i32/i64 "(X >> imm) & mask" patterns on subtargets with BMI or TBM.
30384 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
30385 TargetLowering::DAGCombinerInfo &DCI,
30386 const X86Subtarget &Subtarget) {
30387 if (DCI.isBeforeLegalizeOps())
30390 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30393 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30396 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
30399 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
30402 EVT VT = N->getValueType(0);
30403 SDValue N0 = N->getOperand(0);
30404 SDValue N1 = N->getOperand(1);
30407 // Attempt to recursively combine a bitmask AND with shuffles.
// Restricted to vectors whose element width is a whole number of bytes.
30408 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
30410 SmallVector<int, 1> NonceMask; // Just a placeholder.
30411 NonceMask.push_back(0);
30412 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30413 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30415 return SDValue(); // This routine will use CombineTo to replace N.
30418 // Create BEXTR instructions
30419 // BEXTR is ((X >> imm) & (2**size-1))
30420 if (VT != MVT::i32 && VT != MVT::i64)
30423 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
30425 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
// Both the AND mask and the shift amount must be constants.
30428 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
30429 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
30430 if (MaskNode && ShiftNode) {
30431 uint64_t Mask = MaskNode->getZExtValue();
30432 uint64_t Shift = ShiftNode->getZExtValue();
// isMask_64 accepts only a contiguous run of ones starting at bit 0, so the
// population count of the mask is the width of the extracted field.
30433 if (isMask_64(Mask)) {
30434 uint64_t MaskSize = countPopulation(Mask);
// The extracted field must lie entirely inside the value. The BEXTR control
// operand packs the start bit in the low byte and the length from bit 8 up.
30435 if (Shift + MaskSize <= VT.getSizeInBits())
30436 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
30437 DAG.getConstant(Shift | (MaskSize << 8), DL,
// Fold an OR of a masked AND and a masked ANDNP into a blend:
30445 // (or (and (m, y), (pandn m, x)))
// becomes
30447 // (vselect m, x, y)
30448 // As a special case, try to fold:
30449 // (or (and (m, (sub 0, x)), (pandn m, x)))
// into
30451 // (sub (xor X, M), M)
//
// N must be an ISD::OR (asserted below). The mask must be an arithmetic
// shift right by (element bits - 1), i.e. a per-element sign splat.
30452 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
30453 const X86Subtarget &Subtarget) {
30454 assert(N->getOpcode() == ISD::OR);
30456 SDValue N0 = N->getOperand(0);
30457 SDValue N1 = N->getOperand(1);
30458 EVT VT = N->getValueType(0);
// Guard: only v2i64, or v4i64 when the subtarget has Int256 support.
30460 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
30462 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
30464 // Canonicalize pandn to RHS
30465 if (N0.getOpcode() == X86ISD::ANDNP)
30468 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
// ANDNP computes (~Mask & X); find the AND operand paired with the same mask.
30471 SDValue Mask = N1.getOperand(0);
30472 SDValue X = N1.getOperand(1);
30474 if (N0.getOperand(0) == Mask)
30475 Y = N0.getOperand(1);
30476 if (N0.getOperand(1) == Mask)
30477 Y = N0.getOperand(0);
30479 // Check to see if the mask appeared in both the AND and ANDNP.
30483 // Validate that X, Y, and Mask are bitcasts, and see through them.
30484 Mask = peekThroughBitcasts(Mask);
30485 X = peekThroughBitcasts(X);
30486 Y = peekThroughBitcasts(Y);
30488 EVT MaskVT = Mask.getValueType();
30490 // Validate that the Mask operand is a vector sra node.
30491 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
30492 // there is no psrai.b
30493 unsigned EltBits = MaskVT.getScalarSizeInBits();
// ~0 acts as "no shift amount found"; (SraAmt + 1) then wraps to 0 and can
// never equal EltBits in the check below.
30494 unsigned SraAmt = ~0;
30495 if (Mask.getOpcode() == ISD::SRA) {
30496 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
30497 if (auto *AmtConst = AmtBV->getConstantSplatNode())
30498 SraAmt = AmtConst->getZExtValue();
30499 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
30500 SDValue SraC = Mask.getOperand(1);
30501 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
// The shift must smear the sign bit across the whole element.
30503 if ((SraAmt + 1) != EltBits)
30509 // (or (and (M, (sub 0, X)), (pandn M, X)))
30510 // which is a special case of vselect:
30511 // (vselect M, (sub 0, X), X)
30513 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
30514 // We know that, if fNegate is 0 or 1:
30515 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
30517 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
30518 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
30519 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
30520 // This lets us transform our vselect to:
30521 // (add (xor X, M), (and M, 1))
30523 // (sub (xor X, M), M)
30524 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
// IsNegV(N, V) is true when N computes (sub 0, V), i.e. the negation of V.
30525 auto IsNegV = [](SDNode *N, SDValue V) {
30526 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
30527 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
// The negation may sit on either side of the select.
30530 if (IsNegV(Y.getNode(), X))
30532 else if (IsNegV(X.getNode(), Y))
30536 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
30537 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
30538 SDValue SubOp2 = Mask;
30540 // If the negate was on the false side of the select, then
30541 // the operands of the SUB need to be swapped. PR 27251.
30542 // This is because the pattern being matched above is
30543 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
30544 // but if the pattern matched was
30545 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
30546 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
30547 // pattern also needs to be a negation of the replacement pattern above.
30548 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
30549 // sub accomplishes the negation of the replacement pattern.
30551 std::swap(SubOp1, SubOp2);
30553 return DAG.getBitcast(VT,
30554 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
30558 // PBLENDVB is only available on SSE 4.1.
30559 if (!Subtarget.hasSSE41())
// Perform the blend on byte vectors of the same total width as VT, then
// bitcast the result back to VT.
30562 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
30564 X = DAG.getBitcast(BlendVT, X);
30565 Y = DAG.getBitcast(BlendVT, Y);
30566 Mask = DAG.getBitcast(BlendVT, Mask);
// Note: 'Mask' is reused below to hold the VSELECT result.
30567 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
30568 return DAG.getBitcast(VT, Mask);
30571 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Rewrites a setcc(eq, (cmp x, 0)) node as:
30575 // srl(ctlz x), log2(bitsize(x))
30576 // Input pattern is checked by caller.
//
// Op is the setcc node (its operand 1 is the compare); ExtTy is the type the
// result is zero-extended or truncated to. ctlz(x) equals bitsize(x) only
// when x is zero, so shifting right by log2(bitsize) yields 1 exactly then.
30577 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
30578 SelectionDAG &DAG) {
30579 SDValue Cmp = Op.getOperand(1);
30580 EVT VT = Cmp.getOperand(0).getValueType();
30581 unsigned Log2b = Log2_32(VT.getSizeInBits());
30583 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
30584 // The result of the shift is true or false, and on X86, the 32-bit
30585 // encoding of shr and lzcnt is more desirable.
30586 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
// NOTE(review): the shift-amount constant is created with type VT although
// the shift itself is an i32 operation; confirm whether the target's
// shift-amount type should be used here instead.
30587 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
30588 DAG.getConstant(Log2b, dl, VT));
30589 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
30592 // Try to transform:
30593 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
30595 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
30596 // Will also attempt to match more generic cases, eg:
30597 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
30598 // Only applies if the target supports the FastLZCNT feature.
//
// N is the outer zero-extend node; the chain of single-use ORs beneath it is
// walked from the outermost OR down to the innermost pair of setcc nodes.
30599 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
30600 TargetLowering::DAGCombinerInfo &DCI,
30601 const X86Subtarget &Subtarget) {
30602 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
// An OR is a candidate only if it has a single use.
30605 auto isORCandidate = [](SDValue N) {
30606 return (N->getOpcode() == ISD::OR && N->hasOneUse());
30609 // Check the zero extend is extending to 32-bit or more. The code generated by
30610 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
30611 // instructions to clear the upper bits.
30612 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
30613 !isORCandidate(N->getOperand(0)))
30616 // Check the node matches: setcc(eq, cmp 0)
// Additionally requires a single use and a compare whose value type is at
// least as wide as i32.
30617 auto isSetCCCandidate = [](SDValue N) {
30618 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
30619 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
30620 N->getOperand(1).getOpcode() == X86ISD::CMP &&
30621 N->getOperand(1).getConstantOperandVal(1) == 0 &&
30622 N->getOperand(1).getValueType().bitsGE(MVT::i32);
30625 SDNode *OR = N->getOperand(0).getNode();
30626 SDValue LHS = OR->getOperand(0);
30627 SDValue RHS = OR->getOperand(1);
30629 // Save nodes matching or(or, setcc(eq, cmp 0)).
// Each iteration records the current OR and descends into whichever operand
// is itself an OR.
30630 SmallVector<SDNode *, 2> ORNodes;
30631 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
30632 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
30633 ORNodes.push_back(OR);
30634 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
30635 LHS = OR->getOperand(0);
30636 RHS = OR->getOperand(1);
30639 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
30640 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
30641 !isORCandidate(SDValue(OR, 0)))
30644 // We have a or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
// to:
30646 // or(srl(ctlz),srl(ctlz)).
30647 // The dag combiner can then fold it into:
30648 // srl(or(ctlz, ctlz)).
30649 EVT VT = OR->getValueType(0);
30650 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
30651 SDValue Ret, NewRHS;
30652 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
30653 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
30658 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
// The saved ORs are popped innermost-first, so Ret accumulates the lowered
// chain from the inside out.
30659 while (ORNodes.size() > 0) {
30660 OR = ORNodes.pop_back_val();
30661 LHS = OR->getOperand(0);
30662 RHS = OR->getOperand(1);
30663 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
30664 if (RHS->getOpcode() == ISD::OR)
30665 std::swap(LHS, RHS);
30666 EVT VT = OR->getValueType(0);
30667 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
30670 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
// Re-apply the outer zero extension to N's result type.
30674 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
/// DAG combine entry point for OR nodes. After the shared helper combines
/// (compare-equal, FP logic, logic-blend), it tries to turn a scalar
/// i16/i32/i64 OR of opposing shifts into a double-shift X86ISD::SHLD or
/// X86ISD::SHRD node.
30679 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
30680 TargetLowering::DAGCombinerInfo &DCI,
30681 const X86Subtarget &Subtarget) {
30682 if (DCI.isBeforeLegalizeOps())
30685 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30688 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30691 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
30694 SDValue N0 = N->getOperand(0);
30695 SDValue N1 = N->getOperand(1);
30696 EVT VT = N->getValueType(0);
30698 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
30701 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
30702 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
30704 // SHLD/SHRD instructions have lower register pressure, but on some
30705 // platforms they have higher latency than the equivalent
30706 // series of shifts/or that would otherwise be generated.
30707 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
30708 // have higher latencies and we are not optimizing for size.
30709 if (!OptForSize && Subtarget.isSHLDSlow())
// Canonicalize so that N0 is the SHL and N1 is the SRL.
30712 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
30714 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
30716 if (!N0.hasOneUse() || !N1.hasOneUse())
// Both shift amounts must be i8; truncates feeding them are looked through.
30719 SDValue ShAmt0 = N0.getOperand(1);
30720 if (ShAmt0.getValueType() != MVT::i8)
30722 SDValue ShAmt1 = N1.getOperand(1);
30723 if (ShAmt1.getValueType() != MVT::i8)
30725 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
30726 ShAmt0 = ShAmt0.getOperand(0);
30727 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
30728 ShAmt1 = ShAmt1.getOperand(0);
// Default to SHLD. If the SHL's amount is the derived one (a SUB or XOR),
// the SRL carries the base amount, so swap roles and emit SHRD instead.
30731 unsigned Opc = X86ISD::SHLD;
30732 SDValue Op0 = N0.getOperand(0);
30733 SDValue Op1 = N1.getOperand(0);
30734 if (ShAmt0.getOpcode() == ISD::SUB ||
30735 ShAmt0.getOpcode() == ISD::XOR) {
30736 Opc = X86ISD::SHRD;
30737 std::swap(Op0, Op1);
30738 std::swap(ShAmt0, ShAmt1);
30741 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
30742 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
30743 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
30744 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
30745 unsigned Bits = VT.getSizeInBits();
// Case 1: the second amount is (Bits - C) with a variable C.
30746 if (ShAmt1.getOpcode() == ISD::SUB) {
30747 SDValue Sum = ShAmt1.getOperand(0);
30748 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
30749 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
30750 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
30751 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
30752 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
30753 return DAG.getNode(Opc, DL, VT,
30755 DAG.getNode(ISD::TRUNCATE, DL,
// Case 2: both amounts are constants that add up to the bit width.
30758 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
30759 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
30760 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
30761 return DAG.getNode(Opc, DL, VT,
30762 N0.getOperand(0), N1.getOperand(0),
30763 DAG.getNode(ISD::TRUNCATE, DL,
// Case 3: the second amount is (C xor (Bits - 1)) and the other value was
// pre-shifted by one, which together amount to a shift by (Bits - C).
30765 } else if (ShAmt1.getOpcode() == ISD::XOR) {
30766 SDValue Mask = ShAmt1.getOperand(1);
30767 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
30768 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
30769 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
30770 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
30771 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
30772 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
30773 if (Op1.getOpcode() == InnerShift &&
30774 isa<ConstantSDNode>(Op1.getOperand(1)) &&
30775 Op1.getConstantOperandVal(1) == 1) {
30776 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30777 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30779 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
30780 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
30781 Op1.getOperand(0) == Op1.getOperand(1)) {
30782 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
30783 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
30792 /// Generate NEG and CMOV for integer abs.
///
/// Matches the branch-free absolute-value idiom XOR(ADD(X, Y), Y) with
/// Y = SRA(X, size(X)-1) and replaces it with a flag-producing negation
/// followed by a conditional move between X and its negation.
30793 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
30794 EVT VT = N->getValueType(0);
30796 // Since X86 does not have CMOV for 8-bit integer, we don't convert
30797 // 8-bit integer abs to NEG and CMOV.
30798 if (VT.isInteger() && VT.getSizeInBits() == 8)
30801 SDValue N0 = N->getOperand(0);
30802 SDValue N1 = N->getOperand(1);
30805 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
30806 // and change it to SUB and CMOV.
30807 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
30808 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
30809 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
// The SRA amount must be exactly (bit width - 1), i.e. a sign splat.
30810 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
30811 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
30812 // Generate SUB & CMOV.
// Neg computes (0 - X); its second result (the i32 value) is passed to the
// CMOV as the flags operand, together with condition code COND_GE.
30813 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
30814 DAG.getConstant(0, DL, VT), N0.getOperand(0));
30815 SDValue Ops[] = {N0.getOperand(0), Neg,
30816 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
30817 SDValue(Neg.getNode(), 1)};
30818 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
30824 /// Try to turn tests against the signbit in the form of:
30825 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into a signed greater-than comparison of X against -1 (ISD::SETGT),
/// zero-extended to the result type when the setcc result type differs.
30828 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
30829 // This is only worth doing if the output type is i8 or i1.
30830 EVT ResultType = N->getValueType(0);
30831 if (ResultType != MVT::i8 && ResultType != MVT::i1)
30834 SDValue N0 = N->getOperand(0);
30835 SDValue N1 = N->getOperand(1);
30837 // We should be performing an xor against a truncated shift.
30838 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
30841 // Make sure we are performing an xor against one.
30842 if (!isOneConstant(N1))
30845 // SetCC on x86 zero extends so only act on this if it's a logical shift.
30846 SDValue Shift = N0.getOperand(0);
30847 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
30850 // Make sure we are truncating from one of i16, i32 or i64.
30851 EVT ShiftTy = Shift.getValueType();
30852 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
30855 // Make sure the shift amount extracts the sign bit.
30856 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
30857 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
30860 // Create a greater-than comparison against -1.
30861 // N.B. Using SETGE against 0 works but we want a canonical looking
30862 // comparison, using SETGT matches up with what TranslateX86CC.
30864 SDValue ShiftOp = Shift.getOperand(0);
30865 EVT ShiftOpTy = ShiftOp.getValueType();
30866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// Ask the target which type a setcc producing ResultType should use.
30867 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
30868 *DAG.getContext(), ResultType);
30869 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
30870 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
30871 if (SetCCResultType != ResultType)
30872 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
30876 /// Turn vector tests of the signbit in the form of:
30877 /// xor (sra X, elt_size(X)-1), -1
/// into an X86ISD::PCMPGT of X against the all-ones vector.
30881 /// This should be called before type legalization because the pattern may not
30882 /// persist after that.
30883 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
30884 const X86Subtarget &Subtarget) {
30885 EVT VT = N->getValueType(0);
30886 if (!VT.isSimple())
// Each supported vector type requires a minimum subtarget feature level;
// any other type is rejected by the default case.
30889 switch (VT.getSimpleVT().SimpleTy) {
30890 default: return SDValue();
30893 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
30894 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
30898 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
30901 // There must be a shift right algebraic before the xor, and the xor must be a
30902 // 'not' operation.
30903 SDValue Shift = N->getOperand(0);
30904 SDValue Ones = N->getOperand(1);
30905 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
30906 !ISD::isBuildVectorAllOnes(Ones.getNode()))
30909 // The shift should be smearing the sign bit across each vector element.
30910 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
30914 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
30915 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
30916 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
30919 // Create a greater-than comparison against -1. We don't use the more obvious
30920 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
30921 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
30924 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
30925 /// which is c = (a + b + 1) / 2, and replace this operation with the efficient
30926 /// X86ISD::AVG instruction.
///
/// In is the wide value being narrowed (expected to be a logical shift right
/// by one) and VT is the narrow i8/i16 vector type of the result.
30927 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
30928 const X86Subtarget &Subtarget,
30930 if (!VT.isVector() || !VT.isSimple())
30932 EVT InVT = In.getValueType();
30933 unsigned NumElems = VT.getVectorNumElements();
// Guard: elements must be i8 or i16 and the element count a power of two.
30935 EVT ScalarVT = VT.getVectorElementType();
30936 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
30937 isPowerOf2_32(NumElems)))
30940 // InScalarVT is the intermediate type in AVG pattern and it should be greater
30941 // than the original input type (i8/i16).
30942 EVT InScalarVT = InVT.getVectorElementType();
30943 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
// The maximum accepted vector width depends on the subtarget: 512 bits with
// BWI, 256 bits with AVX2, otherwise 128 bits (SSE2 is the minimum).
30946 if (!Subtarget.hasSSE2())
30948 if (Subtarget.hasBWI()) {
30949 if (VT.getSizeInBits() > 512)
30951 } else if (Subtarget.hasAVX2()) {
30952 if (VT.getSizeInBits() > 256)
30955 if (VT.getSizeInBits() > 128)
30959 // Detect the following pattern:
30961 // %1 = zext <N x i8> %a to <N x i32>
30962 // %2 = zext <N x i8> %b to <N x i32>
30963 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
30964 // %4 = add nuw nsw <N x i32> %3, %2
30965 // %5 = lshr <N x i32> %4, <i32 1 x N>
30966 // %6 = trunc <N x i32> %5 to <N x i8>
30968 // In AVX512, the last instruction can also be a trunc store.
30970 if (In.getOpcode() != ISD::SRL)
30973 // A lambda checking the given SDValue is a constant vector and each element
30974 // is in the range [Min, Max].
30975 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
30976 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
30977 if (!BV || !BV->isConstant())
30979 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
30980 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
30983 uint64_t Val = C->getZExtValue();
30984 if (Val < Min || Val > Max)
30990 // Check if each element of the vector is right-shifted by one.
30991 auto LHS = In.getOperand(0);
30992 auto RHS = In.getOperand(1);
30993 if (!IsConstVectorInRange(RHS, 1, 1))
30995 if (LHS.getOpcode() != ISD::ADD)
30998 // Detect a pattern of a + b + 1 where the order doesn't matter.
30999 SDValue Operands[3];
31000 Operands[0] = LHS.getOperand(0);
31001 Operands[1] = LHS.getOperand(1);
31003 // Take care of the case when one of the operands is a constant vector whose
31004 // element is in the range [1, 256] for i8 results, or [1, 65536] for i16.
31005 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
31006 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
31007 Operands[0].getOperand(0).getValueType() == VT) {
31008 // The pattern is detected. Subtract one from the constant vector, then
31009 // demote it and emit X86ISD::AVG instruction.
31010 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
31011 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
31012 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
31013 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
// General case: one operand of the outer ADD must itself be an ADD. Move it
// into Operands[1] and split it so all three addends are in Operands[0..2].
31017 if (Operands[0].getOpcode() == ISD::ADD)
31018 std::swap(Operands[0], Operands[1]);
31019 else if (Operands[1].getOpcode() != ISD::ADD)
31021 Operands[2] = Operands[1].getOperand(0);
31022 Operands[1] = Operands[1].getOperand(1);
31024 // Now we have three operands of two additions. Check that one of them is a
31025 // constant vector with ones, and the other two are promoted from i8/i16.
31026 for (int i = 0; i < 3; ++i) {
31027 if (!IsConstVectorInRange(Operands[i], 1, 1))
// Park the all-ones constant in Operands[2] so the two remaining addends
// sit in Operands[0] and Operands[1].
31029 std::swap(Operands[i], Operands[2]);
31031 // Check if Operands[0] and Operands[1] are results of type promotion.
31032 for (int j = 0; j < 2; ++j)
31033 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
31034 Operands[j].getOperand(0).getValueType() != VT)
31037 // The pattern is detected, emit X86ISD::AVG instruction.
31038 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31039 Operands[1].getOperand(0));
/// DAG combine for load nodes. The visible transform splits a non-extending
/// 256-bit vector load that the target reports as allowed but not fast into
/// two half-width loads at byte offsets 0 and 16, joins their chains with a
/// TokenFactor, and reassembles the halves into the original register type.
31045 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
31046 TargetLowering::DAGCombinerInfo &DCI,
31047 const X86Subtarget &Subtarget) {
31048 LoadSDNode *Ld = cast<LoadSDNode>(N);
31049 EVT RegVT = Ld->getValueType(0);
31050 EVT MemVT = Ld->getMemoryVT();
31052 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31054 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
31055 // into two 16-byte operations.
31056 ISD::LoadExtType Ext = Ld->getExtensionType();
31058 unsigned AddressSpace = Ld->getAddressSpace();
31059 unsigned Alignment = Ld->getAlignment();
// allowsMemoryAccess reports through 'Fast' whether the access is fast; the
// split only happens when the access is permitted but not fast.
31060 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
31061 Ext == ISD::NON_EXTLOAD &&
31062 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
31063 AddressSpace, Alignment, &Fast) && !Fast) {
31064 unsigned NumElems = RegVT.getVectorNumElements();
31068 SDValue Ptr = Ld->getBasePtr();
31070 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
31073 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31074 Alignment, Ld->getMemOperand()->getFlags());
// The second half lives 16 bytes past the base, so its alignment is capped
// at 16.
// NOTE(review): both half loads are given the same pointer info as the
// original load; confirm whether the second one should carry a 16-byte
// offset.
31076 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
31078 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31079 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
31080 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31082 Load2.getValue(1));
// Rebuild the full-width value: low half at element 0, high half at
// element NumElems / 2.
31084 SDValue NewVec = DAG.getUNDEF(RegVT);
31085 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
31086 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
31087 return DCI.CombineTo(N, NewVec, TF, true);
31093 /// If V is a build vector of boolean constants and exactly one of those
31094 /// constants is true, return the operand index of that true element.
31095 /// Otherwise, return -1.
31096 static int getOneTrueElt(SDValue V) {
31097 // This needs to be a build vector of booleans.
31098 // TODO: Checking for the i1 type matches the IR definition for the mask,
31099 // but the mask check could be loosened to i8 or other types. That might
31100 // also require checking more than 'allOnesValue'; eg, the x86 HW
31101 // instructions only require that the MSB is set for each mask element.
31102 // The ISD::MSTORE comments/definition do not specify how the mask operand
31104 auto *BV = dyn_cast<BuildVectorSDNode>(V);
31105 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
// TrueIndex stays -1 until a true (all-ones) element has been seen.
31108 int TrueIndex = -1;
31109 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
31110 for (unsigned i = 0; i < NumElts; ++i) {
31111 const SDValue &Op = BV->getOperand(i);
31114 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
// An element counts as "true" when its constant value is all ones.
31117 if (ConstNode->getAPIntValue().isAllOnesValue()) {
31118 // If we already found a one, this is too many.
31119 if (TrueIndex >= 0)
31127 /// Given a masked memory load/store operation, return true if it has one mask
31128 /// bit set. If it has one mask bit set, then also return the memory address of
31129 /// the scalar element to load/store, the vector index to insert/extract that
31130 /// scalar element, and the alignment for the scalar memory access.
///
/// Addr, Index and Alignment are output parameters written by this function.
31131 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
31132 SelectionDAG &DAG, SDValue &Addr,
31133 SDValue &Index, unsigned &Alignment) {
31134 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
// A negative index means the mask does not have exactly one true element.
31135 if (TrueMaskElt < 0)
31138 // Get the address of the one scalar element that is specified by the mask
31139 // using the appropriate offset from the base pointer.
31140 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
31141 Addr = MaskedOp->getBasePtr();
// Element 0 lives at the base pointer itself, so no offset is needed then.
31142 if (TrueMaskElt != 0) {
31143 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
31144 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
31147 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
// The scalar access alignment is derived from the vector alignment and the
// element store size via MinAlign.
31148 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
31152 /// If exactly one element of the mask is set for a non-extending masked load,
31153 /// it is a scalar load and vector insert.
31154 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31155 /// mask have already been optimized in IR, so we don't bother with those here.
///
/// On success the masked load ML is replaced, via DCI.CombineTo, by an
/// INSERT_VECTOR_ELT into ML's pass-through value (getSrc0) together with the
/// chain result of the new scalar load.
31157 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31158 TargetLowering::DAGCombinerInfo &DCI) {
31159 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31160 // However, some target hooks may need to be added to know when the transform
31161 // is profitable. Endianness would also have to be considered.
31163 SDValue Addr, VecIndex;
31164 unsigned Alignment;
31165 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
31168 // Load the one scalar element that is specified by the mask using the
31169 // appropriate offset from the base pointer.
31171 EVT VT = ML->getValueType(0);
31172 EVT EltVT = VT.getVectorElementType();
31174 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
31175 Alignment, ML->getMemOperand()->getFlags());
31177 // Insert the loaded element into the appropriate place in the vector.
31178 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
31180 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
/// Combine a masked load whose mask is a build vector of constants.
/// Two rewrites are attempted, in order:
///  1. If neither the first nor the last mask element is a null constant,
///     replace the masked load with a plain vector load plus a select against
///     the pass-through operand.
///  2. Otherwise, unless the pass-through operand is already undef, re-emit
///     the masked load with an undef pass-through and select the original
///     pass-through in afterwards.
/// In both cases ML is replaced through DCI.CombineTo.
31184 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31185 TargetLowering::DAGCombinerInfo &DCI) {
// Only constant build-vector masks are handled here.
31186 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
31190 EVT VT = ML->getValueType(0);
31192 // If we are loading the first and last elements of a vector, it is safe and
31193 // always faster to load the whole vector. Replace the masked load with a
31194 // vector load and select.
31195 unsigned NumElts = VT.getVectorNumElements();
31196 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
// An element counts as "loaded" when its mask operand is not a null constant.
31197 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
31198 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
31199 if (LoadFirstElt && LoadLastElt) {
// Full-width load through the original memory operand; the select then keeps
// pass-through lanes wherever the mask is false.
31200 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31201 ML->getMemOperand());
31202 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
31203 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
31206 // Convert a masked load with a constant mask into a masked load and a select.
31207 // This allows the select operation to use a faster kind of select instruction
31208 // (for example, vblendvps -> vblendps).
31210 // Don't try this if the pass-through operand is already undefined. That would
31211 // cause an infinite loop because that's what we're about to create.
31212 if (ML->getSrc0().isUndef())
31215 // The new masked load has an undef pass-through operand. The select uses the
31216 // original pass-through operand.
// Memory VT, memory operand and extension type are carried over unchanged.
31217 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31218 ML->getMask(), DAG.getUNDEF(VT),
31219 ML->getMemoryVT(), ML->getMemOperand(),
31220 ML->getExtensionType());
31221 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
31223 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
/// Target DAG combine for masked load nodes.
/// Expanding loads are tested for up front (see the TODO). For non-extending
/// loads the scalar-load reduction is tried first and, when the subtarget has
/// no AVX512, the constant-mask combine second. The remainder of the function
/// handles sign-extending masked loads: the pass-through value and the mask
/// are rearranged onto a vector of the memory element type that has the same
/// total bit width as the result type, a masked load is emitted on that wide
/// type, and the result is extended with X86ISD::VSEXT.
31226 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
31227 TargetLowering::DAGCombinerInfo &DCI,
31228 const X86Subtarget &Subtarget) {
31229 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
31231 // TODO: Expanding load with constant mask may be optimized as well.
31232 if (Mld->isExpandingLoad())
31235 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
// Single-true-element masks become a scalar load plus vector insert.
31236 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
31238 // TODO: Do some AVX512 subsets benefit from this transform?
31239 if (!Subtarget.hasAVX512())
31240 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
// Everything below is specific to sign-extending loads.
31244 if (Mld->getExtensionType() != ISD::SEXTLOAD)
31247 // Resolve extending loads.
31248 EVT VT = Mld->getValueType(0);
31249 unsigned NumElems = VT.getVectorNumElements();
31250 EVT LdVT = Mld->getMemoryVT();
31253 assert(LdVT != VT && "Cannot extend to the same type");
// ToSz is the element width of the result, FromSz the element width in memory.
31254 unsigned ToSz = VT.getScalarSizeInBits();
31255 unsigned FromSz = LdVT.getScalarSizeInBits();
31256 // From/To sizes and ElemCount must be pow of two.
31257 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31258 "Unexpected size for extending masked load");
// Number of memory-sized elements that fit in one result element.
31260 unsigned SizeRatio = ToSz / FromSz;
31261 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
31263 // Create a type on which we perform the shuffle.
// Same total bit width as VT, but made of memory-sized elements.
31264 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31265 LdVT.getScalarType(), NumElems*SizeRatio);
31266 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31268 // Convert Src0 value.
31269 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
// An undef pass-through needs no shuffle; the bitcast alone is enough.
31270 if (!Mld->getSrc0().isUndef()) {
// Lane i of the shuffle result takes wide lane i * SizeRatio; every other lane
// keeps the initial -1 index.
31271 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31272 for (unsigned i = 0; i != NumElems; ++i)
31273 ShuffleVec[i] = i * SizeRatio;
31275 // Can't shuffle using an illegal type.
31276 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31277 "WideVecVT should be legal");
31278 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
31279 DAG.getUNDEF(WideVecVT), ShuffleVec);
31281 // Prepare the new mask.
31283 SDValue Mask = Mld->getMask();
31284 if (Mask.getValueType() == VT) {
31285 // Mask and original value have the same type.
31286 NewMask = DAG.getBitcast(WideVecVT, Mask);
// The first NumElems lanes pick wide lane i * SizeRatio of the bitcast mask.
// The remaining lanes all use index NumElems * SizeRatio, i.e. element 0 of
// the second shuffle operand, which is the zero constant vector.
31287 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31288 for (unsigned i = 0; i != NumElems; ++i)
31289 ShuffleVec[i] = i * SizeRatio;
31290 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
31291 ShuffleVec[i] = NumElems * SizeRatio;
31292 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31293 DAG.getConstant(0, dl, WideVecVT),
// Otherwise the mask must be a vector of i1; it is widened by concatenating
// NumConcat pieces of the original mask type into one CONCAT_VECTORS node.
31296 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31297 unsigned WidenNumElts = NumElems*SizeRatio;
31298 unsigned MaskNumElts = VT.getVectorNumElements();
31299 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31302 unsigned NumConcat = WidenNumElts / MaskNumElts;
31303 SmallVector<SDValue, 16> Ops(NumConcat);
// All-zero vector of the original mask type, available as padding.
31304 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31306 for (unsigned i = 1; i != NumConcat; ++i)
31309 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
// Masked load on the wide type, using the rearranged mask and pass-through.
31312 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
31313 Mld->getBasePtr(), NewMask, WideSrc0,
31314 Mld->getMemoryVT(), Mld->getMemOperand(),
// Extend the wide load to the original result type and replace N's value
// result with it; N's second result is replaced by WideLd's result #1.
31316 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
31317 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
31320 /// If exactly one element of the mask is set for a non-truncating masked store,
31321 /// it is a vector extract and scalar store.
31322 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31323 /// mask have already been optimized in IR, so we don't bother with those here.
/// On success the returned node is the scalar store of the extracted element.
31324 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
31325 SelectionDAG &DAG) {
31326 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31327 // However, some target hooks may need to be added to know when the transform
31328 // is profitable. Endianness would also have to be considered.
// Addr, VecIndex and Alignment are filled in by the helper below when the
// mask has exactly one true element.
31330 SDValue Addr, VecIndex;
31331 unsigned Alignment;
31332 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
31335 // Extract the one scalar element that is actually being stored.
31337 EVT VT = MS->getValue().getValueType();
31338 EVT EltVT = VT.getVectorElementType();
31339 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
31340 MS->getValue(), VecIndex);
31342 // Store that element at the appropriate offset from the base pointer.
// The scalar store reuses the masked store's chain and memory-operand flags.
// NOTE(review): Addr may have been advanced past the base pointer, yet
// MS->getPointerInfo() is passed without a matching offset - confirm intended.
31343 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
31344 Alignment, MS->getMemOperand()->getFlags());
/// Target DAG combine for masked store nodes.
/// Compressing stores are tested for up front. Non-truncating stores are
/// handed to reduceMaskedStoreToScalarStore. For truncating stores that are not
/// already legal, the stored value is bitcast to a vector of memory-sized
/// elements, the low part of every original element is shuffled to the front,
/// the mask is rearranged to match, and a non-truncating masked store of the
/// memory type is emitted.
31347 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
31348 const X86Subtarget &Subtarget) {
31349 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
31351 if (Mst->isCompressingStore())
// Non-truncating stores only get the single-true-element reduction.
31354 if (!Mst->isTruncatingStore())
31355 return reduceMaskedStoreToScalarStore(Mst, DAG);
31357 // Resolve truncating stores.
31358 EVT VT = Mst->getValue().getValueType();
31359 unsigned NumElems = VT.getVectorNumElements();
31360 EVT StVT = Mst->getMemoryVT();
31363 assert(StVT != VT && "Cannot truncate to the same type");
// FromSz is the element width of the stored value, ToSz the width in memory.
31364 unsigned FromSz = VT.getScalarSizeInBits();
31365 unsigned ToSz = StVT.getScalarSizeInBits();
31367 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31369 // The truncating store is legal in some cases. For example
31370 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31371 // are designated for truncate store.
31372 // In this case we don't need any further transformations.
31373 if (TLI.isTruncStoreLegal(VT, StVT))
31376 // From/To sizes and ElemCount must be pow of two.
31377 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31378 "Unexpected size for truncating masked store");
31379 // We are going to use the original vector elt for storing.
31380 // Accumulated smaller vector elements must be a multiple of the store size.
31381 assert (((NumElems * FromSz) % ToSz) == 0 &&
31382 "Unexpected ratio for truncating masked store");
// Number of memory-sized elements that fit in one source element.
31384 unsigned SizeRatio = FromSz / ToSz;
31385 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31387 // Create a type on which we perform the shuffle.
// Same total bit width as VT, but made of memory-sized elements.
31388 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31389 StVT.getScalarType(), NumElems*SizeRatio);
31391 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31393 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
// Lane i of the shuffle result takes wide lane i * SizeRatio; every other lane
// keeps the initial -1 index.
31394 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31395 for (unsigned i = 0; i != NumElems; ++i)
31396 ShuffleVec[i] = i * SizeRatio;
31398 // Can't shuffle using an illegal type.
31399 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31400 "WideVecVT should be legal");
31402 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31403 DAG.getUNDEF(WideVecVT),
31407 SDValue Mask = Mst->getMask();
31408 if (Mask.getValueType() == VT) {
31409 // Mask and original value have the same type.
31410 NewMask = DAG.getBitcast(WideVecVT, Mask);
// ShuffleVec is reused for the mask: the first NumElems lanes pick wide lane
// i * SizeRatio, and the remaining lanes use index NumElems * SizeRatio, i.e.
// element 0 of the second shuffle operand, which is the zero constant vector.
31411 for (unsigned i = 0; i != NumElems; ++i)
31412 ShuffleVec[i] = i * SizeRatio;
31413 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
31414 ShuffleVec[i] = NumElems*SizeRatio;
31415 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31416 DAG.getConstant(0, dl, WideVecVT),
// Otherwise the mask must be a vector of i1; it is widened by concatenating
// NumConcat pieces of the original mask type into one CONCAT_VECTORS node.
31419 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31420 unsigned WidenNumElts = NumElems*SizeRatio;
31421 unsigned MaskNumElts = VT.getVectorNumElements();
31422 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31425 unsigned NumConcat = WidenNumElts / MaskNumElts;
31426 SmallVector<SDValue, 16> Ops(NumConcat);
// All-zero vector of the original mask type, available as padding.
31427 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31429 for (unsigned i = 1; i != NumConcat; ++i)
31432 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
// Emit the replacement masked store of the packed value with memory type StVT;
// the final 'false' argument is passed where the original store was truncating.
31435 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
31436 Mst->getBasePtr(), NewMask, StVT,
31437 Mst->getMemOperand(), false);
/// Target DAG combine for ordinary store nodes. The visible transforms are:
///  1. A 256-bit vector store is split into two 128-bit stores (see the
///     Sandy Bridge comment below).
///  2. A truncating vector store is either replaced by a normal store of an
///     X86ISD::AVG result, or packed with a shuffle and written out with the
///     widest legal scalar store type.
///  3. A 64-bit store of a loaded value (vector type, or i64 on a 32-bit target
///     where f64 is usable) is re-typed as an i64/f64 load/store pair, or split
///     into two i32 load/store pairs.
///  4. An i64 store of an EXTRACT_VECTOR_ELT on a 32-bit target where f64 is
///     usable is rewritten as an f64 extract and store.
31440 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
31441 const X86Subtarget &Subtarget) {
31442 StoreSDNode *St = cast<StoreSDNode>(N);
// VT is the type of the value being stored, StVT the type written to memory.
31443 EVT VT = St->getValue().getValueType();
31444 EVT StVT = St->getMemoryVT();
31446 SDValue StoredVal = St->getOperand(1);
31447 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31449 // If we are saving a concatenation of two XMM registers and 32-byte stores
31450 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
31452 unsigned AddressSpace = St->getAddressSpace();
31453 unsigned Alignment = St->getAlignment();
// Only non-truncating 256-bit stores are considered; allowsMemoryAccess
// reports through 'Fast' whether the access is fast.
31454 if (VT.is256BitVector() && StVT == VT &&
31455 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
31456 AddressSpace, Alignment, &Fast) &&
31458 unsigned NumElems = VT.getVectorNumElements();
// Lower half starts at element 0, upper half at element NumElems / 2.
31462 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
31463 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
// The upper half is written 16 bytes past the original address.
31465 SDValue Ptr0 = St->getBasePtr();
31466 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
31469 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
31470 Alignment, St->getMemOperand()->getFlags());
// NOTE(review): this store targets Ptr0 + 16 but reuses St->getPointerInfo()
// with no offset, unlike the split i32 stores further down which use
// getWithOffset(4) - confirm this is intended.
31472 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
31473 std::min(16U, Alignment), St->getMemOperand()->getFlags());
// Both halves hang off the original chain and are joined by a TokenFactor.
31474 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
31477 // Optimize trunc store (of multiple scalars) to shuffle and store.
31478 // First, pack all of the elements in one place. Next, store to memory
31479 // in fewer chunks.
31480 if (St->isTruncatingStore() && VT.isVector()) {
31481 // Check if we can detect an AVG pattern from the truncation. If yes,
31482 // replace the trunc store by a normal store with the result of X86ISD::AVG
31484 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
31486 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
31487 St->getPointerInfo(), St->getAlignment(),
31488 St->getMemOperand()->getFlags());
31490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31491 unsigned NumElems = VT.getVectorNumElements();
31492 assert(StVT != VT && "Cannot truncate to the same type");
// FromSz is the element width of the stored value, ToSz the width in memory.
31493 unsigned FromSz = VT.getScalarSizeInBits();
31494 unsigned ToSz = StVT.getScalarSizeInBits();
31496 // The truncating store is legal in some cases. For example
31497 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31498 // are designated for truncate store.
31499 // In this case we don't need any further transformations.
31500 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
31503 // From, To sizes and ElemCount must be pow of two
31504 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
31505 // We are going to use the original vector elt for storing.
31506 // Accumulated smaller vector elements must be a multiple of the store size.
31507 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
// Number of memory-sized elements that fit in one source element.
31509 unsigned SizeRatio = FromSz / ToSz;
31511 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31513 // Create a type on which we perform the shuffle
// Same total bit width as VT, but made of memory-sized elements.
31514 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31515 StVT.getScalarType(), NumElems*SizeRatio);
31517 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31519 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
// Lane i of the shuffle result takes wide lane i * SizeRatio; every other lane
// keeps the initial -1 index.
31520 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
31521 for (unsigned i = 0; i != NumElems; ++i)
31522 ShuffleVec[i] = i * SizeRatio;
31524 // Can't shuffle using an illegal type.
31525 if (!TLI.isTypeLegal(WideVecVT))
31528 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31529 DAG.getUNDEF(WideVecVT),
31531 // At this point all of the data is stored at the bottom of the
31532 // register. We now need to save it to mem.
31534 // Find the largest store unit
// Candidates are legal integer types no wider than the NumElems * ToSz bits of
// packed data; i8 is the starting value.
31535 MVT StoreType = MVT::i8;
31536 for (MVT Tp : MVT::integer_valuetypes()) {
31537 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
31541 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
31542 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
31543 (64 <= NumElems * ToSz))
31544 StoreType = MVT::f64;
31546 // Bitcast the original vector into a vector of store-size units
31547 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
31548 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
31549 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
31550 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
31551 SmallVector<SDValue, 8> Chains;
31552 SDValue Ptr = St->getBasePtr();
31554 // Perform one or more big stores into memory.
// The trip count is the packed data size divided by the store unit size.
31555 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
31556 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
31557 StoreType, ShuffWide,
31558 DAG.getIntPtrConstant(i, dl));
// NOTE(review): Ptr advances by the store size every iteration, but each store
// reuses St->getPointerInfo() and St->getAlignment() unchanged - confirm.
31560 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
31561 St->getAlignment(), St->getMemOperand()->getFlags());
31562 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
31563 Chains.push_back(Ch);
// All partial stores are joined by a single TokenFactor.
31566 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
31569 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
31570 // the FP state in cases where an emms may be missing.
31571 // A preferable solution to the general problem is to figure out the right
31572 // places to insert EMMS. This qualifies as a quick hack.
31574 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
31575 if (VT.getSizeInBits() != 64)
31578 const Function *F = DAG.getMachineFunction().getFunction();
31579 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
// f64 is considered usable only with hardware float, no 'noimplicitfloat'
// function attribute, and SSE2.
31581 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
// Candidate: a non-volatile store of a non-volatile load, where the store's
// chain has a single use and the value is either a vector or an i64 on a
// 32-bit target with usable f64.
31582 if ((VT.isVector() ||
31583 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
31584 isa<LoadSDNode>(St->getValue()) &&
31585 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
31586 St->getChain().hasOneUse() && !St->isVolatile()) {
31587 SDNode* LdVal = St->getValue().getNode();
31588 LoadSDNode *Ld = nullptr;
// Stays -1 unless the load is found as an operand of a TokenFactor chain.
31589 int TokenFactorIndex = -1;
31590 SmallVector<SDValue, 8> Ops;
31591 SDNode* ChainVal = St->getChain().getNode();
31592 // Must be a store of a load. We currently handle two cases: the load
31593 // is a direct child, and it's under an intervening TokenFactor. It is
31594 // possible to dig deeper under nested TokenFactors.
31595 if (ChainVal == LdVal)
31596 Ld = cast<LoadSDNode>(St->getChain());
31597 else if (St->getValue().hasOneUse() &&
31598 ChainVal->getOpcode() == ISD::TokenFactor) {
// Scan the TokenFactor operands: remember where the load sits and collect
// operands into Ops for rebuilding the TokenFactor later.
31599 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
31600 if (ChainVal->getOperand(i).getNode() == LdVal) {
31601 TokenFactorIndex = i;
31602 Ld = cast<LoadSDNode>(St->getValue());
31604 Ops.push_back(ChainVal->getOperand(i));
31608 if (!Ld || !ISD::isNormalLoad(Ld))
31611 // If this is not the MMX case, i.e. we are just turning i64 load/store
31612 // into f64 load/store, avoid the transformation if there are multiple
31613 // uses of the loaded value.
31614 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
31619 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
31620 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
31622 if (Subtarget.is64Bit() || F64IsLegal) {
// i64 on 64-bit targets, f64 otherwise.
31623 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
31624 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
31625 Ld->getPointerInfo(), Ld->getAlignment(),
31626 Ld->getMemOperand()->getFlags());
// Result #1 of the new load is its chain.
31627 SDValue NewChain = NewLd.getValue(1);
// When the old load was reached through a TokenFactor, rebuild that
// TokenFactor from the collected operands plus the new load's chain.
31628 if (TokenFactorIndex >= 0) {
31629 Ops.push_back(NewChain);
31630 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
31632 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
31633 St->getPointerInfo(), St->getAlignment(),
31634 St->getMemOperand()->getFlags());
31637 // Otherwise, lower to two pairs of 32-bit loads / stores.
// Low half at the original address, high half 4 bytes above it.
31638 SDValue LoAddr = Ld->getBasePtr();
31639 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
31641 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
31642 Ld->getPointerInfo(), Ld->getAlignment(),
31643 Ld->getMemOperand()->getFlags());
// The high load uses pointer info offset by 4 and an alignment limited by
// MinAlign(original alignment, 4).
31644 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
31645 Ld->getPointerInfo().getWithOffset(4),
31646 MinAlign(Ld->getAlignment(), 4),
31647 Ld->getMemOperand()->getFlags());
// NOTE(review): without a TokenFactor only LoLd's chain is used below, and in
// the TokenFactor case LoLd/HiLd themselves are pushed, whereas the single-load
// path above pushes NewLd.getValue(1) - confirm the chain results are what
// ends up in Ops.
31649 SDValue NewChain = LoLd.getValue(1);
31650 if (TokenFactorIndex >= 0) {
31651 Ops.push_back(LoLd);
31652 Ops.push_back(HiLd);
31653 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
// Re-point the address pair at the store's destination.
31656 LoAddr = St->getBasePtr();
31657 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
31660 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
31661 St->getAlignment(), St->getMemOperand()->getFlags());
31662 SDValue HiSt = DAG.getStore(
31663 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
31664 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
// The two half stores are joined by a TokenFactor.
31665 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
31668 // This is similar to the above case, but here we handle a scalar 64-bit
31669 // integer store that is extracted from a vector on a 32-bit target.
31670 // If we have SSE2, then we can treat it like a floating-point double
31671 // to get past legalization. The execution dependencies fixup pass will
31672 // choose the optimal machine instruction for the store if this really is
31673 // an integer or v2f32 rather than an f64.
31674 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
31675 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
31676 SDValue OldExtract = St->getOperand(1);
31677 SDValue ExtOp0 = OldExtract.getOperand(0);
// View the source vector as f64 elements: VecSize / 64 lanes.
31678 unsigned VecSize = ExtOp0.getValueSizeInBits();
31679 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
31680 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
// The original extract index is reused unchanged on the f64 vector.
31681 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
31682 BitCast, OldExtract.getOperand(1));
31683 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
31684 St->getPointerInfo(), St->getAlignment(),
31685 St->getMemOperand()->getFlags());
31691 /// Return 'true' if this vector operation is "horizontal"
31692 /// and return the operands for the horizontal operation in LHS and RHS. A
31693 /// horizontal operation performs the binary operation on successive elements
31694 /// of its first operand, then on successive elements of its second operand,
31695 /// returning the resulting values in a vector. For example, if
31696 /// A = < float a0, float a1, float a2, float a3 >
31698 /// B = < float b0, float b1, float b2, float b3 >
31699 /// then the result of doing a horizontal operation on A and B is
31700 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
31701 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
31702 /// A horizontal-op B, for some already available A and B, and if so then LHS is
31703 /// set to A, RHS to B, and the routine returns 'true'.
31704 /// Note that the binary operation should have the property that if one of the
31705 /// operands is UNDEF then the result is UNDEF.
/// LHS and RHS are in/out parameters. When IsCommutative is set, each pair of
/// successive elements may also appear with LHS and RHS swapped.
31706 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
31707 // Look for the following pattern: if
31708 // A = < float a0, float a1, float a2, float a3 >
31709 // B = < float b0, float b1, float b2, float b3 >
31711 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
31712 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
31713 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
31714 // which is A horizontal-op B.
31716 // At least one of the operands should be a vector shuffle.
31717 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
31718 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
31721 MVT VT = LHS.getSimpleValueType();
31723 assert((VT.is128BitVector() || VT.is256BitVector()) &&
31724 "Unsupported vector type for horizontal add/sub");
31726 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
31727 // operate independently on 128-bit lanes.
31728 unsigned NumElts = VT.getVectorNumElements();
31729 unsigned NumLanes = VT.getSizeInBits()/128;
31730 unsigned NumLaneElts = NumElts / NumLanes;
31731 assert((NumLaneElts % 2 == 0) &&
31732 "Vector type should have an even number of elements in each lane");
// Half of each result lane comes from the first source, half from the second.
31733 unsigned HalfLaneElts = NumLaneElts/2;
31735 // View LHS in the form
31736 // LHS = VECTOR_SHUFFLE A, B, LMask
31737 // If LHS is not a shuffle then pretend it is the shuffle
31738 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
31739 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
31742 SmallVector<int, 16> LMask(NumElts);
31743 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
// Undef shuffle operands are left as default-initialized SDValues.
31744 if (!LHS.getOperand(0).isUndef())
31745 A = LHS.getOperand(0);
31746 if (!LHS.getOperand(1).isUndef())
31747 B = LHS.getOperand(1);
31748 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
31749 std::copy(Mask.begin(), Mask.end(), LMask.begin());
// Non-shuffle LHS: handled as described in the comment block above.
31751 if (!LHS.isUndef())
31753 for (unsigned i = 0; i != NumElts; ++i)
31757 // Likewise, view RHS in the form
31758 // RHS = VECTOR_SHUFFLE C, D, RMask
31760 SmallVector<int, 16> RMask(NumElts);
31761 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
31762 if (!RHS.getOperand(0).isUndef())
31763 C = RHS.getOperand(0);
31764 if (!RHS.getOperand(1).isUndef())
31765 D = RHS.getOperand(1);
31766 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
31767 std::copy(Mask.begin(), Mask.end(), RMask.begin());
31769 if (!RHS.isUndef())
31771 for (unsigned i = 0; i != NumElts; ++i)
31775 // Check that the shuffles are both shuffling the same vectors.
// Either the same order (A,B) == (C,D) or the swapped order (A,B) == (D,C).
31776 if (!(A == C && B == D) && !(A == D && B == C))
31779 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
31780 if (!A.getNode() && !B.getNode())
31783 // If A and B occur in reverse order in RHS, then "swap" them (which means
31784 // rewriting the mask).
31786 ShuffleVectorSDNode::commuteMask(RMask);
31788 // At this point LHS and RHS are equivalent to
31789 // LHS = VECTOR_SHUFFLE A, B, LMask
31790 // RHS = VECTOR_SHUFFLE A, B, RMask
31791 // Check that the masks correspond to performing a horizontal operation.
// 'l' is the first element of the current 128-bit lane, 'i' the position
// within that lane.
31792 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
31793 for (unsigned i = 0; i != NumLaneElts; ++i) {
31794 int LIdx = LMask[i+l], RIdx = RMask[i+l];
31796 // Ignore any UNDEF components.
// A component also counts as undef when it refers to a source (A for indices
// below NumElts, B for indices at or above NumElts) that is itself undef.
31797 if (LIdx < 0 || RIdx < 0 ||
31798 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
31799 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
31802 // Check that successive elements are being operated on. If not, this is
31803 // not a horizontal operation.
// Src selects A (0) or B (1); B's elements start at mask index NumElts.
// Index is the even element of the expected pair, Index + 1 its successor.
31804 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
31805 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
31806 if (!(LIdx == Index && RIdx == Index + 1) &&
31807 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
31812 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
31813 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
31817 /// Do target-specific dag combines on floating-point adds/subs.
/// The visible combine turns an FADD/FSUB whose operands form a horizontal
/// pattern into X86ISD::FHADD / X86ISD::FHSUB.
31818 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
31819 const X86Subtarget &Subtarget) {
31820 EVT VT = N->getValueType(0);
31821 SDValue LHS = N->getOperand(0);
31822 SDValue RHS = N->getOperand(1);
31823 bool IsFadd = N->getOpcode() == ISD::FADD;
31824 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
31826 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
// 128-bit float vectors require SSE3, 256-bit ones require hasFp256().
// isHorizontalBinOp rewrites LHS/RHS in place on success; IsFadd is passed as
// its IsCommutative argument.
31827 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
31828 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
31829 isHorizontalBinOp(LHS, RHS, IsFadd)) {
31830 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
31831 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
31836 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
31838 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
/// N must be an ISD::TRUNCATE node; the binary operation is its operand.
31839 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
31840 const X86Subtarget &Subtarget,
31842 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
31843 SDValue Src = N->getOperand(0);
31844 unsigned Opcode = Src.getOpcode();
31845 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// VT is the narrow (truncated) type, SrcVT the wide type of the operation.
31847 EVT VT = N->getValueType(0);
31848 EVT SrcVT = Src.getValueType();
// Predicate deciding whether truncating both operands is cheap.
31850 auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
31851 // TODO: Add extra cases where we can truncate both inputs for the
31852 // cost of one (or none).
31853 // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
// Look through single-use bitcasts and accept the pair when either side is a
// build vector of constants.
31857 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
31858 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
31859 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
31860 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
// Builds Opcode(TRUNC(N0), TRUNC(N1)) in the narrow type VT.
31863 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
31864 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
31865 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
31866 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
31869 // Don't combine if the operation has other uses.
31870 if (!N->isOnlyUserOf(Src.getNode()))
31873 // Only support vector truncation for now.
31874 // TODO: i64 scalar math would benefit as well.
31875 if (!VT.isVector())
31878 // In most cases it's only worth pre-truncating if we're only facing the cost
31879 // of one truncation.
31880 // i.e. if one of the inputs will constant fold or the input is repeated.
// This case accepts the narrow operation when it is legal or promotable.
31885 SDValue Op0 = Src.getOperand(0);
31886 SDValue Op1 = Src.getOperand(1);
31887 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
31888 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31889 return TruncateArithmetic(Op0, Op1);
31894 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
31895 // better to truncate if we have the chance.
// Applies when the wide i64 operation is not legal but the narrow one is; no
// operand-cost check is made here.
31896 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
31897 !TLI.isOperationLegal(Opcode, SrcVT))
31898 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
// This case requires the narrow operation to be strictly legal.
31901 SDValue Op0 = Src.getOperand(0);
31902 SDValue Op1 = Src.getOperand(1);
31903 if (TLI.isOperationLegal(Opcode, VT) &&
31904 IsRepeatedOpOrOneUseConstant(Op0, Op1))
31905 return TruncateArithmetic(Op0, Op1);
31913 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
/// Regs holds the 128-bit input pieces (v4i32 or v2i64 per the assert) and is
/// modified in place; N supplies the result type.
31915 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
31916 SmallVector<SDValue, 8> &Regs) {
31917 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
31918 Regs[0].getValueType() == MVT::v2i64));
31919 EVT OutVT = N->getValueType(0);
31920 EVT OutSVT = OutVT.getVectorElementType();
31921 EVT InVT = Regs[0].getValueType();
31922 EVT InSVT = InVT.getVectorElementType();
31925 // First, use mask to unset all bits that won't appear in the result.
31926 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
31927 "OutSVT can only be either i8 or i16.");
// The mask keeps the low OutSVT-sized bits of every input element.
// NOTE(review): presumably this keeps the saturating pack from clamping
// values - confirm against the PACKUS semantics.
31929 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
31930 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
31931 for (auto &Reg : Regs)
31932 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
// Pick the pack step by output element type: v8i16 -> v16i8 for i8 results,
// v4i32 -> v8i16 otherwise.
31934 MVT UnpackedVT, PackedVT;
31935 if (OutSVT == MVT::i8) {
31936 UnpackedVT = MVT::v8i16;
31937 PackedVT = MVT::v16i8;
31939 UnpackedVT = MVT::v4i32;
31940 PackedVT = MVT::v8i16;
31943 // In each iteration, truncate the type by a half size.
// j doubles up to the input/output element size ratio while the number of live
// registers (RegNum) halves; pairs of registers are packed into one.
31944 auto RegNum = Regs.size();
31945 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
31946 j < e; j *= 2, RegNum /= 2) {
31947 for (unsigned i = 0; i < RegNum; i++)
31948 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
31949 for (unsigned i = 0; i < RegNum / 2; i++)
31950 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
31954 // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
31955 // then extract a subvector as the result since v8i8 is not a legal type.
31956 if (OutVT == MVT::v8i8) {
// Regs[0] is packed with itself, then the subvector at index 0 is extracted.
31957 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
31958 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
31959 DAG.getIntPtrConstant(0, DL));
// More than one register left: drop the stale tail and concatenate the rest.
31961 } else if (RegNum > 1) {
31962 Regs.resize(RegNum);
31963 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
31968 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
/// Regs holds the v4i32 input pieces and is modified in place; N supplies the
/// result type.
31970 combineVectorTruncationWithPACKSS(SDNode *N, SelectionDAG &DAG,
31971 SmallVector<SDValue, 8> &Regs) {
31972 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
31973 EVT OutVT = N->getValueType(0);
31976 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
// The shift pair leaves every i32 lane holding the sign-extension of its low
// 16 bits.
31977 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
31978 for (auto &Reg : Regs) {
31979 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt, DAG);
31980 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt, DAG);
// Pack neighbouring register pairs into v8i16; the results land in the first
// half of Regs.
31983 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
31984 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
// With more than two inputs there are several packed registers: drop the stale
// tail and concatenate what is left.
31987 if (Regs.size() > 2) {
31988 Regs.resize(Regs.size() / 2);
31989 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
31994 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
31995 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
31996 /// legalization the truncation will be translated into a BUILD_VECTOR with each
31997 /// element that is extracted from a vector and then truncated, and it is
31998 /// difficult to do this optimization based on them.
31999 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
32000 const X86Subtarget &Subtarget) {
32001 EVT OutVT = N->getValueType(0);
32002 if (!OutVT.isVector())
32005 SDValue In = N->getOperand(0);
32006 if (!In.getValueType().isSimple())
32009 EVT InVT = In.getValueType();
32010 unsigned NumElems = OutVT.getVectorNumElements();
32012 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
32013 // SSE2, and we need to take care of it specially.
32014 // AVX512 provides vpmovdb.
32015 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
32018 EVT OutSVT = OutVT.getVectorElementType();
32019 EVT InSVT = InVT.getVectorElementType();
32020 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
32021 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
32025 // SSSE3's pshufb results in fewer instructions in the cases below.
32026 if (Subtarget.hasSSSE3() && NumElems == 8 &&
32027 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
32028 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
32033 // Split a long vector into vectors of legal type.
32034 unsigned RegNum = InVT.getSizeInBits() / 128;
32035 SmallVector<SDValue, 8> SubVec(RegNum);
32036 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
32037 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
32039 for (unsigned i = 0; i < RegNum; i++)
32040 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
32041 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
32043 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
32044 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
32045 // truncate 2 x v4i32 to v8i16.
32046 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
32047 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
32048 else if (InSVT == MVT::i32)
32049 return combineVectorTruncationWithPACKSS(N, DAG, SubVec);
32054 /// This function transforms vector truncation of 'all or none' bits values
32055 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
32056 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
32058 const X86Subtarget &Subtarget) {
32059 // Requires SSE2 but AVX512 has fast truncate.
32060 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
32063 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
32066 SDValue In = N->getOperand(0);
32067 if (!In.getValueType().isSimple())
32070 MVT VT = N->getValueType(0).getSimpleVT();
32071 MVT SVT = VT.getScalarType();
32073 MVT InVT = In.getValueType().getSimpleVT();
32074 MVT InSVT = InVT.getScalarType();
32076 // Use PACKSS if the input is a splatted sign bit.
32077 // e.g. Comparison result, sext_in_reg, etc.
32078 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
32079 if (NumSignBits != InSVT.getSizeInBits())
32082 // Check we have a truncation suited for PACKSS.
32083 if (!VT.is128BitVector() && !VT.is256BitVector())
32085 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
32087 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
32090 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
32093 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
32094 const X86Subtarget &Subtarget) {
32095 EVT VT = N->getValueType(0);
32096 SDValue Src = N->getOperand(0);
32099 // Attempt to pre-truncate inputs to arithmetic ops instead.
32100 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
32103 // Try to detect AVG pattern first.
32104 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
32107 // The bitcast source is a direct mmx result.
32108 // Detect bitcasts between i32 and x86mmx
32109 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
32110 SDValue BCSrc = Src.getOperand(0);
32111 if (BCSrc.getValueType() == MVT::x86mmx)
32112 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
32115 // Try to truncate extended sign bits with PACKSS.
32116 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
32119 return combineVectorTruncation(N, DAG, Subtarget);
32122 /// Returns the negated value if the node \p N flips sign of FP value.
32124 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
32125 /// AVX512F does not have FXOR, so FNEG is lowered as
32126 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
32127 /// In this case we go through all bitcasts.
32128 static SDValue isFNEG(SDNode *N) {
32129 if (N->getOpcode() == ISD::FNEG)
32130 return N->getOperand(0);
32132 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
32133 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
32136 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
32137 if (!Op1.getValueType().isFloatingPoint())
32140 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
32142 unsigned EltBits = Op1.getScalarValueSizeInBits();
32143 auto isSignBitValue = [&](const ConstantFP *C) {
32144 return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
32147 // There is more than one way to represent the same constant on
32148 // the different X86 targets. The type of the node may also depend on size.
32149 // - load scalar value and broadcast
32150 // - BUILD_VECTOR node
32151 // - load from a constant pool.
32152 // We check all variants here.
32153 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
32154 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
32155 if (isSignBitValue(cast<ConstantFP>(C)))
32158 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
32159 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
32160 if (isSignBitValue(CN->getConstantFPValue()))
32163 } else if (auto *C = getTargetConstantFromNode(Op1)) {
32164 if (C->getType()->isVectorTy()) {
32165 if (auto *SplatV = C->getSplatValue())
32166 if (isSignBitValue(cast<ConstantFP>(SplatV)))
32168 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
32169 if (isSignBitValue(FPConst))
32175 /// Do target-specific dag combines on floating point negations.
32176 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
32177 const X86Subtarget &Subtarget) {
32178 EVT OrigVT = N->getValueType(0);
32179 SDValue Arg = isFNEG(N);
32180 assert(Arg.getNode() && "N is expected to be an FNEG node");
32182 EVT VT = Arg.getValueType();
32183 EVT SVT = VT.getScalarType();
32186 // Let legalize expand this if it isn't a legal type yet.
32187 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32190 // If we're negating a FMUL node on a target with FMA, then we can avoid the
32191 // use of a constant by performing (-0 - A*B) instead.
32192 // FIXME: Check rounding control flags as well once it becomes available.
32193 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
32194 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
32195 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
32196 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
32197 Arg.getOperand(1), Zero);
32198 return DAG.getBitcast(OrigVT, NewNode);
32201 // If we're negating an FMA node, then we can adjust the
32202 // instruction to include the extra negation.
32203 unsigned NewOpcode = 0;
32204 if (Arg.hasOneUse()) {
32205 switch (Arg.getOpcode()) {
32206 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
32207 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
32208 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
32209 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
32210 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
32211 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
32212 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
32213 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
32214 // We can't handle scalar intrinsic node here because it would only
32215 // invert one element and not the whole vector. But we could try to handle
32216 // a negation of the lower element only.
32220 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
32221 Arg.getNode()->ops()));
32226 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
32227 const X86Subtarget &Subtarget) {
32228 MVT VT = N->getSimpleValueType(0);
32229 // If we have integer vector types available, use the integer opcodes.
32230 if (VT.isVector() && Subtarget.hasSSE2()) {
32233 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
32235 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
32236 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
32237 unsigned IntOpcode;
32238 switch (N->getOpcode()) {
32239 default: llvm_unreachable("Unexpected FP logic op");
32240 case X86ISD::FOR: IntOpcode = ISD::OR; break;
32241 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
32242 case X86ISD::FAND: IntOpcode = ISD::AND; break;
32243 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
32245 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
32246 return DAG.getBitcast(VT, IntOp);
32251 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
32252 TargetLowering::DAGCombinerInfo &DCI,
32253 const X86Subtarget &Subtarget) {
32254 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
32257 if (DCI.isBeforeLegalizeOps())
32260 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
32263 if (Subtarget.hasCMov())
32264 if (SDValue RV = combineIntegerAbs(N, DAG))
32267 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32271 return combineFneg(N, DAG, Subtarget);
32276 static bool isNullFPScalarOrVectorConst(SDValue V) {
32277 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
32280 /// If a value is a scalar FP zero or a vector FP zero (potentially including
32281 /// undefined elements), return a zero constant that may be used to fold away
32282 /// that value. In the case of a vector, the returned constant will not contain
32283 /// undefined elements even if the input parameter does. This makes it suitable
32284 /// to be used as a replacement operand with operations (eg, bitwise-and) where
32285 /// an undef should not propagate.
32286 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
32287 const X86Subtarget &Subtarget) {
32288 if (!isNullFPScalarOrVectorConst(V))
32291 if (V.getValueType().isVector())
32292 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
32297 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
32298 const X86Subtarget &Subtarget) {
32299 SDValue N0 = N->getOperand(0);
32300 SDValue N1 = N->getOperand(1);
32301 EVT VT = N->getValueType(0);
32304 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
32305 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
32306 (VT == MVT::f64 && Subtarget.hasSSE2())))
32309 auto isAllOnesConstantFP = [](SDValue V) {
32310 auto *C = dyn_cast<ConstantFPSDNode>(V);
32311 return C && C->getConstantFPValue()->isAllOnesValue();
32314 // fand (fxor X, -1), Y --> fandn X, Y
32315 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
32316 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
32318 // fand X, (fxor Y, -1) --> fandn Y, X
32319 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
32320 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
32325 /// Do target-specific dag combines on X86ISD::FAND nodes.
32326 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
32327 const X86Subtarget &Subtarget) {
32328 // FAND(0.0, x) -> 0.0
32329 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
32332 // FAND(x, 0.0) -> 0.0
32333 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32336 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
32339 return lowerX86FPLogicOp(N, DAG, Subtarget);
32342 /// Do target-specific dag combines on X86ISD::FANDN nodes.
32343 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
32344 const X86Subtarget &Subtarget) {
32345 // FANDN(0.0, x) -> x
32346 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32347 return N->getOperand(1);
32349 // FANDN(x, 0.0) -> 0.0
32350 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32353 return lowerX86FPLogicOp(N, DAG, Subtarget);
32356 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
32357 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
32358 const X86Subtarget &Subtarget) {
32359 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
32361 // F[X]OR(0.0, x) -> x
32362 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32363 return N->getOperand(1);
32365 // F[X]OR(x, 0.0) -> x
32366 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
32367 return N->getOperand(0);
32370 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
32373 return lowerX86FPLogicOp(N, DAG, Subtarget);
32376 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
32377 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
32378 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
32380 // Only perform optimizations if UnsafeFPMath is enabled.
32381 if (!DAG.getTarget().Options.UnsafeFPMath)
32384 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
32385 // into FMINC and FMAXC, which are Commutative operations.
32386 unsigned NewOp = 0;
32387 switch (N->getOpcode()) {
32388 default: llvm_unreachable("unknown opcode");
32389 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
32390 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
32393 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
32394 N->getOperand(0), N->getOperand(1));
32397 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
32398 const X86Subtarget &Subtarget) {
32399 if (Subtarget.useSoftFloat())
32402 // TODO: Check for global or instruction-level "nnan". In that case, we
32403 // should be able to lower to FMAX/FMIN alone.
32404 // TODO: If an operand is already known to be a NaN or not a NaN, this
32405 // should be an optional swap and FMAX/FMIN.
32407 EVT VT = N->getValueType(0);
32408 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
32409 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
32410 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
32413 // This takes at least 3 instructions, so favor a library call when operating
32414 // on a scalar and minimizing code size.
32415 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
32418 SDValue Op0 = N->getOperand(0);
32419 SDValue Op1 = N->getOperand(1);
32421 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
32422 DAG.getDataLayout(), *DAG.getContext(), VT);
32424 // There are 4 possibilities involving NaN inputs, and these are the required
32428 // ----------------
32429 // Num | Max | Op0 |
32430 // Op0 ----------------
32431 // NaN | Op1 | NaN |
32432 // ----------------
32434 // The SSE FP max/min instructions were not designed for this case, but rather
32436 // Min = Op1 < Op0 ? Op1 : Op0
32437 // Max = Op1 > Op0 ? Op1 : Op0
32439 // So they always return Op0 if either input is a NaN. However, we can still
32440 // use those instructions for fmaxnum by selecting away a NaN input.
32442 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
32443 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
32444 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
32445 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
32447 // If Op0 is a NaN, select Op1. Otherwise, select the min/max. If both operands
32448 // are NaN, the NaN value of Op1 is the result.
32449 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
32450 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
32453 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
32454 TargetLowering::DAGCombinerInfo &DCI) {
32455 // BT ignores high bits in the bit index operand.
32456 SDValue Op1 = N->getOperand(1);
32457 if (Op1.hasOneUse()) {
32458 unsigned BitWidth = Op1.getValueSizeInBits();
32459 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
32460 APInt KnownZero, KnownOne;
32461 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32462 !DCI.isBeforeLegalizeOps());
32463 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32464 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
32465 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
32466 DCI.CommitTargetLoweringOpt(TLO);
32471 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
32472 const X86Subtarget &Subtarget) {
32473 EVT VT = N->getValueType(0);
32474 if (!VT.isVector())
32477 SDValue N0 = N->getOperand(0);
32478 SDValue N1 = N->getOperand(1);
32479 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
32482 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on both
32483 // SSE and AVX2 since there is no sign-extended shift right
32484 // operation on a vector with 64-bit elements.
32485 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
32486 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
32487 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
32488 N0.getOpcode() == ISD::SIGN_EXTEND)) {
32489 SDValue N00 = N0.getOperand(0);
32491 // EXTLOAD has a better solution on AVX2,
32492 // it may be replaced with X86ISD::VSEXT node.
32493 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
32494 if (!ISD::isNormalLoad(N00.getNode()))
32497 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
32498 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
32500 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
32506 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
32507 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
32508 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
32509 /// opportunities to combine math ops, use an LEA, or use a complex addressing
32510 /// mode. This can eliminate extend, add, and shift instructions.
32511 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
32512 const X86Subtarget &Subtarget) {
32513 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
32514 Ext->getOpcode() != ISD::ZERO_EXTEND)
32517 // TODO: This should be valid for other integer types.
32518 EVT VT = Ext->getValueType(0);
32519 if (VT != MVT::i64)
32522 SDValue Add = Ext->getOperand(0);
32523 if (Add.getOpcode() != ISD::ADD)
32526 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
32527 bool NSW = Add->getFlags()->hasNoSignedWrap();
32528 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
32530 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
32532 if ((Sext && !NSW) || (!Sext && !NUW))
32535 // Having a constant operand to the 'add' ensures that we are not increasing
32536 // the instruction count because the constant is extended for free below.
32537 // A constant operand can also become the displacement field of an LEA.
32538 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
32542 // Don't make the 'add' bigger if there's no hope of combining it with some
32543 // other 'add' or 'shl' instruction.
32544 // TODO: It may be profitable to generate simpler LEA instructions in place
32545 // of single 'add' instructions, but the cost model for selecting an LEA
32546 // currently has a high threshold.
32547 bool HasLEAPotential = false;
32548 for (auto *User : Ext->uses()) {
32549 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
32550 HasLEAPotential = true;
32554 if (!HasLEAPotential)
32557 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
32558 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
32559 SDValue AddOp0 = Add.getOperand(0);
32560 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
32561 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
32563 // The wider add is guaranteed to not wrap because both operands are
32566 Flags.setNoSignedWrap(NSW);
32567 Flags.setNoUnsignedWrap(NUW);
32568 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
32571 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
32572 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
32573 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
32574 /// extends from AH (which we otherwise need to do contortions to access).
32575 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
32576 SDValue N0 = N->getOperand(0);
32577 auto OpcodeN = N->getOpcode();
32578 auto OpcodeN0 = N0.getOpcode();
32579 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
32580 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
32583 EVT VT = N->getValueType(0);
32584 EVT InVT = N0.getValueType();
32585 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
32588 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
32589 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
32590 : X86ISD::UDIVREM8_ZEXT_HREG;
32591 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
32593 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
32594 return R.getValue(1);
32597 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
32598 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
32599 /// with UNDEFs) of the input to vectors of the same size as the target type
32600 /// which then extends the lowest elements.
32601 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
32602 TargetLowering::DAGCombinerInfo &DCI,
32603 const X86Subtarget &Subtarget) {
32604 unsigned Opcode = N->getOpcode();
32605 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
32607 if (!DCI.isBeforeLegalizeOps())
32609 if (!Subtarget.hasSSE2())
32612 SDValue N0 = N->getOperand(0);
32613 EVT VT = N->getValueType(0);
32614 EVT SVT = VT.getScalarType();
32615 EVT InVT = N0.getValueType();
32616 EVT InSVT = InVT.getScalarType();
32618 // Input type must be a vector and we must be extending legal integer types.
32619 if (!VT.isVector())
32621 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
32623 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
32626 // On AVX2+ targets, if the input/output types are both legal then we will be
32627 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
32628 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
32629 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
32634 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
32635 EVT InVT = N.getValueType();
32636 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
32637 Size / InVT.getScalarSizeInBits());
32638 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
32639 DAG.getUNDEF(InVT));
32641 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
32644 // If target-size is less than 128-bits, extend to a type that would extend
32645 // to 128 bits, extend that and extract the original target vector.
32646 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
32647 unsigned Scale = 128 / VT.getSizeInBits();
32649 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
32650 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
32651 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
32652 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
32653 DAG.getIntPtrConstant(0, DL));
32656 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
32657 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
32658 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
32659 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
32660 (VT.is256BitVector() && Subtarget.hasInt256()) ||
32661 (VT.is512BitVector() && Subtarget.hasAVX512())) {
32662 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
32663 return Opcode == ISD::SIGN_EXTEND
32664 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
32665 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
32668 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
32669 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
32670 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
32671 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
32672 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
32674 SmallVector<SDValue, 8> Opnds;
32675 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
32676 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
32677 DAG.getIntPtrConstant(Offset, DL));
32678 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
32679 SrcVec = Opcode == ISD::SIGN_EXTEND
32680 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
32681 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
32682 Opnds.push_back(SrcVec);
32684 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
32687 // On pre-AVX2 targets, split into 128-bit nodes of
32688 // ISD::*_EXTEND_VECTOR_INREG.
32689 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
32690 return SplitAndExtendInReg(128);
32692 // On pre-AVX512 targets, split into 256-bit nodes of
32693 // ISD::*_EXTEND_VECTOR_INREG.
32694 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
32695 return SplitAndExtendInReg(256);
32700 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
32701 TargetLowering::DAGCombinerInfo &DCI,
32702 const X86Subtarget &Subtarget) {
32703 SDValue N0 = N->getOperand(0);
32704 EVT VT = N->getValueType(0);
32705 EVT InVT = N0.getValueType();
32708 if (SDValue DivRem8 = getDivRem8(N, DAG))
32711 if (!DCI.isBeforeLegalizeOps()) {
32712 if (InVT == MVT::i1) {
32713 SDValue Zero = DAG.getConstant(0, DL, VT);
32715 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
32716 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
32721 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
32724 if (Subtarget.hasAVX() && VT.is256BitVector())
32725 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
32728 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
32734 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
32735 const X86Subtarget &Subtarget) {
32737 EVT VT = N->getValueType(0);
32739 // Let legalize expand this if it isn't a legal type yet.
32740 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32743 EVT ScalarVT = VT.getScalarType();
32744 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
32747 SDValue A = N->getOperand(0);
32748 SDValue B = N->getOperand(1);
32749 SDValue C = N->getOperand(2);
32751 auto invertIfNegative = [](SDValue &V) {
32752 if (SDValue NegVal = isFNEG(V.getNode())) {
32759 // Do not convert the passthru input of scalar intrinsics.
32760 // FIXME: We could allow negations of the lower element only.
32761 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
32762 bool NegB = invertIfNegative(B);
32763 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
32765 // Negative multiplication when NegA xor NegB
32766 bool NegMul = (NegA != NegB);
32768 unsigned NewOpcode;
32770 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
32772 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
32775 if (N->getOpcode() == X86ISD::FMADD_RND) {
32776 switch (NewOpcode) {
32777 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
32778 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
32779 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
32780 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
32782 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
32783 switch (NewOpcode) {
32784 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
32785 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
32786 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
32787 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
32789 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
32790 switch (NewOpcode) {
32791 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
32792 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
32793 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
32794 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
32797 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
32798 "Unexpected opcode!");
32799 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
32802 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
32805 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
32806 TargetLowering::DAGCombinerInfo &DCI,
32807 const X86Subtarget &Subtarget) {
32808 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
32809 // (and (i32 x86isd::setcc_carry), 1)
32810 // This eliminates the zext. This transformation is necessary because
32811 // ISD::SETCC is always legalized to i8.
32813 SDValue N0 = N->getOperand(0);
32814 EVT VT = N->getValueType(0);
32816 if (N0.getOpcode() == ISD::AND &&
32818 N0.getOperand(0).hasOneUse()) {
32819 SDValue N00 = N0.getOperand(0);
32820 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32821 if (!isOneConstant(N0.getOperand(1)))
32823 return DAG.getNode(ISD::AND, dl, VT,
32824 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
32825 N00.getOperand(0), N00.getOperand(1)),
32826 DAG.getConstant(1, dl, VT));
32830 if (N0.getOpcode() == ISD::TRUNCATE &&
32832 N0.getOperand(0).hasOneUse()) {
32833 SDValue N00 = N0.getOperand(0);
32834 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
32835 return DAG.getNode(ISD::AND, dl, VT,
32836 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
32837 N00.getOperand(0), N00.getOperand(1)),
32838 DAG.getConstant(1, dl, VT));
32842 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
32845 if (VT.is256BitVector())
32846 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
32849 if (SDValue DivRem8 = getDivRem8(N, DAG))
32852 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
32855 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
32861 /// Optimize x == -y --> x+y == 0
32862 /// x != -y --> x+y != 0
32863 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
32864 const X86Subtarget &Subtarget) {
32865 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
32866 SDValue LHS = N->getOperand(0);
32867 SDValue RHS = N->getOperand(1);
32868 EVT VT = N->getValueType(0);
32871 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
32872 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
32873 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
32874 LHS.getOperand(1));
32875 return DAG.getSetCC(DL, N->getValueType(0), addV,
32876 DAG.getConstant(0, DL, addV.getValueType()), CC);
32878 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
32879 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
32880 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
32881 RHS.getOperand(1));
32882 return DAG.getSetCC(DL, N->getValueType(0), addV,
32883 DAG.getConstant(0, DL, addV.getValueType()), CC);
32886 if (VT.getScalarType() == MVT::i1 &&
32887 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
32889 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
32890 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
32891 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
32893 if (!IsSEXT0 || !IsVZero1) {
32894 // Swap the operands and update the condition code.
32895 std::swap(LHS, RHS);
32896 CC = ISD::getSetCCSwappedOperands(CC);
32898 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
32899 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
32900 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
32903 if (IsSEXT0 && IsVZero1) {
32904 assert(VT == LHS.getOperand(0).getValueType() &&
32905 "Uexpected operand type");
32906 if (CC == ISD::SETGT)
32907 return DAG.getConstant(0, DL, VT);
32908 if (CC == ISD::SETLE)
32909 return DAG.getConstant(1, DL, VT);
32910 if (CC == ISD::SETEQ || CC == ISD::SETGE)
32911 return DAG.getNOT(DL, LHS.getOperand(0), VT);
32913 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
32914 "Unexpected condition code!");
32915 return LHS.getOperand(0);
32919 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
32920 // to avoid scalarization via legalization because v4i32 is not a legal type.
32921 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
32922 LHS.getValueType() == MVT::v4f32)
32923 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
// Combine for masked gather/scatter nodes (PerformDAGCombine routes
// ISD::MSCATTER here). If the mask (operand 2) is a SIGN_EXTEND_INREG, the
// node's operand list is updated in place so it uses the un-extended mask.
// NOTE(review): this listing appears to omit some lines (closing braces and
// the return statements) — confirm the return value against the full file.
32928 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
32930 // Gather and Scatter instructions use k-registers for masks. The type of
32931 // the masks is v*i1. So the mask will be truncated anyway.
32932 // The SIGN_EXTEND_INREG may be dropped.
32933 SDValue Mask = N->getOperand(2);
32934 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
// Copy all operands, then substitute the source of the sign-extension for the
// mask slot.
32935 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
32936 NewOps[2] = Mask.getOperand(0);
32937 DAG.UpdateNodeOperands(N, NewOps);
32942 // Helper function of performSETCCCombine. It is to materialize "setb reg"
32943 // as "sbb reg,reg", since it can be extended without zext and produces
32944 // an all-ones bit which is more useful than 0/1 in some cases.
32945 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
32946 SelectionDAG &DAG, MVT VT) {
32948 return DAG.getNode(ISD::AND, DL, VT,
32949 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
32950 DAG.getConstant(X86::COND_B, DL, MVT::i8),
32952 DAG.getConstant(1, DL, VT));
32953 assert (VT == MVT::i1 && "Unexpected type for SECCC node");
32954 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
32955 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
32956 DAG.getConstant(X86::COND_B, DL, MVT::i8),
// Combine for X86ISD::SETCC nodes. Operand 0 is the X86 condition code and
// operand 1 is the EFLAGS input. The visible folds are:
//   * COND_A on a single-use integer X86ISD::SUB with a non-constant RHS:
//     swap the SUB operands and materialize the result as a COND_B carry test.
//   * COND_B: materialize via MaterializeSETB.
//   * otherwise: let combineSetCCEFLAGS simplify the flags/condition pair.
// NOTE(review): `DL` is used below but its declaration, the closing braces and
// the final return are not present in this listing — confirm against the full
// file.
32960 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
32961 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
32962 TargetLowering::DAGCombinerInfo &DCI,
32963 const X86Subtarget &Subtarget) {
32965 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
32966 SDValue EFLAGS = N->getOperand(1);
32968 if (CC == X86::COND_A) {
32969 // Try to convert COND_A into COND_B in an attempt to facilitate
32970 // materializing "setb reg".
32972 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
32973 // cannot take an immediate as its first operand.
32975 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
32976 EFLAGS.getValueType().isInteger() &&
32977 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
// Rebuild the SUB with its two operands exchanged, keeping the same result
// list, then pick the same result number the original EFLAGS value referred to.
32978 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
32979 EFLAGS.getNode()->getVTList(),
32980 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
32981 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
32982 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
32986 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
32987 // a zext and produces an all-ones bit which is more useful than 0/1 in some
32989 if (CC == X86::COND_B)
32990 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
32992 // Try to simplify the EFLAGS and condition code operands.
// CC is read again after the call, so combineSetCCEFLAGS presumably updates it
// — verify against its signature.
32993 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
32994 return getSETCC(CC, Flags, DL, DAG);
// Combine for X86ISD::BRCOND nodes. Operand 2 holds the X86 condition code and
// operand 3 the EFLAGS input; operands 0 and 1 are forwarded unchanged to the
// rebuilt node.
// NOTE(review): `DL` is used below but its declaration and the fall-through
// return are not present in this listing — confirm against the full file.
32999 /// Optimize branch condition evaluation.
33000 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
33001 TargetLowering::DAGCombinerInfo &DCI,
33002 const X86Subtarget &Subtarget) {
33004 SDValue EFLAGS = N->getOperand(3);
33005 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
33007 // Try to simplify the EFLAGS and condition code operands.
33008 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
33009 // RAUW them under us.
33010 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
// Operands 0 and 1 are re-read from N here rather than cached earlier, per the
// RAUW warning above.
33011 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
33012 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
33013 N->getOperand(1), Cond, Flags);
// Folds a unary op applied to (AND (SETCC ...), constant-build-vector) by
// applying the unary op to the constant instead and AND-ing the compare result
// with the bitcast of that new constant. Used by combineSIntToFP.
// NOTE(review): `DL`, the early-exit return statements and the final return
// are not present in this listing — confirm against the full file.
33019 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
33020 SelectionDAG &DAG) {
33021 // Take advantage of vector comparisons producing 0 or -1 in each lane to
33022 // optimize away operation when it's from a constant.
33024 // The general transformation is:
33025 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
33026 // AND(VECTOR_CMP(x,y), constant2)
33027 // constant2 = UNARYOP(constant)
33029 // Early exit if this isn't a vector operation, the operand of the
33030 // unary operation isn't a bitwise AND, or if the sizes of the operations
33031 // aren't the same.
33032 EVT VT = N->getValueType(0);
// The condition below also requires the AND's first operand to be an
// ISD::SETCC.
33033 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
33034 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
33035 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
33038 // Now check that the other operand of the AND is a constant. We could
33039 // make the transformation for non-constant splats as well, but it's unclear
33040 // that would be a benefit as it would not eliminate any operations, just
33041 // perform one more step in scalar code before moving to the vector unit.
33042 if (BuildVectorSDNode *BV =
33043 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
33044 // Bail out if the vector isn't a constant.
33045 if (!BV->isConstant())
33048 // Everything checks out. Build up the new and improved node.
33050 EVT IntVT = BV->getValueType(0);
33051 // Create a new constant of the appropriate type for the transformed
// SourceConst is the unary op (N's own opcode) applied to the constant vector.
33053 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
33054 // The AND node needs bitcasts to/from an integer vector type around it.
33055 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
33056 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
33057 N->getOperand(0)->getOperand(0), MaskConst);
33058 SDValue Res = DAG.getBitcast(VT, NewAnd);
// Combine for ISD::UINT_TO_FP.
//   * vXi8 / vXi16 inputs are zero-extended to vXi32 first; the conversion is
//     then emitted as UINT_TO_FP if that is legal for the widened type, and as
//     SINT_TO_FP otherwise (safe because the extended value is non-negative).
//   * If the sign bit of the input is known zero, the node is rewritten as
//     SINT_TO_FP.
// NOTE(review): the declaration of `dl`, closing braces and the final return
// are not present in this listing — confirm against the full file.
33065 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
33066 const X86Subtarget &Subtarget) {
33067 SDValue Op0 = N->getOperand(0);
33068 EVT VT = N->getValueType(0);
33069 EVT InVT = Op0.getValueType();
33070 EVT InSVT = InVT.getScalarType();
33071 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33073 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
33074 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
33075 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
// DstVT has the same element count as the input, with i32 elements.
33077 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33078 InVT.getVectorNumElements());
33079 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
33081 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
33082 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
33084 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33087 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
33088 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
33089 // the optimization here.
33090 if (DAG.SignBitIsZero(Op0))
33091 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
// Combine for ISD::SINT_TO_FP. Visible steps, in order:
//   1. Try combineVectorCompareAndMaskUnaryOp (constant-masked compares).
//   2. Sign-extend narrow vector inputs (i8, i16, or illegal-typed i1) to
//      vXi32 before converting.
//   3. Without AVX512DQ, truncate a >32-bit input to 32 bits when enough sign
//      bits are known that the truncation is lossless.
//   4. On 32-bit targets, turn a conversion of a plain, single-use,
//      non-volatile i64 load into an x87 FILD via BuildFILD.
// NOTE(review): the declarations of `dl`, several return statements and
// closing braces are not present in this listing — confirm against the full
// file.
33096 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
33097 const X86Subtarget &Subtarget) {
33098 // First try to optimize away the conversion entirely when it's
33099 // conditionally from a constant. Vectors only.
33100 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
33103 // Now move on to more general possibilities.
33104 SDValue Op0 = N->getOperand(0);
33105 EVT VT = N->getValueType(0);
33106 EVT InVT = Op0.getValueType();
33107 EVT InSVT = InVT.getScalarType();
33109 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
33110 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
33111 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
// The i1 case is only taken when the vXi1 input type is not legal.
33112 if (InVT.isVector() &&
33113 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
33114 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
33116 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33117 InVT.getVectorNumElements());
33118 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
33119 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33122 // Without AVX512DQ we only support i64 to float scalar conversion. For both
33123 // vectors and scalars, see if we know that the upper bits are all the sign
33124 // bit, in which case we can truncate the input to i32 and convert from that.
33125 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
33126 unsigned BitWidth = InVT.getScalarSizeInBits();
33127 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
// At least BitWidth-31 known sign bits means the value fits in a signed i32.
33128 if (NumSignBits >= (BitWidth - 31)) {
33129 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
33130 if (InVT.isVector())
33131 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
33132 InVT.getVectorNumElements());
33134 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
33135 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
33139 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
33140 // a 32-bit target where SSE doesn't support i64->FP operations.
33141 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
33142 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
33143 EVT LdVT = Ld->getValueType(0);
33145 // This transformation is not supported if the result type is f16 or f128.
33146 if (VT == MVT::f16 || VT == MVT::f128)
33149 if (!Ld->isVolatile() && !VT.isVector() &&
33150 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
33151 !Subtarget.is64Bit() && LdVT == MVT::i64) {
33152 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
33153 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
// Redirect users of the load's chain result (value 1) to the chain result of
// the node returned by BuildFILD.
33154 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
// Combine for X86ISD::ADC. When both addends are zero and the node's flags
// result (value 1) has no users, the sum is just the incoming carry, so the
// node is replaced via DCI.CombineTo by (SETCC_CARRY & 1) plus a constant-zero
// carry-out.
// NOTE(review): the declaration of `DL`, part of the SETCC_CARRY operand list
// and the fall-through return are not present in this listing — confirm
// against the full file.
33161 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
33162 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
33163 X86TargetLowering::DAGCombinerInfo &DCI) {
33164 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
33165 // the result is either zero or one (depending on the input carry bit).
33166 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
33167 if (X86::isZeroNode(N->getOperand(0)) &&
33168 X86::isZeroNode(N->getOperand(1)) &&
33169 // We don't have a good way to replace an EFLAGS use, so only do this when
// the EFLAGS result (value 1 of N) has no users.
33171 SDValue(N, 1).use_empty()) {
33173 EVT VT = N->getValueType(0);
// Replacement for the (unused) flags result: a zero constant of the same type.
33174 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
33175 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
33176 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
33177 DAG.getConstant(X86::COND_B, DL,
33180 DAG.getConstant(1, DL, VT));
33181 return DCI.CombineTo(N, Res1, CarryOut);
// Folds an add/sub of a zero-extended "X == 0" / "X != 0" test into a
// carry-consuming ADC/SBB. The compare against 0 is replaced with a compare
// against 1 (NewCmp), whose flags feed the ADC/SBB. Called from combineAdd and
// combineSub.
// Required shape (all single-use): zext(X86ISD::SETCC COND_E/COND_NE,
// (X86ISD::CMP X, 0)) with integer X.
// NOTE(review): the declaration of `DL`, the bail-out return statements and
// the last operand of the COND_NE node are not present in this listing —
// confirm against the full file.
33187 /// fold (add Y, (sete X, 0)) -> adc 0, Y
33188 /// (add Y, (setne X, 0)) -> sbb -1, Y
33189 /// (sub (sete X, 0), Y) -> sbb 0, Y
33190 /// (sub (setne X, 0), Y) -> adc -1, Y
33191 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
33194 // Look through ZExts.
// For SUB the extended test is taken from operand 1; for ADD from operand 0.
33195 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
33196 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
33199 SDValue SetCC = Ext.getOperand(0);
33200 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
33203 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
33204 if (CC != X86::COND_E && CC != X86::COND_NE)
33207 SDValue Cmp = SetCC.getOperand(1);
33208 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
33209 !X86::isZeroNode(Cmp.getOperand(1)) ||
33210 !Cmp.getOperand(0).getValueType().isInteger())
33213 SDValue CmpOp0 = Cmp.getOperand(0);
// Compare X against 1 instead of 0 (presumably so that the carry flag encodes
// X == 0, i.e. X <u 1 — verify against the CMP flag semantics).
33214 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
33215 DAG.getConstant(1, DL, CmpOp0.getValueType()));
// The operand of N that is not the extended test.
33217 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
33218 if (CC == X86::COND_NE)
33219 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
33220 DL, OtherVal.getValueType(), OtherVal,
33221 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
33223 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
33224 DL, OtherVal.getValueType(), OtherVal,
33225 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
// Recognizes a sum-of-absolute-differences reduction step rooted at a vector
// ADD and rewrites it to use a PSADBW node (built by createPSADBW) added into
// the reduction value. Called from combineAdd when the ADD carries the
// vector-reduction flag.
// NOTE(review): the declaration of `DL`, the bail-out returns, the RegSize
// assignments and the SelectOp/Phi assignments are not present in this listing
// — confirm against the full file.
33228 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
33229 const X86Subtarget &Subtarget) {
33231 EVT VT = N->getValueType(0);
33232 SDValue Op0 = N->getOperand(0);
33233 SDValue Op1 = N->getOperand(1);
33235 // TODO: There's nothing special about i32, any integer type above i16 should
33236 // work just as well.
33237 if (!VT.isVector() || !VT.isSimple() ||
33238 !(VT.getVectorElementType() == MVT::i32))
// RegSize defaults to 128 bits; the BWI and AVX2 branches presumably raise it
// (their bodies are not visible here).
33241 unsigned RegSize = 128;
33242 if (Subtarget.hasBWI())
33244 else if (Subtarget.hasAVX2())
33247 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
33248 // TODO: We should be able to handle larger vectors by splitting them before
33249 // feeding them into several SADs, and then reducing over those.
33250 if (VT.getSizeInBits() / 4 > RegSize)
33253 // We know N is a reduction add, which means one of its operands is a phi.
33254 // To match SAD, we need the other operand to be a vector select.
33255 SDValue SelectOp, Phi;
33256 if (Op0.getOpcode() == ISD::VSELECT) {
33259 } else if (Op1.getOpcode() == ISD::VSELECT) {
33265 // Check whether we have an abs-diff pattern feeding into the select.
// Op0 and Op1 are passed to createPSADBW right after this call, so
// detectZextAbsDiff presumably overwrites them with the two abs-diff inputs —
// verify against its signature.
33266 if(!detectZextAbsDiff(SelectOp, Op0, Op1))
33269 // SAD pattern detected. Now build a SAD instruction and an addition for
33270 // reduction. Note that the number of elements of the result of SAD is less
33271 // than the number of elements of its input. Therefore, we could only update
33272 // part of elements in the reduction vector.
33273 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
33275 // The output of PSADBW is a vector of i64.
33276 // We need to turn the vector of i64 into a vector of i32.
33277 // If the reduction vector is at least as wide as the psadbw result, just
33278 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
// ResVT: an i32 vector with the same total bit width as the PSADBW result.
33280 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
33281 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
33282 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
33284 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
33286 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
33287 // Update part of elements of the reduction vector. This is done by first
33288 // extracting a sub-vector from it, updating this sub-vector, and inserting
// the updated sub-vector back at index 0.
33290 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
33291 DAG.getIntPtrConstant(0, DL));
33292 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
33293 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
33294 DAG.getIntPtrConstant(0, DL));
33296 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
// Combine for ISD::ADD. Tries, in order:
//   1. the loop SAD pattern, when the node is flagged as a vector reduction;
//   2. forming X86ISD::HADD for v8i16/v4i32 (SSSE3) or v16i16/v8i32 (Int256)
//      when isHorizontalBinOp accepts the operands;
//   3. OptimizeConditionalInDecrement.
// NOTE(review): the body of the SAD `if` and the closing braces are not
// present in this listing — confirm against the full file.
33299 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
33300 const X86Subtarget &Subtarget) {
33301 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
33302 if (Flags->hasVectorReduction()) {
33303 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
33306 EVT VT = N->getValueType(0);
33307 SDValue Op0 = N->getOperand(0);
33308 SDValue Op1 = N->getOperand(1);
33310 // Try to synthesize horizontal adds from adds of shuffles.
// The HADD below is built from Op0/Op1 as they stand after the call, so
// isHorizontalBinOp presumably rewrites them to the shuffle inputs — verify
// against its signature.
33311 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33312 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33313 isHorizontalBinOp(Op0, Op1, true))
33314 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
33316 return OptimizeConditionalInDecrement(N, DAG);
33319 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
33320 const X86Subtarget &Subtarget) {
33321 SDValue Op0 = N->getOperand(0);
33322 SDValue Op1 = N->getOperand(1);
33324 // X86 can't encode an immediate LHS of a sub. See if we can push the
33325 // negation into a preceding instruction.
33326 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
33327 // If the RHS of the sub is a XOR with one use and a constant, invert the
33328 // immediate. Then add one to the LHS of the sub so we can turn
33329 // X-Y -> X+~Y+1, saving one register.
33330 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
33331 isa<ConstantSDNode>(Op1.getOperand(1))) {
33332 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
33333 EVT VT = Op0.getValueType();
33334 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
33336 DAG.getConstant(~XorC, SDLoc(Op1), VT));
33337 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
33338 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
33342 // Try to synthesize horizontal adds from adds of shuffles.
33343 EVT VT = N->getValueType(0);
33344 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33345 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33346 isHorizontalBinOp(Op0, Op1, true))
33347 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
33349 return OptimizeConditionalInDecrement(N, DAG);
// Combine for X86ISD::VSEXT / X86ISD::VZEXT. Visible folds:
//   1. Constant-fold the extension of a build vector of constants (undef
//      elements are tracked separately in `Undefs`).
//   2. (vzext (bitcast (vzext x))) -> a single vzext, when possible.
//   3. For vzext of scalar_to_vector(extract_vector_elt x, 0): bypass the
//      extract/insert round trip and extend a bitcast of x (or of its low
//      subvector) directly.
// NOTE(review): the declaration of `DL`, the undef-element handling inside the
// loop, some returns and closing braces are not present in this listing —
// confirm against the full file.
33352 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
33353 TargetLowering::DAGCombinerInfo &DCI,
33354 const X86Subtarget &Subtarget) {
33356 unsigned Opcode = N->getOpcode();
33357 MVT VT = N->getSimpleValueType(0);
33358 MVT SVT = VT.getVectorElementType();
33359 SDValue Op = N->getOperand(0);
33360 MVT OpVT = Op.getSimpleValueType();
33361 MVT OpEltVT = OpVT.getVectorElementType();
// Number of source bits the extension actually reads: one source element per
// result element.
33362 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
33364 // Perform any constant folding.
33365 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
33366 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
33367 unsigned NumDstElts = VT.getVectorNumElements();
33368 SmallBitVector Undefs(NumDstElts, false);
33369 SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
// Only the first NumDstElts source elements are visited.
33370 for (unsigned i = 0; i != NumDstElts; ++i) {
33371 SDValue OpElt = Op.getOperand(i);
33372 if (OpElt.getOpcode() == ISD::UNDEF) {
33376 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
// Zero- or sign-extend (or truncate) each constant to the result element width.
33377 Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
33378 : Cst.sextOrTrunc(SVT.getSizeInBits());
33380 return getConstVector(Vals, Undefs, VT, DAG, DL);
33383 // (vzext (bitcast (vzext (x)) -> (vzext x)
33384 // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
33385 SDValue V = peekThroughBitcasts(Op);
// `V != Op` means at least one bitcast was actually looked through.
33386 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
33387 MVT InnerVT = V.getSimpleValueType();
33388 MVT InnerEltVT = InnerVT.getVectorElementType();
33390 // If the element sizes match exactly, we can just do one larger vzext. This
33391 // is always an exact type match as vzext operates on integer types.
33392 if (OpEltVT == InnerEltVT) {
33393 assert(OpVT == InnerVT && "Types must match for vzext!");
33394 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
33397 // The only other way we can combine them is if only a single element of the
33398 // inner vzext is used in the input to the outer vzext.
33399 if (InnerEltVT.getSizeInBits() < InputBits)
33402 // In this case, the inner vzext is completely dead because we're going to
33403 // only look at bits inside of the low element. Just do the outer vzext on
33404 // a bitcast of the input to the inner.
33405 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
33408 // Check if we can bypass extracting and re-inserting an element of an input
33409 // vector. Essentially:
33410 // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
33411 // TODO: Add X86ISD::VSEXT support
// The extracted scalar must be exactly InputBits wide, i.e. it supplies all
// the bits the extension reads.
33412 if (Opcode == X86ISD::VZEXT &&
33413 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33414 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33415 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
33416 SDValue ExtractedV = V.getOperand(0);
33417 SDValue OrigV = ExtractedV.getOperand(0);
// Only handled when the extracted element index is 0.
33418 if (isNullConstant(ExtractedV.getOperand(1))) {
33419 MVT OrigVT = OrigV.getSimpleValueType();
33420 // Extract a subvector if necessary...
33421 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
33422 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
33423 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
33424 OrigVT.getVectorNumElements() / Ratio);
33425 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
33426 DAG.getIntPtrConstant(0, DL));
33428 Op = DAG.getBitcast(OpVT, OrigV);
33429 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
// Combine for X86ISD::LSUB. Operands are (chain, LHS, RHS). When RHS is the
// constant 1, the node is rebuilt as an X86ISD::LADD of -1 that reuses the
// original memory operand.
// NOTE(review): the declaration of `DL` and the bail-out return are not
// present in this listing — confirm against the full file.
33436 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
33437 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
33438 const X86Subtarget &Subtarget) {
33439 SDValue Chain = N->getOperand(0);
33440 SDValue LHS = N->getOperand(1);
33441 SDValue RHS = N->getOperand(2);
33442 MVT VT = RHS.getSimpleValueType();
33445 auto *C = dyn_cast<ConstantSDNode>(RHS);
33446 if (!C || C->getZExtValue() != 1)
33449 RHS = DAG.getConstant(-1, DL, VT);
33450 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
// The new node produces (i32, chain) and is given VT as its memory value type.
33451 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
33452 DAG.getVTList(MVT::i32, MVT::Other),
33453 {Chain, LHS, RHS}, VT, MMO);
// Combine for X86ISD::TESTM. When both operands are the same ISD::AND value,
// the AND is folded away and its two inputs are tested directly.
// NOTE(review): the declaration of `DL` and the bail-out return are not
// present in this listing — confirm against the full file.
33456 // TEST (AND a, b) ,(AND a, b) -> TEST a, b
33457 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
33458 SDValue Op0 = N->getOperand(0);
33459 SDValue Op1 = N->getOperand(1);
33461 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
33464 EVT VT = N->getValueType(0);
33467 return DAG.getNode(X86ISD::TESTM, DL, VT,
33468 Op0->getOperand(0), Op0->getOperand(1));
// Combine for X86ISD::PCMPEQ / X86ISD::PCMPGT. Comparing a value with itself
// folds to a constant: all-ones for PCMPEQ (x == x), all-zeros for PCMPGT
// (x > x).
// NOTE(review): the declaration of `DL` and the fall-through return are not
// present in this listing — confirm against the full file.
33471 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
33472 const X86Subtarget &Subtarget) {
33473 MVT VT = N->getSimpleValueType(0);
33476 if (N->getOperand(0) == N->getOperand(1)) {
33477 if (N->getOpcode() == X86ISD::PCMPEQ)
33478 return getOnesVector(VT, Subtarget, DAG, DL);
33479 if (N->getOpcode() == X86ISD::PCMPGT)
33480 return getZeroVector(VT, Subtarget, DAG, DL);
// Entry point for X86-specific DAG combines: dispatches on N's opcode to the
// matching combine* helper and returns whatever that helper returns.
// NOTE(review): several `case` labels (and the default/closing lines) are not
// present in this listing, so a `return` line below may also serve labels that
// are not shown — confirm against the full file before adding or moving cases.
33487 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
33488 DAGCombinerInfo &DCI) const {
33489 SelectionDAG &DAG = DCI.DAG;
33490 switch (N->getOpcode()) {
33492 case ISD::EXTRACT_VECTOR_ELT:
33493 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
33496 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
33497 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
33498 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
33499 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
33500 case ISD::SUB: return combineSub(N, DAG, Subtarget);
33501 case X86ISD::ADC: return combineADC(N, DAG, DCI);
33502 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
33505 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
33506 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
33507 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
33508 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
33509 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
33510 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
33511 case ISD::STORE: return combineStore(N, DAG, Subtarget);
33512 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
33513 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
33514 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
33516 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
33517 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
33518 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
33519 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
33520 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
33522 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
33524 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
33526 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
33527 case X86ISD::BT: return combineBT(N, DAG, DCI);
// ANY_EXTEND shares the ZERO_EXTEND handler.
33528 case ISD::ANY_EXTEND:
33529 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
33530 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
33531 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
// Generic SETCC and the target X86ISD::SETCC node have separate combines.
33532 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
33533 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
33534 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
33535 case X86ISD::VSHLI:
33536 case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
33537 case X86ISD::VSEXT:
33538 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
33539 case X86ISD::SHUFP: // Handle all target specific shuffles
33540 case X86ISD::INSERTPS:
33541 case X86ISD::PALIGNR:
33542 case X86ISD::VSHLDQ:
33543 case X86ISD::VSRLDQ:
33544 case X86ISD::BLENDI:
33545 case X86ISD::UNPCKH:
33546 case X86ISD::UNPCKL:
33547 case X86ISD::MOVHLPS:
33548 case X86ISD::MOVLHPS:
33549 case X86ISD::PSHUFB:
33550 case X86ISD::PSHUFD:
33551 case X86ISD::PSHUFHW:
33552 case X86ISD::PSHUFLW:
33553 case X86ISD::MOVSHDUP:
33554 case X86ISD::MOVSLDUP:
33555 case X86ISD::MOVDDUP:
33556 case X86ISD::MOVSS:
33557 case X86ISD::MOVSD:
33558 case X86ISD::VPPERM:
33559 case X86ISD::VPERMI:
33560 case X86ISD::VPERMV:
33561 case X86ISD::VPERMV3:
33562 case X86ISD::VPERMIV3:
33563 case X86ISD::VPERMIL2:
33564 case X86ISD::VPERMILPI:
33565 case X86ISD::VPERMILPV:
33566 case X86ISD::VPERM2X128:
33567 case X86ISD::VZEXT_MOVL:
33568 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
33569 case X86ISD::FMADD:
33570 case X86ISD::FMADD_RND:
33571 case X86ISD::FMADDS1_RND:
33572 case X86ISD::FMADDS3_RND:
33573 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
33575 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
33576 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
33577 case X86ISD::TESTM: return combineTestM(N, DAG);
33578 case X86ISD::PCMPEQ:
33579 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
// The visible code first checks that VT is legal, then special-cases i16
// (every other type takes the `VT != MVT::i16` early exit).
// NOTE(review): the return statements, the `switch` header and most of its
// case labels are not present in this listing, so the per-opcode answers for
// i16 cannot be read from here — confirm against the full file.
33585 /// Return true if the target has native support for the specified value type
33586 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
33587 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
33588 /// some i16 instructions are slow.
33589 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
33590 if (!isTypeLegal(VT))
33592 if (VT != MVT::i16)
33599 case ISD::SIGN_EXTEND:
33600 case ISD::ZERO_EXTEND:
33601 case ISD::ANY_EXTEND:
33614 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
33615 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
33616 /// we don't adjust the stack we clobber the first frame index.
33617 /// See X86InstrInfo::copyPhysReg.
//
// Returns true if any machine instruction referencing X86::EFLAGS in MF is a
// COPY.
33618 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
33619 MachineFunction *MF) const {
33620 const MachineRegisterInfo &MRI = MF->getRegInfo();
// Scan every instruction in the function's register info that mentions EFLAGS.
33622 return any_of(MRI.reg_instructions(X86::EFLAGS),
33623 [](const MachineInstr &RI) { return RI.isCopy(); });
33626 /// This method queries the target whether it is beneficial for the dag
33627 /// combiner to promote the specified node. If true, it should return the
33628 /// desired promotion type by reference.
//
// Only i16 operations are considered (other types take the early exit). The
// visible checks avoid promoting when that would block folding a load into the
// operation or folding the operation into a store.
// NOTE(review): most case labels, the assignments to Promote/Commute, the
// returns and the final write to PVT are not present in this listing — confirm
// against the full file.
33629 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
33630 EVT VT = Op.getValueType();
33631 if (VT != MVT::i16)
33634 bool Promote = false;
33635 bool Commute = false;
33636 switch (Op.getOpcode()) {
33638 case ISD::SIGN_EXTEND:
33639 case ISD::ZERO_EXTEND:
33640 case ISD::ANY_EXTEND:
33645 SDValue N0 = Op.getOperand(0);
33646 // Look out for (store (shl (load), x)).
33647 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
33660 SDValue N0 = Op.getOperand(0);
33661 SDValue N1 = Op.getOperand(1);
// When the op is not treated as commutable, a foldable load in operand 1 is
// checked on its own.
33662 if (!Commute && MayFoldLoad(N1))
33664 // Avoid disabling potential load folding opportunities.
33665 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
33667 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
33677 //===----------------------------------------------------------------------===//
33678 // X86 Inline Assembly Support
33679 //===----------------------------------------------------------------------===//
33681 // Helper to match a string separated by whitespace.
// S is the asm text to test; Pieces are the expected tokens, in order. Leading
// spaces/tabs in S are skipped, then each piece must appear as a prefix of the
// remaining text.
// NOTE(review): the return statements and the code that consumes the
// whitespace after each piece are not present in this listing — confirm the
// exact accept/reject rules against the full file.
33682 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
33683 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
33685 for (StringRef Piece : Pieces) {
33686 if (!S.startswith(Piece)) // Check if the piece matches.
33689 S = S.substr(Piece.size());
// Pos == 0 means a non-whitespace character directly follows the piece, so the
// piece only matched a prefix of a longer token.
33690 StringRef::size_type Pos = S.find_first_not_of(" \t");
33691 if (Pos == 0) // We matched a prefix.
// Inspects a list of inline-asm constraint strings for the flag-register
// clobbers. It only considers lists of exactly 3 or 4 entries that contain all
// of "~{cc}", "~{flags}" and "~{fpsr}"; for 4 entries it additionally looks for
// "~{dirflag}".
// NOTE(review): the return statements are not present in this listing —
// confirm the result of each branch against the full file.
33700 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
33702 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
33703 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
33704 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
33705 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
33707 if (AsmPieces.size() == 3)
33709 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
// Recognizes a few hand-written inline-asm byte-swap idioms and replaces the
// call with a byte-swap via IntrinsicLowering::LowerToByteSwap. Only calls
// returning an integer whose bit width is a multiple of 16 are considered.
// The asm string is split on ';' and newline, and the number of resulting
// statements selects which idioms are tried:
//   * bswap / bswapl / bswapq on $0 or ${0:q};
//   * i16: rorw/rolw $8 on ${0:w};
//   * i32: rorw $8 ; rorl $16 ; rorw $8;
//   * i64: bswap %eax ; bswap %edx ; xchgl %eax, %edx with constraints
//     "A" and "0".
// NOTE(review): the `case` labels, `break`/`return` lines and the statement
// between the match and SplitString in the rorw paths are not present in this
// listing — confirm against the full file.
33716 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
33717 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
33719 const std::string &AsmStr = IA->getAsmString();
33721 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
33722 if (!Ty || Ty->getBitWidth() % 16 != 0)
33725 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
33726 SmallVector<StringRef, 4> AsmPieces;
33727 SplitString(AsmStr, AsmPieces, ";\n");
33729 switch (AsmPieces.size()) {
33730 default: return false;
33732 // FIXME: this should verify that we are targeting a 486 or better. If not,
33733 // we will turn this bswap into something that will be lowered to logical
33734 // ops instead of emitting the bswap asm. For now, we don't support 486 or
33735 // lower so don't worry about this.
33737 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
33738 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
33739 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
33740 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
33741 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
33742 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
33743 // No need to check constraints, nothing other than the equivalent of
33744 // "=r,0" would be valid here.
33745 return IntrinsicLowering::LowerToByteSwap(CI);
33748 // rorw $$8, ${0:w} --> llvm.bswap.i16
33749 if (CI->getType()->isIntegerTy(16) &&
33750 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
33751 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
33752 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
// AsmPieces is reused to hold the constraints that follow the "=r,0," prefix;
// they are sorted before being passed to clobbersFlagRegisters.
33754 StringRef ConstraintsStr = IA->getConstraintString();
33755 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
33756 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
33757 if (clobbersFlagRegisters(AsmPieces))
33758 return IntrinsicLowering::LowerToByteSwap(CI);
// 32-bit swap written as three rotates: swap low bytes, rotate halves, swap
// low bytes again.
33762 if (CI->getType()->isIntegerTy(32) &&
33763 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
33764 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
33765 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
33766 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
33768 StringRef ConstraintsStr = IA->getConstraintString();
33769 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
33770 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
33771 if (clobbersFlagRegisters(AsmPieces))
33772 return IntrinsicLowering::LowerToByteSwap(CI);
33775 if (CI->getType()->isIntegerTy(64)) {
33776 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
// Requires the first constraint to be exactly "A" and the second exactly "0".
33777 if (Constraints.size() >= 2 &&
33778 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
33779 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
33780 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
33781 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
33782 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
33783 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
33784 return IntrinsicLowering::LowerToByteSwap(CI);
33792 /// Given a constraint letter, return the type of constraint for this target.
///
/// Constraints of length one and length two are examined by the switches
/// below; the final return defers to TargetLowering::getConstraintType.
///
/// \param Constraint the constraint string to classify.
33793 X86TargetLowering::ConstraintType
33794 X86TargetLowering::getConstraintType(StringRef Constraint) const {
33795 if (Constraint.size() == 1) {
33796 switch (Constraint[0]) {
33808 return C_RegisterClass;
33809 case 'k': // AVX512 masking registers.
// Two-character constraints: dispatch on the first, then the second character.
33833 else if (Constraint.size() == 2) {
33834 switch (Constraint[0]) {
33838 switch (Constraint[1]) {
// Generic fallback.
33846 return TargetLowering::getConstraintType(Constraint);
33849 /// Examine constraint type and operand type and determine a weight value.
33850 /// This object must already have been set up with the operand type
33851 /// and the current alternative constraint selected.
///
/// \param info operand description; its CallOperandVal supplies the value and
/// type that the checks below inspect.
/// \param constraint constraint string; its first character selects the check
/// and its second character is consulted to detect the "Yk" form.
///
/// The weight starts at CW_Invalid and is raised by whichever check applies.
33852 TargetLowering::ConstraintWeight
33853 X86TargetLowering::getSingleConstraintMatchWeight(
33854 AsmOperandInfo &info, const char *constraint) const {
33855 ConstraintWeight weight = CW_Invalid;
33856 Value *CallOperandVal = info.CallOperandVal;
33857 // If we don't have a value, we can't do a match,
33858 // but allow it at the lowest weight.
33859 if (!CallOperandVal)
33861 Type *type = CallOperandVal->getType();
33862 // Look at the constraint type.
33863 switch (*constraint) {
// Defer to the generic implementation.
33865 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
// Integer-typed operand.
33876 if (CallOperandVal->getType()->isIntegerTy())
33877 weight = CW_SpecificReg;
// Floating-point operand.
33882 if (type->isFloatingPointTy())
33883 weight = CW_SpecificReg;
// MMX-typed operand, only when the subtarget has MMX.
33886 if (type->isX86_MMXTy() && Subtarget.hasMMX())
33887 weight = CW_SpecificReg;
33890 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
33891 if (constraint[1] == 'k') {
33892 // Support for 'Yk' (similarly to the 'k' variant below).
33893 weight = CW_SpecificReg;
33896 // Else fall through (handle "Y" constraint).
// 512-bit operand with AVX-512 available.
33899 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
33900 weight = CW_Register;
// 128-bit operand with SSE1, or 256-bit operand with hasFp256().
33903 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
33904 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
33905 weight = CW_Register;
33908 // Enable conditional vector operations using %k<#> registers.
33909 weight = CW_SpecificReg;
// The remaining checks match integer or FP constants; each accepts a
// different value range.
// Zero-extended value no greater than 31.
33912 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
33913 if (C->getZExtValue() <= 31)
33914 weight = CW_Constant;
// Zero-extended value no greater than 63.
33918 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33919 if (C->getZExtValue() <= 63)
33920 weight = CW_Constant;
// Sign-extended value in [-0x80, 0x7f].
33924 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33925 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
33926 weight = CW_Constant;
// Exactly 0xff or 0xffff.
33930 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33931 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
33932 weight = CW_Constant;
// Zero-extended value no greater than 3.
33936 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33937 if (C->getZExtValue() <= 3)
33938 weight = CW_Constant;
// Zero-extended value no greater than 0xff.
33942 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33943 if (C->getZExtValue() <= 0xff)
33944 weight = CW_Constant;
// Any floating-point constant.
33949 if (isa<ConstantFP>(CallOperandVal)) {
33950 weight = CW_Constant;
// Sign-extended value that fits in 32 bits.
33954 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33955 if ((C->getSExtValue() >= -0x80000000LL) &&
33956 (C->getSExtValue() <= 0x7fffffffLL))
33957 weight = CW_Constant;
// Zero-extended value that fits in 32 bits.
33961 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
33962 if (C->getZExtValue() <= 0xffffffff)
33963 weight = CW_Constant;
33970 /// Try to replace an X constraint, which matches anything, with another that
33971 /// has more specific requirements based on the type of the corresponding
///
/// \param ConstraintVT value type of the operand bound to the X constraint.
/// Floating-point types are examined against the subtarget's SSE2 and SSE1
/// support, in that order; the final return defers to
/// TargetLowering::LowerXConstraint.
33973 const char *X86TargetLowering::
33974 LowerXConstraint(EVT ConstraintVT) const {
33975 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
33976 // 'f' like normal targets.
33977 if (ConstraintVT.isFloatingPoint()) {
33978 if (Subtarget.hasSSE2())
33980 if (Subtarget.hasSSE1())
// Generic handling.
33984 return TargetLowering::LowerXConstraint(ConstraintVT);
33987 /// Lower the specified operand into the Ops vector.
33988 /// If it is invalid, don't add anything to Ops.
///
/// \param Op operand to lower; constant operands are range-checked and
/// global addresses (optionally with constant offsets) are folded.
/// \param Constraint constraint string; only its first character is switched
/// on here.
/// \param Ops receives the lowered operand when one was produced.
/// \param DAG used to build the target constant / target global address.
33989 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
33990 std::string &Constraint,
33991 std::vector<SDValue>&Ops,
33992 SelectionDAG &DAG) const {
33995 // Only support length 1 constraints for now.
33996 if (Constraint.length() > 1) return;
33998 char ConstraintLetter = Constraint[0];
33999 switch (ConstraintLetter) {
// Constant with zero-extended value no greater than 31.
34002 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34003 if (C->getZExtValue() <= 31) {
34004 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34005 Op.getValueType());
// Constant with zero-extended value no greater than 63.
34011 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34012 if (C->getZExtValue() <= 63) {
34013 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34014 Op.getValueType());
// Constant whose sign-extended value fits in 8 bits.
// NOTE(review): the range is tested on the sign-extended value but the
// constant is built from the zero-extended one -- confirm this is intended.
34020 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34021 if (isInt<8>(C->getSExtValue())) {
34022 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34023 Op.getValueType());
// Constant equal to 0xff or 0xffff, or 0xffffffff on 64-bit subtargets.
// NOTE(review): the match is on the zero-extended value while the constant
// is built from the sign-extended one; also, the weight check for the
// 0xff/0xffff case in getSingleConstraintMatchWeight does not accept
// 0xffffffff -- confirm both are intended.
34029 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34030 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
34031 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
34032 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
34033 Op.getValueType());
// Constant with zero-extended value no greater than 3.
34039 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34040 if (C->getZExtValue() <= 3) {
34041 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34042 Op.getValueType());
// Constant with zero-extended value no greater than 255.
34048 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34049 if (C->getZExtValue() <= 255) {
34050 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34051 Op.getValueType());
// Constant with zero-extended value no greater than 127.
34057 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34058 if (C->getZExtValue() <= 127) {
34059 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34060 Op.getValueType());
34066 // 32-bit signed value
34067 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34068 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34069 C->getSExtValue())) {
34070 // Widen to 64 bits here to get it sign extended.
34071 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
34074 // FIXME gcc accepts some relocatable values here too, but only in certain
34075 // memory models; it's complicated.
34080 // 32-bit unsigned value
34081 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34082 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34083 C->getZExtValue())) {
34084 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34085 Op.getValueType());
34089 // FIXME gcc accepts some relocatable values here too, but only in certain
34090 // memory models; it's complicated.
34094 // Literal immediates are always ok.
34095 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
34096 // Widen to 64 bits here to get it sign extended.
34097 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
34101 // In any sort of PIC mode addresses need to be computed at runtime by
34102 // adding in a register or some sort of table lookup. These can't
34103 // be used as immediates.
34104 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
34107 // If we are in non-pic codegen mode, we allow the address of a global (with
34108 // an optional displacement) to be used with 'i'.
34109 GlobalAddressSDNode *GA = nullptr;
34110 int64_t Offset = 0;
34112 // Match either (GA), (GA+C), (GA+C1+C2), etc.
// Peel ADD/SUB-with-constant nodes off Op, accumulating the displacement in
// Offset, until a GlobalAddressSDNode is reached.
34114 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
34115 Offset += GA->getOffset();
34117 } else if (Op.getOpcode() == ISD::ADD) {
34118 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34119 Offset += C->getZExtValue();
34120 Op = Op.getOperand(0);
34123 } else if (Op.getOpcode() == ISD::SUB) {
34124 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34125 Offset += -C->getZExtValue();
34126 Op = Op.getOperand(0);
34131 // Otherwise, this isn't something we can handle, reject it.
34135 const GlobalValue *GV = GA->getGlobal();
34136 // If we require an extra load to get this address, as in PIC mode, we
34137 // can't accept it.
34138 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
34141 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
34142 GA->getValueType(0), Offset);
// Publish the lowered operand only if one of the cases above produced it.
34147 if (Result.getNode()) {
34148 Ops.push_back(Result);
// Generic handling.
34151 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
34154 /// Check if \p RC is a general purpose register class.
34155 /// I.e., GR* or one of their variants.
///
/// \returns true when hasSuperClassEq holds for GR8, GR16, GR32, GR64 or
/// LOW32_ADDR_ACCESS_RBP.
34156 static bool isGRClass(const TargetRegisterClass &RC) {
34157 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
34158 RC.hasSuperClassEq(&X86::GR16RegClass) ||
34159 RC.hasSuperClassEq(&X86::GR32RegClass) ||
34160 RC.hasSuperClassEq(&X86::GR64RegClass) ||
34161 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
34164 /// Check if \p RC is a vector register class.
34165 /// I.e., FR* / VR* or one of their variants.
///
/// \returns true when hasSuperClassEq holds for FR32X, FR64X, VR128X, VR256X
/// or VR512.
34166 static bool isFRClass(const TargetRegisterClass &RC) {
34167 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
34168 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
34169 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
34170 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
34171 RC.hasSuperClassEq(&X86::VR512RegClass);
/// Map an inline-asm register constraint to a (register, register class) pair
/// for the value type VT.
///
/// Single-letter constraints and two-letter "Y<x>" constraints are resolved
/// to a register class directly, with 0 as the register. Everything else goes
/// through TargetLowering::getRegForInlineAsmConstraint, followed by x86
/// fix-ups for the {st}/{st(N)}, {flags} and "A" spellings and for registers
/// whose class does not carry VT.
///
/// \param TRI register info forwarded to the generic implementation.
/// \param Constraint the constraint string to resolve.
34174 std::pair<unsigned, const TargetRegisterClass *>
34175 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
34176 StringRef Constraint,
34178 // First, see if this is a constraint that directly corresponds to an LLVM
34180 if (Constraint.size() == 1) {
34181 // GCC Constraint Letters
34182 switch (Constraint[0]) {
34184 // TODO: Slight differences here in allocation order and leaving
34185 // RIP in the class. Do they matter any more here than they do
34186 // in the normal allocation?
34188 if (Subtarget.hasAVX512()) {
34189 // Only supported in AVX512 or later.
34190 switch (VT.SimpleTy) {
34193 return std::make_pair(0U, &X86::VK32RegClass);
34195 return std::make_pair(0U, &X86::VK16RegClass);
34197 return std::make_pair(0U, &X86::VK8RegClass);
34199 return std::make_pair(0U, &X86::VK1RegClass);
34201 return std::make_pair(0U, &X86::VK64RegClass);
34205 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
34206 if (Subtarget.is64Bit()) {
34207 if (VT == MVT::i32 || VT == MVT::f32)
34208 return std::make_pair(0U, &X86::GR32RegClass);
34209 if (VT == MVT::i16)
34210 return std::make_pair(0U, &X86::GR16RegClass);
34211 if (VT == MVT::i8 || VT == MVT::i1)
34212 return std::make_pair(0U, &X86::GR8RegClass);
34213 if (VT == MVT::i64 || VT == MVT::f64)
34214 return std::make_pair(0U, &X86::GR64RegClass);
34217 // 32-bit fallthrough
34218 case 'Q': // Q_REGS
34219 if (VT == MVT::i32 || VT == MVT::f32)
34220 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
34221 if (VT == MVT::i16)
34222 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
34223 if (VT == MVT::i8 || VT == MVT::i1)
34224 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
34225 if (VT == MVT::i64)
34226 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
34228 case 'r': // GENERAL_REGS
34229 case 'l': // INDEX_REGS
34230 if (VT == MVT::i8 || VT == MVT::i1)
34231 return std::make_pair(0U, &X86::GR8RegClass);
34232 if (VT == MVT::i16)
34233 return std::make_pair(0U, &X86::GR16RegClass);
34234 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
34235 return std::make_pair(0U, &X86::GR32RegClass);
34236 return std::make_pair(0U, &X86::GR64RegClass);
34237 case 'R': // LEGACY_REGS
34238 if (VT == MVT::i8 || VT == MVT::i1)
34239 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
34240 if (VT == MVT::i16)
34241 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
34242 if (VT == MVT::i32 || !Subtarget.is64Bit())
34243 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
34244 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
34245 case 'f': // FP Stack registers.
34246 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
34247 // value to the correct fpstack register class.
34248 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
34249 return std::make_pair(0U, &X86::RFP32RegClass);
34250 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
34251 return std::make_pair(0U, &X86::RFP64RegClass);
34252 return std::make_pair(0U, &X86::RFP80RegClass);
34253 case 'y': // MMX_REGS if MMX allowed.
34254 if (!Subtarget.hasMMX()) break;
34255 return std::make_pair(0U, &X86::VR64RegClass);
34256 case 'Y': // SSE_REGS if SSE2 allowed
34257 if (!Subtarget.hasSSE2()) break;
34260 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
34261 if (!Subtarget.hasSSE1()) break;
// With the 'v' constraint the extended (X-suffixed) classes may be chosen
// below, subject to the subtarget checks on each path.
34262 bool VConstraint = (Constraint[0] == 'v');
34264 switch (VT.SimpleTy) {
34266 // Scalar SSE types.
// NOTE(review): this guard also tests hasAVX512(), unlike the sibling guards
// below that test only hasVLX() -- confirm whether the difference is
// intentional.
34269 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
34270 return std::make_pair(0U, &X86::FR32XRegClass);
34271 return std::make_pair(0U, &X86::FR32RegClass);
34274 if (VConstraint && Subtarget.hasVLX())
34275 return std::make_pair(0U, &X86::FR64XRegClass);
34276 return std::make_pair(0U, &X86::FR64RegClass);
34277 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34285 if (VConstraint && Subtarget.hasVLX())
34286 return std::make_pair(0U, &X86::VR128XRegClass);
34287 return std::make_pair(0U, &X86::VR128RegClass);
34295 if (VConstraint && Subtarget.hasVLX())
34296 return std::make_pair(0U, &X86::VR256XRegClass);
34297 return std::make_pair(0U, &X86::VR256RegClass);
34302 return std::make_pair(0U, &X86::VR512RegClass);
34306 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
34307 switch (Constraint[1]) {
34311 // This register class doesn't allocate k0 for masked vector operation.
34312 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
34313 switch (VT.SimpleTy) {
34316 return std::make_pair(0U, &X86::VK32WMRegClass);
34318 return std::make_pair(0U, &X86::VK16WMRegClass);
34320 return std::make_pair(0U, &X86::VK8WMRegClass);
34322 return std::make_pair(0U, &X86::VK1WMRegClass);
34324 return std::make_pair(0U, &X86::VK64WMRegClass);
34331 // Use the default implementation in TargetLowering to convert the register
34332 // constraint into a member of a register class.
34333 std::pair<unsigned, const TargetRegisterClass*> Res;
34334 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
34336 // Not found as a standard register?
34338 // Map st(0) -> st(7) -> ST0
// Matches exactly "{st(N)}" with N in 0..7, case-insensitively for "st".
34339 if (Constraint.size() == 7 && Constraint[0] == '{' &&
34340 tolower(Constraint[1]) == 's' &&
34341 tolower(Constraint[2]) == 't' &&
34342 Constraint[3] == '(' &&
34343 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
34344 Constraint[5] == ')' &&
34345 Constraint[6] == '}') {
34347 Res.first = X86::FP0+Constraint[4]-'0';
34348 Res.second = &X86::RFP80RegClass;
34352 // GCC allows "st(0)" to be called just plain "st".
34353 if (StringRef("{st}").equals_lower(Constraint)) {
34354 Res.first = X86::FP0;
34355 Res.second = &X86::RFP80RegClass;
// "{flags}" names EFLAGS in the CCR register class.
34360 if (StringRef("{flags}").equals_lower(Constraint)) {
34361 Res.first = X86::EFLAGS;
34362 Res.second = &X86::CCRRegClass;
34366 // 'A' means EAX + EDX.
34367 if (Constraint == "A") {
34368 Res.first = X86::EAX;
34369 Res.second = &X86::GR32_ADRegClass;
34375 // Otherwise, check to see if this is a register class of the wrong value
34376 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
34377 // turn into {ax},{dx}.
34378 // MVT::Other is used to specify clobber names.
34379 if (Res.second->hasType(VT) || VT == MVT::Other)
34380 return Res; // Correct type already, nothing to do.
34382 // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
34383 // return "eax". This should even work for things like getting 64bit integer
34384 // registers when given an f64 type.
34385 const TargetRegisterClass *Class = Res.second;
34386 // The generic code will match the first register class that contains the
34387 // given register. Thus, based on the ordering of the tablegened file,
34388 // the "plain" GR classes might not come first.
34389 // Therefore, use a helper method.
34390 if (isGRClass(*Class)) {
// A 1-bit value is looked up as an 8-bit register.
34391 unsigned Size = VT.getSizeInBits();
34392 if (Size == 1) Size = 8;
34393 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
34395 Res.first = DestReg;
34396 Res.second = Size == 8 ? &X86::GR8RegClass
34397 : Size == 16 ? &X86::GR16RegClass
34398 : Size == 32 ? &X86::GR32RegClass
34399 : &X86::GR64RegClass;
34400 assert(Res.second->contains(Res.first) && "Register in register class");
34402 // No register found/type mismatch.
34404 Res.second = nullptr;
34406 } else if (isFRClass(*Class)) {
34407 // Handle references to XMM physical registers that got mapped into the
34408 // wrong class. This can happen with constraints like {xmm0} where the
34409 // target independent register mapper will just pick the first match it can
34410 // find, ignoring the required type.
34412 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34413 if (VT == MVT::f32 || VT == MVT::i32)
34414 Res.second = &X86::FR32RegClass;
34415 else if (VT == MVT::f64 || VT == MVT::i64)
34416 Res.second = &X86::FR64RegClass;
34417 else if (X86::VR128RegClass.hasType(VT))
34418 Res.second = &X86::VR128RegClass;
34419 else if (X86::VR256RegClass.hasType(VT))
34420 Res.second = &X86::VR256RegClass;
34421 else if (X86::VR512RegClass.hasType(VT))
34422 Res.second = &X86::VR512RegClass;
34424 // Type mismatch and not a clobber: Return an error;
34426 Res.second = nullptr;
/// Estimate the extra cost of using the addressing mode \p AM.
///
/// \param DL, AM, Ty, AS forwarded unchanged to isLegalAddressingMode.
/// \returns for a legal addressing mode, 1 when AM.Scale is non-zero and 0
/// otherwise.
34433 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
34434 const AddrMode &AM, Type *Ty,
34435 unsigned AS) const {
34436 // Scaling factors are not free at all.
34437 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
34438 // will take 2 allocations in the out of order engine instead of 1
34439 // for plain addressing mode, i.e. inst (reg1).
34441 // vaddps (%rsi,%rdx), %ymm0, %ymm1
34442 // Requires two allocations (one for the load, one for the computation)
34444 // vaddps (%rsi), %ymm0, %ymm1
34445 // Requires just 1 allocation, i.e., freeing allocations for other operations
34446 // and having less micro operations to execute.
34448 // For some X86 architectures, this is even worse because for instance for
34449 // stores, the complex addressing mode forces the instruction to use the
34450 // "load" ports instead of the dedicated "store" port.
34451 // E.g., on Haswell:
34452 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
34453 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
34454 if (isLegalAddressingMode(DL, AM, Ty, AS))
34455 // Scale represents reg2 * scale, thus account for 1
34456 // as soon as we use a second register.
34457 return AM.Scale != 0;
/// Report whether integer division should be treated as cheap for \p VT.
///
/// \param VT value type of the division.
/// \param Attr attribute set queried at AttributeSet::FunctionIndex.
/// \returns true only when the MinSize attribute is present and VT is not a
/// vector type.
34461 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
34462 // Integer division on x86 is expensive. However, when aggressively optimizing
34463 // for code size, we prefer to use a div instruction, as it is usually smaller
34464 // than the alternative sequence.
34465 // The exception to this is vector division. Since x86 doesn't have vector
34466 // integer division, leaving the division as-is is a loss even in terms of
34467 // size, because it will have to be scalarized, while the alternative code
34468 // sequence can be performed in vector form.
// Note: despite the local's name, the attribute tested is MinSize.
34469 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
34470 Attribute::MinSize);
34471 return OptSize && !VT.isVector();
/// Set IsSplitCSR on the X86MachineFunctionInfo of the function containing
/// \p Entry. Subtarget.is64Bit() is checked first.
///
/// \param Entry a block of the function to mark; only its parent is used.
34474 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
34475 if (!Subtarget.is64Bit())
34478 // Update IsSplitCSR in X86MachineFunctionInfo.
34479 X86MachineFunctionInfo *AFI =
34480 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
34481 AFI->setIsSplitCSR(true);
/// For every register in the zero-terminated list returned by
/// getCalleeSavedRegsViaCopy, create a virtual register, emit a COPY into it
/// at the start of \p Entry, and emit a COPY back into the physical register
/// before the first terminator of each block in \p Exits.
///
/// \param Entry block that receives the live-ins and the initial copies.
/// \param Exits blocks that receive the copy-back instructions.
34484 void X86TargetLowering::insertCopiesSplitCSR(
34485 MachineBasicBlock *Entry,
34486 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
34487 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34488 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
34492 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34493 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
34494 MachineBasicBlock::iterator MBBI = Entry->begin();
// The list is terminated by a zero entry.
34495 for (const MCPhysReg *I = IStart; *I; ++I) {
// Registers in GR64RegClass are handled; the llvm_unreachable below marks
// any other register class as unexpected.
34496 const TargetRegisterClass *RC = nullptr;
34497 if (X86::GR64RegClass.contains(*I))
34498 RC = &X86::GR64RegClass;
34500 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
34502 unsigned NewVR = MRI->createVirtualRegister(RC);
34503 // Create copy from CSR to a virtual register.
34504 // FIXME: this currently does not emit CFI pseudo-instructions, it works
34505 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
34506 // nounwind. If we want to generalize this later, we may need to emit
34507 // CFI pseudo-instructions.
34508 assert(Entry->getParent()->getFunction()->hasFnAttribute(
34509 Attribute::NoUnwind) &&
34510 "Function should be nounwind in insertCopiesSplitCSR!");
34511 Entry->addLiveIn(*I);
34512 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
34515 // Insert the copy-back instructions right before the terminator.
34516 for (auto *Exit : Exits)
34517 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
34518 TII->get(TargetOpcode::COPY), *I)
/// \returns the value of Subtarget.is64Bit().
34523 bool X86TargetLowering::supportSwiftError() const {
34524 return Subtarget.is64Bit();