1 //===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
3 // The LLVM Compiler Infrastructure
5 // This file is distributed under the University of Illinois Open Source
6 // License. See LICENSE.TXT for details.
8 //===----------------------------------------------------------------------===//
10 // This file defines the interfaces that X86 uses to lower LLVM code into a
11 // selection DAG.
13 //===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/WinEHFuncInfo.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/Function.h"
44 #include "llvm/IR/GlobalAlias.h"
45 #include "llvm/IR/GlobalVariable.h"
46 #include "llvm/IR/Instructions.h"
47 #include "llvm/IR/Intrinsics.h"
48 #include "llvm/MC/MCAsmInfo.h"
49 #include "llvm/MC/MCContext.h"
50 #include "llvm/MC/MCExpr.h"
51 #include "llvm/MC/MCSymbol.h"
52 #include "llvm/Support/CommandLine.h"
53 #include "llvm/Support/Debug.h"
54 #include "llvm/Support/ErrorHandling.h"
55 #include "llvm/Support/MathExtras.h"
56 #include "llvm/Target/TargetOptions.h"
using namespace llvm;

63 #define DEBUG_TYPE "x86-isel"
65 STATISTIC(NumTailCalls, "Number of tail calls");
67 static cl::opt<bool> ExperimentalVectorWideningLegalization(
68 "x86-experimental-vector-widening-legalization", cl::init(false),
69 cl::desc("Enable an experimental vector type legalization through widening "
70 "rather than promotion."),
73 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
74 const X86Subtarget &STI)
75 : TargetLowering(TM), Subtarget(STI) {
76 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
77 X86ScalarSSEf64 = Subtarget.hasSSE2();
78 X86ScalarSSEf32 = Subtarget.hasSSE1();
79 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
81 // Set up the TargetLowering object.
83 // X86 is weird. It always uses i8 for shift amounts and setcc results.
84 setBooleanContents(ZeroOrOneBooleanContent);
85 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
86 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
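// Note: with these settings, scalar comparisons produce a 0/1 value in an
// integer register (matching SETcc's 0/1 result in an 8-bit register), while
// vector comparisons produce all-zeros/all-ones lanes (matching PCMPEQ/PCMPGT
// and friends), so the DAG combiner can reason about known bits accordingly.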
88 // For 64-bit, since we have so many registers, use the ILP scheduler.
89 // For 32-bit, use the register pressure specific scheduling.
90 // For Atom, always use ILP scheduling.
91 if (Subtarget.isAtom())
92 setSchedulingPreference(Sched::ILP);
93 else if (Subtarget.is64Bit())
94 setSchedulingPreference(Sched::ILP);
95 else
96 setSchedulingPreference(Sched::RegPressure);
97 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
98 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
100 // Bypass expensive divides and use cheaper ones.
101 if (TM.getOptLevel() >= CodeGenOpt::Default) {
102 if (Subtarget.hasSlowDivide32())
103 addBypassSlowDiv(32, 8);
104 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
105 addBypassSlowDiv(64, 32);
106 }
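// The bypass values above mean, e.g., that a 32-bit divide whose operands are
// shown by a cheap runtime test to fit in 8 bits is done with the much faster
// 8-bit divide on parts with slow dividers, falling back to the full-width
// divide otherwise.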
108 if (Subtarget.isTargetKnownWindowsMSVC() ||
109 Subtarget.isTargetWindowsItanium()) {
110 // Setup Windows compiler runtime calls.
111 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
112 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
113 setLibcallName(RTLIB::SREM_I64, "_allrem");
114 setLibcallName(RTLIB::UREM_I64, "_aullrem");
115 setLibcallName(RTLIB::MUL_I64, "_allmul");
116 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
117 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
118 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
119 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
120 setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
123 if (Subtarget.isTargetDarwin()) {
124 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
125 setUseUnderscoreSetJmp(false);
126 setUseUnderscoreLongJmp(false);
127 } else if (Subtarget.isTargetWindowsGNU()) {
128 // MS runtime is weird: it exports _setjmp, but longjmp!
129 setUseUnderscoreSetJmp(true);
130 setUseUnderscoreLongJmp(false);
131 } else {
132 setUseUnderscoreSetJmp(true);
133 setUseUnderscoreLongJmp(true);
134 }
136 // Set up the register classes.
137 addRegisterClass(MVT::i8, &X86::GR8RegClass);
138 addRegisterClass(MVT::i16, &X86::GR16RegClass);
139 addRegisterClass(MVT::i32, &X86::GR32RegClass);
140 if (Subtarget.is64Bit())
141 addRegisterClass(MVT::i64, &X86::GR64RegClass);
143 for (MVT VT : MVT::integer_valuetypes())
144 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
146 // We don't accept any truncstore of integer registers.
147 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
148 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
149 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
150 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
151 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
152 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
154 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
156 // SETOEQ and SETUNE require checking two conditions.
157 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
158 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
159 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
160 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
161 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
162 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
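// Rationale: UCOMISS/UCOMISD set ZF=1 both for "equal" and for "unordered",
// so a single flag test cannot express these predicates; ordered-equal needs
// ZF==1 && PF==0 and unordered-not-equal needs ZF==0 || PF==1, i.e. two
// separate flag checks.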
164 // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
165 // operation.
166 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
167 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
168 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
170 if (Subtarget.is64Bit()) {
171 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
172 // f32/f64 are legal, f80 is custom.
173 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
174 else
175 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
176 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
177 } else if (!Subtarget.useSoftFloat()) {
178 // We have an algorithm for SSE2->double, and we turn this into a
179 // 64-bit FILD followed by conditional FADD for other targets.
180 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
181 // We have an algorithm for SSE2, and we turn this into a 64-bit
182 // FILD or VCVTUSI2SS/SD for other targets.
183 setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
186 // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
187 // this operation.
188 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
189 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
191 if (!Subtarget.useSoftFloat()) {
192 // SSE has no i16 to fp conversion, only i32.
193 if (X86ScalarSSEf32) {
194 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
195 // f32 and f64 cases are Legal, f80 case is not
196 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
197 } else {
198 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
199 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
200 }
201 } else {
202 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
203 setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
204 }
206 // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
207 // this operation.
208 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
209 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
211 if (!Subtarget.useSoftFloat()) {
212 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
213 // are Legal, f80 is custom lowered.
214 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
215 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
217 if (X86ScalarSSEf32) {
218 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
219 // f32 and f64 cases are Legal, f80 case is not
220 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
221 } else {
222 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
223 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
224 }
225 } else {
226 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
227 setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
228 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
229 }
231 // Handle FP_TO_UINT by promoting the destination to a larger signed
232 // conversion.
233 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
234 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
235 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
237 if (Subtarget.is64Bit()) {
238 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
239 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
240 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
241 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
242 } else {
243 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
244 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
245 }
246 } else if (!Subtarget.useSoftFloat()) {
247 // Since AVX is a superset of SSE3, only check for SSE here.
248 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
249 // Expand FP_TO_UINT into a select.
250 // FIXME: We would like to use a Custom expander here eventually to do
251 // the optimal thing for SSE vs. the default expansion in the legalizer.
252 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
253 else
254 // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
255 // With SSE3 we can use fisttpll to convert to a signed i64; without
256 // SSE, we're stuck with a fistpll.
257 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
259 setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
262 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
263 if (!X86ScalarSSEf64) {
264 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
265 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
266 if (Subtarget.is64Bit()) {
267 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
268 // Without SSE, i64->f64 goes through memory.
269 setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
271 } else if (!Subtarget.is64Bit())
272 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
274 // Scalar integer divide and remainder are lowered to use operations that
275 // produce two results, to match the available instructions. This exposes
276 // the two-result form to trivial CSE, which is able to combine x/y and x%y
277 // into a single instruction.
279 // Scalar integer multiply-high is also lowered to use two-result
280 // operations, to match the available instructions. However, plain multiply
281 // (low) operations are left as Legal, as there are single-result
282 // instructions for this in x86. Using the two-result multiply instructions
283 // when both high and low results are needed must be arranged by dagcombine.
284 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
285 setOperationAction(ISD::MULHS, VT, Expand);
286 setOperationAction(ISD::MULHU, VT, Expand);
287 setOperationAction(ISD::SDIV, VT, Expand);
288 setOperationAction(ISD::UDIV, VT, Expand);
289 setOperationAction(ISD::SREM, VT, Expand);
290 setOperationAction(ISD::UREM, VT, Expand);
291 }
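// For example, an i32 "x / y" and "x % y" pair both expand to a single
// ISD::SDIVREM node, which selects to one IDIV that leaves the quotient in
// EAX and the remainder in EDX.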
293 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
294 if (VT == MVT::i64 && !Subtarget.is64Bit())
295 continue;
296 // Add/Sub overflow ops with MVT::Glues are lowered to EFLAGS dependences.
297 setOperationAction(ISD::ADDC, VT, Custom);
298 setOperationAction(ISD::ADDE, VT, Custom);
299 setOperationAction(ISD::SUBC, VT, Custom);
300 setOperationAction(ISD::SUBE, VT, Custom);
303 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
304 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
305 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
306 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
307 setOperationAction(ISD::BR_CC, VT, Expand);
308 setOperationAction(ISD::SELECT_CC, VT, Expand);
310 if (Subtarget.is64Bit())
311 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
312 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
313 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
314 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
315 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
317 setOperationAction(ISD::FREM , MVT::f32 , Expand);
318 setOperationAction(ISD::FREM , MVT::f64 , Expand);
319 setOperationAction(ISD::FREM , MVT::f80 , Expand);
320 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
322 // Promote the i8 variants and force them on up to i32 which has a shorter
323 // encoding.
324 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
325 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
326 if (!Subtarget.hasBMI()) {
327 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
328 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
329 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
330 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
331 if (Subtarget.is64Bit()) {
332 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
333 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
337 if (Subtarget.hasLZCNT()) {
338 // When promoting the i8 variants, force them to i32 for a shorter
339 // encoding.
340 setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
341 setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
342 } else {
343 setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
344 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
345 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
346 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
347 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
348 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
349 if (Subtarget.is64Bit()) {
350 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
351 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
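// Background for the CTTZ/CTLZ choices above: BSF/BSR leave the destination
// undefined when the source is zero, so without BMI/LZCNT the zero-input
// forms are lowered with an explicit zero test (Custom) while the
// *_ZERO_UNDEF forms can map straight onto BSF/BSR. With TZCNT/LZCNT the
// zero case is well defined, so the plain forms become cheap.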
355 // Special handling for half-precision floating point conversions.
356 // If we don't have F16C support, then lower half float conversions
357 // into library calls.
358 if (Subtarget.useSoftFloat() ||
359 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
360 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
361 setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
364 // There's never any support for operations beyond MVT::f32.
365 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
366 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
367 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
368 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
370 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
371 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
372 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
373 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
374 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
375 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
377 if (Subtarget.hasPOPCNT()) {
378 setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
379 } else {
380 setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
381 setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
382 setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
383 if (Subtarget.is64Bit())
384 setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
385 }
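// Without POPCNT, ISD::CTPOP marked Expand is turned by the legalizer into
// the usual bit-parallel count (shift/mask/add sequence) rather than a
// library call.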
387 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
389 if (!Subtarget.hasMOVBE())
390 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
392 // These should be promoted to a larger select which is supported.
393 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
394 // X86 wants to expand cmov itself.
395 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
396 setOperationAction(ISD::SELECT, VT, Custom);
397 setOperationAction(ISD::SETCC, VT, Custom);
399 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
400 if (VT == MVT::i64 && !Subtarget.is64Bit())
401 continue;
402 setOperationAction(ISD::SELECT, VT, Custom);
403 setOperationAction(ISD::SETCC, VT, Custom);
404 setOperationAction(ISD::SETCCE, VT, Custom);
406 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
407 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
408 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
409 // support continuation, user-level threading, and so on. As a result, no
410 // other SjLj exception interfaces are implemented and please don't build
411 // your own exception handling based on them.
412 // LLVM/Clang supports zero-cost DWARF exception handling.
413 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
414 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
415 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
416 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
417 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
420 for (auto VT : { MVT::i32, MVT::i64 }) {
421 if (VT == MVT::i64 && !Subtarget.is64Bit())
422 continue;
423 setOperationAction(ISD::ConstantPool , VT, Custom);
424 setOperationAction(ISD::JumpTable , VT, Custom);
425 setOperationAction(ISD::GlobalAddress , VT, Custom);
426 setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
427 setOperationAction(ISD::ExternalSymbol , VT, Custom);
428 setOperationAction(ISD::BlockAddress , VT, Custom);
430 // 64-bit add, sub, shl, sra, srl (iff 32-bit x86)
431 for (auto VT : { MVT::i32, MVT::i64 }) {
432 if (VT == MVT::i64 && !Subtarget.is64Bit())
433 continue;
434 setOperationAction(ISD::SHL_PARTS, VT, Custom);
435 setOperationAction(ISD::SRA_PARTS, VT, Custom);
436 setOperationAction(ISD::SRL_PARTS, VT, Custom);
437 }
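// On 32-bit targets an i64 shift is split into *_PARTS nodes; roughly,
//   %r = shl i64 %x, %n
// becomes a SHLD/SHL pair for the two halves plus a test on whether the
// shift amount is >= 32, selecting between the two possible results.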
439 if (Subtarget.hasSSE1())
440 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
442 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
444 // Expand certain atomics
445 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
446 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
447 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
448 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
449 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
450 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
451 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
452 setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
455 if (Subtarget.hasCmpxchg16b()) {
456 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
459 // FIXME - use subtarget debug flags
460 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
461 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
462 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
463 setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
466 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
467 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
469 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
470 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
472 setOperationAction(ISD::TRAP, MVT::Other, Legal);
473 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
475 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
476 setOperationAction(ISD::VASTART , MVT::Other, Custom);
477 setOperationAction(ISD::VAEND , MVT::Other, Expand);
478 bool Is64Bit = Subtarget.is64Bit();
479 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
480 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
482 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
483 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
485 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
487 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
488 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
489 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
491 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
492 // f32 and f64 use SSE.
493 // Set up the FP register classes.
494 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
495 : &X86::FR32RegClass);
496 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
497 : &X86::FR64RegClass);
499 for (auto VT : { MVT::f32, MVT::f64 }) {
500 // Use ANDPD to simulate FABS.
501 setOperationAction(ISD::FABS, VT, Custom);
503 // Use XORP to simulate FNEG.
504 setOperationAction(ISD::FNEG, VT, Custom);
506 // Use ANDPD and ORPD to simulate FCOPYSIGN.
507 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
509 // We don't support sin/cos/fmod
510 setOperationAction(ISD::FSIN , VT, Expand);
511 setOperationAction(ISD::FCOS , VT, Expand);
512 setOperationAction(ISD::FSINCOS, VT, Expand);
515 // Lower this to MOVMSK plus an AND.
516 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
517 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
519 // Expand FP immediates into loads from the stack, except for the special
520 // cases we handle.
521 addLegalFPImmediate(APFloat(+0.0)); // xorpd
522 addLegalFPImmediate(APFloat(+0.0f)); // xorps
523 } else if (UseX87 && X86ScalarSSEf32) {
524 // Use SSE for f32, x87 for f64.
525 // Set up the FP register classes.
526 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
527 : &X86::FR32RegClass);
528 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
530 // Use ANDPS to simulate FABS.
531 setOperationAction(ISD::FABS , MVT::f32, Custom);
533 // Use XORP to simulate FNEG.
534 setOperationAction(ISD::FNEG , MVT::f32, Custom);
536 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
538 // Use ANDPS and ORPS to simulate FCOPYSIGN.
539 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
540 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
542 // We don't support sin/cos/fmod
543 setOperationAction(ISD::FSIN , MVT::f32, Expand);
544 setOperationAction(ISD::FCOS , MVT::f32, Expand);
545 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
547 // Special cases we handle for FP constants.
548 addLegalFPImmediate(APFloat(+0.0f)); // xorps
549 addLegalFPImmediate(APFloat(+0.0)); // FLD0
550 addLegalFPImmediate(APFloat(+1.0)); // FLD1
551 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
552 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
554 if (!TM.Options.UnsafeFPMath) {
555 setOperationAction(ISD::FSIN , MVT::f64, Expand);
556 setOperationAction(ISD::FCOS , MVT::f64, Expand);
557 setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
558 }
559 } else if (UseX87) {
560 // f32 and f64 in x87.
561 // Set up the FP register classes.
562 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
563 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
565 for (auto VT : { MVT::f32, MVT::f64 }) {
566 setOperationAction(ISD::UNDEF, VT, Expand);
567 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
569 if (!TM.Options.UnsafeFPMath) {
570 setOperationAction(ISD::FSIN , VT, Expand);
571 setOperationAction(ISD::FCOS , VT, Expand);
572 setOperationAction(ISD::FSINCOS, VT, Expand);
575 addLegalFPImmediate(APFloat(+0.0)); // FLD0
576 addLegalFPImmediate(APFloat(+1.0)); // FLD1
577 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
578 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
579 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
580 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
581 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
582 addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
585 // We don't support FMA.
586 setOperationAction(ISD::FMA, MVT::f64, Expand);
587 setOperationAction(ISD::FMA, MVT::f32, Expand);
589 // Long double always uses X87, except f128 in MMX.
590 if (UseX87) {
591 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
592 addRegisterClass(MVT::f128, &X86::FR128RegClass);
593 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
594 setOperationAction(ISD::FABS , MVT::f128, Custom);
595 setOperationAction(ISD::FNEG , MVT::f128, Custom);
596 setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
597 }
599 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
600 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
601 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
603 APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
604 addLegalFPImmediate(TmpFlt); // FLD0
605 TmpFlt.changeSign();
606 addLegalFPImmediate(TmpFlt); // FLD0/FCHS
608 bool ignored;
609 APFloat TmpFlt2(+1.0);
610 TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
611 &ignored);
612 addLegalFPImmediate(TmpFlt2); // FLD1
613 TmpFlt2.changeSign();
614 addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
617 if (!TM.Options.UnsafeFPMath) {
618 setOperationAction(ISD::FSIN , MVT::f80, Expand);
619 setOperationAction(ISD::FCOS , MVT::f80, Expand);
620 setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
623 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
624 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
625 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
626 setOperationAction(ISD::FRINT, MVT::f80, Expand);
627 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
628 setOperationAction(ISD::FMA, MVT::f80, Expand);
631 // Always use a library call for pow.
632 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
633 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
634 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
636 setOperationAction(ISD::FLOG, MVT::f80, Expand);
637 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
638 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
639 setOperationAction(ISD::FEXP, MVT::f80, Expand);
640 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
641 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
642 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
644 // Some FP actions are always expanded for vector types.
645 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
646 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
647 setOperationAction(ISD::FSIN, VT, Expand);
648 setOperationAction(ISD::FSINCOS, VT, Expand);
649 setOperationAction(ISD::FCOS, VT, Expand);
650 setOperationAction(ISD::FREM, VT, Expand);
651 setOperationAction(ISD::FPOWI, VT, Expand);
652 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
653 setOperationAction(ISD::FPOW, VT, Expand);
654 setOperationAction(ISD::FLOG, VT, Expand);
655 setOperationAction(ISD::FLOG2, VT, Expand);
656 setOperationAction(ISD::FLOG10, VT, Expand);
657 setOperationAction(ISD::FEXP, VT, Expand);
658 setOperationAction(ISD::FEXP2, VT, Expand);
661 // First set operation action for all vector types to either promote
662 // (for widening) or expand (for scalarization). Then we will selectively
663 // turn on ones that can be effectively codegen'd.
664 for (MVT VT : MVT::vector_valuetypes()) {
665 setOperationAction(ISD::SDIV, VT, Expand);
666 setOperationAction(ISD::UDIV, VT, Expand);
667 setOperationAction(ISD::SREM, VT, Expand);
668 setOperationAction(ISD::UREM, VT, Expand);
669 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
670 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
671 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
672 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
673 setOperationAction(ISD::FMA, VT, Expand);
674 setOperationAction(ISD::FFLOOR, VT, Expand);
675 setOperationAction(ISD::FCEIL, VT, Expand);
676 setOperationAction(ISD::FTRUNC, VT, Expand);
677 setOperationAction(ISD::FRINT, VT, Expand);
678 setOperationAction(ISD::FNEARBYINT, VT, Expand);
679 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
680 setOperationAction(ISD::MULHS, VT, Expand);
681 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
682 setOperationAction(ISD::MULHU, VT, Expand);
683 setOperationAction(ISD::SDIVREM, VT, Expand);
684 setOperationAction(ISD::UDIVREM, VT, Expand);
685 setOperationAction(ISD::CTPOP, VT, Expand);
686 setOperationAction(ISD::CTTZ, VT, Expand);
687 setOperationAction(ISD::CTLZ, VT, Expand);
688 setOperationAction(ISD::ROTL, VT, Expand);
689 setOperationAction(ISD::ROTR, VT, Expand);
690 setOperationAction(ISD::BSWAP, VT, Expand);
691 setOperationAction(ISD::SETCC, VT, Expand);
692 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
693 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
694 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
695 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
696 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
697 setOperationAction(ISD::TRUNCATE, VT, Expand);
698 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
699 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
700 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
701 setOperationAction(ISD::SELECT_CC, VT, Expand);
702 for (MVT InnerVT : MVT::vector_valuetypes()) {
703 setTruncStoreAction(InnerVT, VT, Expand);
705 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
706 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
708 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
709 // types, we have to deal with them whether we ask for Expansion or not.
710 // Setting Expand causes its own optimisation problems though, so leave
711 // them legal.
712 if (VT.getVectorElementType() == MVT::i1)
713 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
715 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
716 // split/scalarized right now.
717 if (VT.getVectorElementType() == MVT::f16)
718 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
722 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
723 // with -msoft-float, disable use of MMX as well.
724 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
725 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
726 // No operations on x86mmx supported, everything uses intrinsics.
729 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
730 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
731 : &X86::VR128RegClass);
733 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
734 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
735 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
736 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
737 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
738 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
739 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
740 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
741 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
744 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
745 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
746 : &X86::VR128RegClass);
748 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
749 // registers cannot be used even for integer operations.
750 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
751 : &X86::VR128RegClass);
752 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
753 : &X86::VR128RegClass);
754 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
755 : &X86::VR128RegClass);
756 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
757 : &X86::VR128RegClass);
759 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
760 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
761 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
762 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
763 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
764 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
765 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
766 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
767 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
768 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
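// The v8i16 multiplies map directly onto PMULLW/PMULHW/PMULHUW; the other
// element widths above are Custom because SSE2 has no full 8-bit, 32-bit or
// 64-bit vector multiply, so those are synthesized from PMULUDQ/PMULLW
// sequences plus shuffles.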
769 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
770 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
771 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
773 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
774 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
775 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
776 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
778 setOperationAction(ISD::SETCC, MVT::v2i64, Custom);
779 setOperationAction(ISD::SETCC, MVT::v16i8, Custom);
780 setOperationAction(ISD::SETCC, MVT::v8i16, Custom);
781 setOperationAction(ISD::SETCC, MVT::v4i32, Custom);
783 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v16i8, Custom);
784 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v8i16, Custom);
785 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
786 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
787 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
789 setOperationAction(ISD::CTPOP, MVT::v16i8, Custom);
790 setOperationAction(ISD::CTPOP, MVT::v8i16, Custom);
791 setOperationAction(ISD::CTPOP, MVT::v4i32, Custom);
792 setOperationAction(ISD::CTPOP, MVT::v2i64, Custom);
794 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
795 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
796 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
797 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
799 // Custom lower build_vector, vector_shuffle, and extract_vector_elt.
800 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
801 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
802 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
803 setOperationAction(ISD::VSELECT, VT, Custom);
804 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
807 // We support custom legalizing of sext and anyext loads for specific
808 // memory vector types which we can load as a scalar (or sequence of
809 // scalars) and extend in-register to a legal 128-bit vector type. For sext
810 // loads these must work with a single scalar load.
811 for (MVT VT : MVT::integer_vector_valuetypes()) {
812 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
813 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
814 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
815 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
816 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
817 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
818 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
819 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
820 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
823 for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
824 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
825 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
826 setOperationAction(ISD::VSELECT, VT, Custom);
828 if (VT == MVT::v2i64 && !Subtarget.is64Bit())
829 continue;
831 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
832 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
835 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
836 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
837 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
838 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
839 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
840 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
841 setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
842 }
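// Bitwise AND/OR/XOR and whole-vector loads/selects are lane-agnostic, so
// promoting them to v2i64 (i.e. bitcasting the operands and doing the
// operation in one canonical type) keeps the number of isel patterns small
// without changing the bits produced.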
844 // Custom lower v2i64 and v2f64 selects.
845 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
846 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
848 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
849 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
851 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
852 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
854 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
855 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
856 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
858 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
859 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
861 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
862 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
864 for (MVT VT : MVT::fp_vector_valuetypes())
865 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
867 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
868 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
869 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
871 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
872 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
873 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
875 for (auto VT : { MVT::v8i16, MVT::v16i8 }) {
876 setOperationAction(ISD::SRL, VT, Custom);
877 setOperationAction(ISD::SHL, VT, Custom);
878 setOperationAction(ISD::SRA, VT, Custom);
881 // In the customized shift lowering, the legal cases in AVX2 will be
882 // recognized.
883 for (auto VT : { MVT::v4i32, MVT::v2i64 }) {
884 setOperationAction(ISD::SRL, VT, Custom);
885 setOperationAction(ISD::SHL, VT, Custom);
886 setOperationAction(ISD::SRA, VT, Custom);
887 }
888 }
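// Pre-AVX2 there are no per-element variable shifts, so the Custom lowering
// handles splat (uniform) shift amounts with PSLLW/PSLLD/PSLLQ and friends
// and otherwise falls back to multiply-based or element-by-element sequences.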
890 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
891 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
892 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
893 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
894 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
895 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
898 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
899 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
900 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
901 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
902 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
903 setOperationAction(ISD::FRINT, RoundedTy, Legal);
904 setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
907 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
908 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
909 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
910 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
911 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
912 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
913 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
914 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
916 // FIXME: Do we need to handle scalar-to-vector here?
917 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
919 // We directly match byte blends in the backend as they match the VSELECT
920 // condition form.
921 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
923 // SSE41 brings specific instructions for doing vector sign extend even in
924 // cases where we don't have SRA.
925 for (MVT VT : MVT::integer_vector_valuetypes()) {
926 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
927 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
928 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
931 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
932 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
933 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
934 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
935 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
936 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
937 setLoadExtAction(ISD::SEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
939 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i16, MVT::v8i8, Legal);
940 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
941 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
942 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
943 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
944 setLoadExtAction(ISD::ZEXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
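// With these marked Legal, e.g. a zero-extending load of four i8s into a
// v4i32 register selects directly to PMOVZXBD from memory instead of a load
// plus a separate extend.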
946 // i8 vectors are custom because the source register and source
947 // memory operand types are not the same width.
948 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
951 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
952 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
953 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
954 setOperationAction(ISD::ROTL, VT, Custom);
956 // XOP can efficiently perform BITREVERSE with VPPERM.
957 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
958 setOperationAction(ISD::BITREVERSE, VT, Custom);
960 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
961 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
962 setOperationAction(ISD::BITREVERSE, VT, Custom);
965 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
966 bool HasInt256 = Subtarget.hasInt256();
968 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
969 : &X86::VR256RegClass);
970 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
971 : &X86::VR256RegClass);
972 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
973 : &X86::VR256RegClass);
974 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
975 : &X86::VR256RegClass);
976 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
977 : &X86::VR256RegClass);
978 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
979 : &X86::VR256RegClass);
981 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
982 setOperationAction(ISD::FFLOOR, VT, Legal);
983 setOperationAction(ISD::FCEIL, VT, Legal);
984 setOperationAction(ISD::FTRUNC, VT, Legal);
985 setOperationAction(ISD::FRINT, VT, Legal);
986 setOperationAction(ISD::FNEARBYINT, VT, Legal);
987 setOperationAction(ISD::FNEG, VT, Custom);
988 setOperationAction(ISD::FABS, VT, Custom);
989 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
992 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
993 // even though v8i16 is a legal type.
994 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
995 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
996 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
998 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
999 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1000 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1002 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1003 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1005 for (MVT VT : MVT::fp_vector_valuetypes())
1006 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1008 for (auto VT : { MVT::v32i8, MVT::v16i16 }) {
1009 setOperationAction(ISD::SRL, VT, Custom);
1010 setOperationAction(ISD::SHL, VT, Custom);
1011 setOperationAction(ISD::SRA, VT, Custom);
1014 setOperationAction(ISD::SETCC, MVT::v32i8, Custom);
1015 setOperationAction(ISD::SETCC, MVT::v16i16, Custom);
1016 setOperationAction(ISD::SETCC, MVT::v8i32, Custom);
1017 setOperationAction(ISD::SETCC, MVT::v4i64, Custom);
1019 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1020 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1021 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1023 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i64, Custom);
1024 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i32, Custom);
1025 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1026 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i64, Custom);
1027 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom);
1028 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i16, Custom);
1029 setOperationAction(ISD::ANY_EXTEND, MVT::v4i64, Custom);
1030 setOperationAction(ISD::ANY_EXTEND, MVT::v8i32, Custom);
1031 setOperationAction(ISD::ANY_EXTEND, MVT::v16i16, Custom);
1032 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1033 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1034 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1035 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1037 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1038 setOperationAction(ISD::CTPOP, VT, Custom);
1039 setOperationAction(ISD::CTTZ, VT, Custom);
1040 setOperationAction(ISD::CTLZ, VT, Custom);
1043 if (Subtarget.hasAnyFMA()) {
1044 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1045 MVT::v2f64, MVT::v4f64 })
1046 setOperationAction(ISD::FMA, VT, Legal);
1049 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1050 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
1051 setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
1054 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1055 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1056 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1057 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1059 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1060 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1062 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1063 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1064 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1065 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1067 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1068 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1069 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1070 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
1071 setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
1075 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1076 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1077 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1079 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1080 // when we have a 256bit-wide blend with immediate.
1081 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1083 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1084 setLoadExtAction(ISD::SEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1085 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1086 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1087 setLoadExtAction(ISD::SEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1088 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1089 setLoadExtAction(ISD::SEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1091 setLoadExtAction(ISD::ZEXTLOAD, MVT::v16i16, MVT::v16i8, Legal);
1092 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1093 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1094 setLoadExtAction(ISD::ZEXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1095 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1096 setLoadExtAction(ISD::ZEXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1099 // In the customized shift lowering, the legal cases in AVX2 will be
1100 // recognized.
1101 for (auto VT : { MVT::v8i32, MVT::v4i64 }) {
1102 setOperationAction(ISD::SRL, VT, Custom);
1103 setOperationAction(ISD::SHL, VT, Custom);
1104 setOperationAction(ISD::SRA, VT, Custom);
1107 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1108 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1109 setOperationAction(ISD::MLOAD, VT, Legal);
1110 setOperationAction(ISD::MSTORE, VT, Legal);
1111 }
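// Masked loads/stores are Legal here because AVX/AVX2 provide
// VMASKMOVPS/VMASKMOVPD and VPMASKMOVD/VPMASKMOVQ for these element widths;
// smaller element types are not directly covered by these instructions.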
1113 // Extract subvector is special because the value type
1114 // (result) is 128-bit but the source is 256-bit wide.
1115 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1116 MVT::v4f32, MVT::v2f64 }) {
1117 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1120 // Custom lower several nodes for 256-bit types.
1121 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1122 MVT::v8f32, MVT::v4f64 }) {
1123 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1124 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1125 setOperationAction(ISD::VSELECT, VT, Custom);
1126 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1127 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1128 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1129 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1130 setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
1131 }
1133 if (HasInt256)
1134 setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1136 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1137 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1138 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1139 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1140 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1141 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
1142 setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
1146 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1147 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1148 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1149 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1150 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1152 addRegisterClass(MVT::i1, &X86::VK1RegClass);
1153 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1154 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1156 for (MVT VT : MVT::fp_vector_valuetypes())
1157 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1159 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1160 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1161 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1162 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1163 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1164 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
1165 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
1167 setOperationAction(ISD::BR_CC, MVT::i1, Expand);
1168 setOperationAction(ISD::SETCC, MVT::i1, Custom);
1169 setOperationAction(ISD::SETCCE, MVT::i1, Custom);
1170 setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
1171 setOperationAction(ISD::XOR, MVT::i1, Legal);
1172 setOperationAction(ISD::OR, MVT::i1, Legal);
1173 setOperationAction(ISD::AND, MVT::i1, Legal);
1174 setOperationAction(ISD::SUB, MVT::i1, Custom);
1175 setOperationAction(ISD::ADD, MVT::i1, Custom);
1176 setOperationAction(ISD::MUL, MVT::i1, Custom);
1178 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1179 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1180 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1181 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1182 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1183 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1184 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
1185 setTruncStoreAction(VT, MaskVT, Custom);
1188 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1189 setOperationAction(ISD::FNEG, VT, Custom);
1190 setOperationAction(ISD::FABS, VT, Custom);
1191 setOperationAction(ISD::FMA, VT, Legal);
1192 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
1195 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1196 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1197 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1198 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1199 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1200 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1201 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1202 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1203 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1204 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1205 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1206 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1207 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1208 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1209 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1210 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1211 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1212 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1213 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1214 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1215 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1216 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1217 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1218 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1219 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1221 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1222 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1223 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1224 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1225 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
1226 if (Subtarget.hasVLX()){
1227 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1228 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1229 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1230 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1231 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1233 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1234 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1235 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1236 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
1237 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
1238 } else {
1239 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1240 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
1241 setOperationAction(ISD::MLOAD, VT, Custom);
1242 setOperationAction(ISD::MSTORE, VT, Custom);
1245 setOperationAction(ISD::TRUNCATE, MVT::i1, Custom);
1246 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1247 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1248 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i1, Custom);
1249 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i1, Custom);
1250 setOperationAction(ISD::VSELECT, MVT::v8i1, Expand);
1251 setOperationAction(ISD::VSELECT, MVT::v16i1, Expand);
1252 if (Subtarget.hasDQI()) {
1253 setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
1254 setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Legal);
1255 setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
1256 setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
1257 setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Legal);
1258 setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
1259 setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
1260 setOperationAction(ISD::FP_TO_SINT, MVT::v4i64, Legal);
1261 setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
1262 setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
1263 setOperationAction(ISD::FP_TO_UINT, MVT::v4i64, Legal);
1264 setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
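// AVX-512DQ adds direct 64-bit integer <-> FP vector conversions
// (VCVTQQ2PD/VCVTUQQ2PD, VCVTTPD2QQ/VCVTTPD2UQQ and the PS forms), which is
// why these become Legal instead of the lengthy scalarized expansions.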
1266 if (Subtarget.hasVLX()) {
1267 // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
1268 setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
1269 setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
1270 setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
1273 if (Subtarget.hasVLX()) {
1274 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1275 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1276 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1277 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1278 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1279 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1280 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1281 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1282 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1283 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1284 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
1286 // FIXME: These extending loads are also available on SSE/AVX2; add the relevant patterns.
1287 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1288 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1289 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1290 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1291 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1292 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1293 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1294 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1295 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
1296 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
1299 setOperationAction(ISD::TRUNCATE, MVT::v8i1, Custom);
1300 setOperationAction(ISD::TRUNCATE, MVT::v16i1, Custom);
1301 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1302 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1303 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1304 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1305 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1306 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1307 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1308 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1309 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1310 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1312 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1313 setOperationAction(ISD::FFLOOR, VT, Legal);
1314 setOperationAction(ISD::FCEIL, VT, Legal);
1315 setOperationAction(ISD::FTRUNC, VT, Legal);
1316 setOperationAction(ISD::FRINT, VT, Legal);
1317 setOperationAction(ISD::FNEARBYINT, VT, Legal);
1320 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1321 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1323 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1324 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1325 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1327 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1328 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1329 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1330 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1331 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1333 setOperationAction(ISD::SETCC, MVT::v16i1, Custom);
1334 setOperationAction(ISD::SETCC, MVT::v8i1, Custom);
1336 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1338 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v8i1, Custom);
1339 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v16i1, Custom);
1340 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1341 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i1, Custom);
1342 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i1, Custom);
1343 setOperationAction(ISD::BUILD_VECTOR, MVT::v8i1, Custom);
1344 setOperationAction(ISD::BUILD_VECTOR, MVT::v16i1, Custom);
1345 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1346 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1347 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1348 setOperationAction(ISD::SELECT, MVT::v16i1, Custom);
1349 setOperationAction(ISD::SELECT, MVT::v8i1, Custom);
1351 setOperationAction(ISD::SMAX, MVT::v16i32, Legal);
1352 setOperationAction(ISD::SMAX, MVT::v8i64, Legal);
1353 setOperationAction(ISD::UMAX, MVT::v16i32, Legal);
1354 setOperationAction(ISD::UMAX, MVT::v8i64, Legal);
1355 setOperationAction(ISD::SMIN, MVT::v16i32, Legal);
1356 setOperationAction(ISD::SMIN, MVT::v8i64, Legal);
1357 setOperationAction(ISD::UMIN, MVT::v16i32, Legal);
1358 setOperationAction(ISD::UMIN, MVT::v8i64, Legal);
1360 setOperationAction(ISD::ADD, MVT::v8i1, Expand);
1361 setOperationAction(ISD::ADD, MVT::v16i1, Expand);
1362 setOperationAction(ISD::SUB, MVT::v8i1, Expand);
1363 setOperationAction(ISD::SUB, MVT::v16i1, Expand);
1364 setOperationAction(ISD::MUL, MVT::v8i1, Expand);
1365 setOperationAction(ISD::MUL, MVT::v16i1, Expand);
1367 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1369 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1370 setOperationAction(ISD::SRL, VT, Custom);
1371 setOperationAction(ISD::SHL, VT, Custom);
1372 setOperationAction(ISD::SRA, VT, Custom);
1373 setOperationAction(ISD::CTPOP, VT, Custom);
1374 setOperationAction(ISD::CTTZ, VT, Custom);
1377 // Need to promote to 64-bit even though we have 32-bit masked instructions
1378 // because the IR optimizers rearrange bitcasts around logic ops leaving
1379 // too many variations to handle if we don't promote them.
1380 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1381 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1382 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
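// Editorial note (illustrative sketch, not from the original source): with the
// promotion above, a 32-bit masked logic op such as (and v16i32:%a, v16i32:%b)
// is legalized roughly as
//   (v16i32 (bitcast (and (v8i64 (bitcast %a)), (v8i64 (bitcast %b)))))
// so only the v8i64 form needs instruction patterns (e.g. VPANDQ).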
1384 if (Subtarget.hasCDI()) {
1385 setOperationAction(ISD::CTLZ, MVT::v8i64, Legal);
1386 setOperationAction(ISD::CTLZ, MVT::v16i32, Legal);
1388 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
1389 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
1390 setOperationAction(ISD::CTLZ, MVT::v16i16, Custom);
1391 setOperationAction(ISD::CTLZ, MVT::v32i8, Custom);
1393 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i64, Custom);
1394 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v16i32, Custom);
1396 if (Subtarget.hasVLX()) {
1397 setOperationAction(ISD::CTLZ, MVT::v4i64, Legal);
1398 setOperationAction(ISD::CTLZ, MVT::v8i32, Legal);
1399 setOperationAction(ISD::CTLZ, MVT::v2i64, Legal);
1400 setOperationAction(ISD::CTLZ, MVT::v4i32, Legal);
1402 setOperationAction(ISD::CTLZ, MVT::v4i64, Custom);
1403 setOperationAction(ISD::CTLZ, MVT::v8i32, Custom);
1404 setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
1405 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
1408 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i64, Custom);
1409 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v8i32, Custom);
1410 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v2i64, Custom);
1411 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::v4i32, Custom);
1412 } // Subtarget.hasCDI()
1414 if (Subtarget.hasDQI()) {
1415 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1416 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1417 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1418 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1421 // Custom lower several nodes.
1422 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1423 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1424 setOperationAction(ISD::MGATHER, VT, Custom);
1425 setOperationAction(ISD::MSCATTER, VT, Custom);
1427 // Extract subvector is special because the value type
1428 // (result) is 256-bit but the source is 512-bit wide.
1429 // 128-bit was made Custom under AVX1.
1430 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1431 MVT::v8f32, MVT::v4f64 })
1432 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1433 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1434 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1435 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1437 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1438 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1439 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1440 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1441 setOperationAction(ISD::VSELECT, VT, Legal);
1442 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1443 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1444 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
1445 setOperationAction(ISD::MLOAD, VT, Legal);
1446 setOperationAction(ISD::MSTORE, VT, Legal);
1447 setOperationAction(ISD::MGATHER, VT, Legal);
1448 setOperationAction(ISD::MSCATTER, VT, Custom);
1450 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1451 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1452 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1456 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1457 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1458 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1460 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1461 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1463 setOperationAction(ISD::ADD, MVT::v32i1, Expand);
1464 setOperationAction(ISD::ADD, MVT::v64i1, Expand);
1465 setOperationAction(ISD::SUB, MVT::v32i1, Expand);
1466 setOperationAction(ISD::SUB, MVT::v64i1, Expand);
1467 setOperationAction(ISD::MUL, MVT::v32i1, Expand);
1468 setOperationAction(ISD::MUL, MVT::v64i1, Expand);
1470 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1471 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1472 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1473 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1474 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1475 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1476 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1477 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1478 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1479 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1480 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1481 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1482 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Custom);
1483 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Custom);
1484 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1485 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1486 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1487 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1488 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1489 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1490 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1491 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1492 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1493 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1494 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1495 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1496 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1497 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1498 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1499 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1500 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1501 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1502 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1503 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1504 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1505 setOperationAction(ISD::VSELECT, MVT::v32i16, Legal);
1506 setOperationAction(ISD::VSELECT, MVT::v64i8, Legal);
1507 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1508 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1509 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1510 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1511 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1512 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1513 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1514 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1515 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1516 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1518 setOperationAction(ISD::SMAX, MVT::v64i8, Legal);
1519 setOperationAction(ISD::SMAX, MVT::v32i16, Legal);
1520 setOperationAction(ISD::UMAX, MVT::v64i8, Legal);
1521 setOperationAction(ISD::UMAX, MVT::v32i16, Legal);
1522 setOperationAction(ISD::SMIN, MVT::v64i8, Legal);
1523 setOperationAction(ISD::SMIN, MVT::v32i16, Legal);
1524 setOperationAction(ISD::UMIN, MVT::v64i8, Legal);
1525 setOperationAction(ISD::UMIN, MVT::v32i16, Legal);
1527 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1529 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1530 if (Subtarget.hasVLX()) {
1531 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1532 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1535 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1536 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1537 setOperationAction(ISD::MLOAD, VT, Action);
1538 setOperationAction(ISD::MSTORE, VT, Action);
1541 if (Subtarget.hasCDI()) {
1542 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1543 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1546 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1547 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1548 setOperationAction(ISD::VSELECT, VT, Legal);
1549 setOperationAction(ISD::SRL, VT, Custom);
1550 setOperationAction(ISD::SHL, VT, Custom);
1551 setOperationAction(ISD::SRA, VT, Custom);
1552 setOperationAction(ISD::MLOAD, VT, Legal);
1553 setOperationAction(ISD::MSTORE, VT, Legal);
1554 setOperationAction(ISD::CTPOP, VT, Custom);
1555 setOperationAction(ISD::CTTZ, VT, Custom);
1557 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1558 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1559 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1562 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1563 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1564 if (Subtarget.hasVLX()) {
1565 // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
1566 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1567 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1572 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1573 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1574 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1576 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1577 setOperationAction(ISD::ADD, VT, Expand);
1578 setOperationAction(ISD::SUB, VT, Expand);
1579 setOperationAction(ISD::MUL, VT, Expand);
1580 setOperationAction(ISD::VSELECT, VT, Expand);
1582 setOperationAction(ISD::TRUNCATE, VT, Custom);
1583 setOperationAction(ISD::SETCC, VT, Custom);
1584 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1585 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1586 setOperationAction(ISD::SELECT, VT, Custom);
1587 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1588 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1591 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1592 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1593 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1594 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1596 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1597 setOperationAction(ISD::SMAX, VT, Legal);
1598 setOperationAction(ISD::UMAX, VT, Legal);
1599 setOperationAction(ISD::SMIN, VT, Legal);
1600 setOperationAction(ISD::UMIN, VT, Legal);
1604 // We want to custom lower some of our intrinsics.
1605 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1606 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1607 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1608 if (!Subtarget.is64Bit()) {
1609 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1610 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1613 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1614 // handle type legalization for these operations here.
1616 // FIXME: We really should do custom legalization for addition and
1617 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1618 // than generic legalization for 64-bit multiplication-with-overflow, though.
1619 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1620 if (VT == MVT::i64 && !Subtarget.is64Bit())
1622 // Add/Sub/Mul with overflow operations are custom lowered.
1623 setOperationAction(ISD::SADDO, VT, Custom);
1624 setOperationAction(ISD::UADDO, VT, Custom);
1625 setOperationAction(ISD::SSUBO, VT, Custom);
1626 setOperationAction(ISD::USUBO, VT, Custom);
1627 setOperationAction(ISD::SMULO, VT, Custom);
1628 setOperationAction(ISD::UMULO, VT, Custom);
1631 if (!Subtarget.is64Bit()) {
1632 // These libcalls are not available in 32-bit.
1633 setLibcallName(RTLIB::SHL_I128, nullptr);
1634 setLibcallName(RTLIB::SRL_I128, nullptr);
1635 setLibcallName(RTLIB::SRA_I128, nullptr);
1638 // Combine sin / cos into one node or libcall if possible.
1639 if (Subtarget.hasSinCos()) {
1640 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1641 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1642 if (Subtarget.isTargetDarwin()) {
1643 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1644 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1645 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1646 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
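// Editorial sketch (assumption, not from the original source): with FSINCOS
// marked Custom on Darwin, IR that computes both results, e.g.
//   %s = call float @sinf(float %x)
//   %c = call float @cosf(float %x)
// can be combined into a single __sincos_stret call returning the pair,
// rather than going through the generic memory-based sincos expansion.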
1650 if (Subtarget.isTargetWin64()) {
1651 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1652 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1653 setOperationAction(ISD::SREM, MVT::i128, Custom);
1654 setOperationAction(ISD::UREM, MVT::i128, Custom);
1655 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1656 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1659 // On 32-bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1660 // is. We should promote the value to 64 bits to solve this.
1661 // This is what the CRT headers do - `fmodf` is an inline header
1662 // function casting to f64 and calling `fmod`.
1663 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1664 Subtarget.isTargetWindowsItanium()))
1665 for (ISD::NodeType Op :
1666 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1667 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1668 if (isOperationExpand(Op, MVT::f32))
1669 setOperationAction(Op, MVT::f32, Promote);
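// Editorial sketch (assumption, not from the original source): Promote here
// means an f32 FREM such as fmodf(x, y) is lowered roughly as
//   (float)fmod((double)x, (double)y)
// i.e. fpext the operands to f64, use the f64 libcall, and fptrunc the result,
// mirroring what the MSVC CRT headers do inline.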
1671 // We have target-specific dag combine patterns for the following nodes:
1672 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1673 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1674 setTargetDAGCombine(ISD::BITCAST);
1675 setTargetDAGCombine(ISD::VSELECT);
1676 setTargetDAGCombine(ISD::SELECT);
1677 setTargetDAGCombine(ISD::SHL);
1678 setTargetDAGCombine(ISD::SRA);
1679 setTargetDAGCombine(ISD::SRL);
1680 setTargetDAGCombine(ISD::OR);
1681 setTargetDAGCombine(ISD::AND);
1682 setTargetDAGCombine(ISD::ADD);
1683 setTargetDAGCombine(ISD::FADD);
1684 setTargetDAGCombine(ISD::FSUB);
1685 setTargetDAGCombine(ISD::FNEG);
1686 setTargetDAGCombine(ISD::FMA);
1687 setTargetDAGCombine(ISD::FMINNUM);
1688 setTargetDAGCombine(ISD::FMAXNUM);
1689 setTargetDAGCombine(ISD::SUB);
1690 setTargetDAGCombine(ISD::LOAD);
1691 setTargetDAGCombine(ISD::MLOAD);
1692 setTargetDAGCombine(ISD::STORE);
1693 setTargetDAGCombine(ISD::MSTORE);
1694 setTargetDAGCombine(ISD::TRUNCATE);
1695 setTargetDAGCombine(ISD::ZERO_EXTEND);
1696 setTargetDAGCombine(ISD::ANY_EXTEND);
1697 setTargetDAGCombine(ISD::SIGN_EXTEND);
1698 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1699 setTargetDAGCombine(ISD::SINT_TO_FP);
1700 setTargetDAGCombine(ISD::UINT_TO_FP);
1701 setTargetDAGCombine(ISD::SETCC);
1702 setTargetDAGCombine(ISD::MUL);
1703 setTargetDAGCombine(ISD::XOR);
1704 setTargetDAGCombine(ISD::MSCATTER);
1705 setTargetDAGCombine(ISD::MGATHER);
1707 computeRegisterProperties(Subtarget.getRegisterInfo());
1709 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1710 MaxStoresPerMemsetOptSize = 8;
1711 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1712 MaxStoresPerMemcpyOptSize = 4;
1713 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1714 MaxStoresPerMemmoveOptSize = 4;
1715 setPrefLoopAlignment(4); // 2^4 bytes.
1717 // An out-of-order CPU can speculatively execute past a predictable branch,
1718 // but a conditional move could be stalled by an expensive earlier operation.
1719 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1720 EnableExtLdPromotion = true;
1721 setPrefFunctionAlignment(4); // 2^4 bytes.
1723 verifyIntrinsicTables();
1726 // This has so far only been implemented for 64-bit MachO.
1727 bool X86TargetLowering::useLoadStackGuardNode() const {
1728 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1731 TargetLoweringBase::LegalizeTypeAction
1732 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1733 if (ExperimentalVectorWideningLegalization &&
1734 VT.getVectorNumElements() != 1 &&
1735 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1736 return TypeWidenVector;
1738 return TargetLoweringBase::getPreferredVectorAction(VT);
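// Editorial note (illustrative, not from the original source): under the
// experimental widening legalization an illegal type such as v2i32 is widened
// to v4i32 (extra lanes undef) instead of being promoted to v2i64, so e.g. a
// <2 x i32> add becomes a <4 x i32> add whose top two lanes are ignored.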
1741 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1742 LLVMContext& Context,
1745 return Subtarget.hasAVX512() ? MVT::i1: MVT::i8;
1747 if (VT.isSimple()) {
1748 MVT VVT = VT.getSimpleVT();
1749 const unsigned NumElts = VVT.getVectorNumElements();
1750 MVT EltVT = VVT.getVectorElementType();
1751 if (VVT.is512BitVector()) {
1752 if (Subtarget.hasAVX512())
1753 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1754 EltVT == MVT::f32 || EltVT == MVT::f64)
1756 case 8: return MVT::v8i1;
1757 case 16: return MVT::v16i1;
1759 if (Subtarget.hasBWI())
1760 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1762 case 32: return MVT::v32i1;
1763 case 64: return MVT::v64i1;
1767 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1768 return MVT::getVectorVT(MVT::i1, NumElts);
1770 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1771 EVT LegalVT = getTypeToTransformTo(Context, VT);
1772 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1775 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1777 case 2: return MVT::v2i1;
1778 case 4: return MVT::v4i1;
1779 case 8: return MVT::v8i1;
1783 return VT.changeVectorElementTypeToInteger();
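// Editorial example (illustrative, not from the original source): for a
// v8f64 compare, an AVX-512 target returns v8i1 (a mask register), while a
// pre-AVX-512 target falls through to changeVectorElementTypeToInteger() and
// gets v8i64 as the setcc result type.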
1786 /// Helper for getByValTypeAlignment to determine
1787 /// the desired ByVal argument alignment.
1788 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1791 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1792 if (VTy->getBitWidth() == 128)
1794 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1795 unsigned EltAlign = 0;
1796 getMaxByValAlign(ATy->getElementType(), EltAlign);
1797 if (EltAlign > MaxAlign)
1798 MaxAlign = EltAlign;
1799 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1800 for (auto *EltTy : STy->elements()) {
1801 unsigned EltAlign = 0;
1802 getMaxByValAlign(EltTy, EltAlign);
1803 if (EltAlign > MaxAlign)
1804 MaxAlign = EltAlign;
1811 /// Return the desired alignment for ByVal aggregate
1812 /// function arguments in the caller parameter area. For X86, aggregates
1813 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1814 /// are at 4-byte boundaries.
1815 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1816 const DataLayout &DL) const {
1817 if (Subtarget.is64Bit()) {
1818 // Max of 8 and alignment of type.
1819 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1826 if (Subtarget.hasSSE1())
1827 getMaxByValAlign(Ty, Align);
1831 /// Returns the target specific optimal type for load
1832 /// and store operations as a result of memset, memcpy, and memmove
1833 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1834 /// constraint. Similarly, if SrcAlign is zero, there is no need to check it
1835 /// against the alignment requirement,
1836 /// probably because the source does not need to be loaded. If 'IsMemset' is
1837 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1838 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1839 /// source is constant so it does not need to be loaded.
1840 /// It returns EVT::Other if the type should be determined using generic
1841 /// target-independent logic.
1843 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1844 unsigned DstAlign, unsigned SrcAlign,
1845 bool IsMemset, bool ZeroMemset,
1847 MachineFunction &MF) const {
1848 const Function *F = MF.getFunction();
1849 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1851 (!Subtarget.isUnalignedMem16Slow() ||
1852 ((DstAlign == 0 || DstAlign >= 16) &&
1853 (SrcAlign == 0 || SrcAlign >= 16)))) {
1854 // FIXME: Check if unaligned 32-byte accesses are slow.
1855 if (Size >= 32 && Subtarget.hasAVX()) {
1856 // Although this isn't a well-supported type for AVX1, we'll let
1857 // legalization and shuffle lowering produce the optimal codegen. If we
1858 // choose an optimal type with a vector element larger than a byte,
1859 // getMemsetStores() may create an intermediate splat (using an integer
1860 // multiply) before we splat as a vector.
1863 if (Subtarget.hasSSE2())
1865 // TODO: Can SSE1 handle a byte vector?
1866 if (Subtarget.hasSSE1())
1868 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1869 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1870 // Do not use f64 to lower memcpy if source is string constant. It's
1871 // better to use i32 to avoid the loads.
1872 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1873 // The gymnastics of splatting a byte value into an XMM register and then
1874 // only using 8-byte stores (because this is a CPU with slow unaligned
1875 // 16-byte accesses) makes that a loser.
1879 // This is a compromise. If we reach here, unaligned accesses may be slow on
1880 // this target. However, creating smaller, aligned accesses could be even
1881 // slower and would certainly be a lot more code.
1882 if (Subtarget.is64Bit() && Size >= 8)
1887 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1889 return X86ScalarSSEf32;
1890 else if (VT == MVT::f64)
1891 return X86ScalarSSEf64;
1896 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1901 switch (VT.getSizeInBits()) {
1903 // 8-byte and under are always assumed to be fast.
1907 *Fast = !Subtarget.isUnalignedMem16Slow();
1910 *Fast = !Subtarget.isUnalignedMem32Slow();
1912 // TODO: What about AVX-512 (512-bit) accesses?
1915 // Misaligned accesses of any size are always allowed.
1919 /// Return the entry encoding for a jump table in the
1920 /// current function. The returned value is a member of the
1921 /// MachineJumpTableInfo::JTEntryKind enum.
1922 unsigned X86TargetLowering::getJumpTableEncoding() const {
1923 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF symbol.
1925 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1926 return MachineJumpTableInfo::EK_Custom32;
1928 // Otherwise, use the normal jump table encoding heuristics.
1929 return TargetLowering::getJumpTableEncoding();
1932 bool X86TargetLowering::useSoftFloat() const {
1933 return Subtarget.useSoftFloat();
1937 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1938 const MachineBasicBlock *MBB,
1939 unsigned uid,MCContext &Ctx) const{
1940 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1941 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF entries.
1943 return MCSymbolRefExpr::create(MBB->getSymbol(),
1944 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1947 /// Returns relocation base for the given PIC jumptable.
1948 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1949 SelectionDAG &DAG) const {
1950 if (!Subtarget.is64Bit())
1951 // This doesn't have SDLoc associated with it, but is not really the
1952 // same as a Register.
1953 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1954 getPointerTy(DAG.getDataLayout()));
1958 /// This returns the relocation base for the given PIC jumptable,
1959 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1960 const MCExpr *X86TargetLowering::
1961 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1962 MCContext &Ctx) const {
1963 // X86-64 uses RIP relative addressing based on the jump table label.
1964 if (Subtarget.isPICStyleRIPRel())
1965 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1967 // Otherwise, the reference is relative to the PIC base.
1968 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1971 std::pair<const TargetRegisterClass *, uint8_t>
1972 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1974 const TargetRegisterClass *RRC = nullptr;
1976 switch (VT.SimpleTy) {
1978 return TargetLowering::findRepresentativeClass(TRI, VT);
1979 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1980 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1983 RRC = &X86::VR64RegClass;
1985 case MVT::f32: case MVT::f64:
1986 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1987 case MVT::v4f32: case MVT::v2f64:
1988 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1989 case MVT::v8f32: case MVT::v4f64:
1990 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1991 case MVT::v16f32: case MVT::v8f64:
1992 RRC = &X86::VR128XRegClass;
1995 return std::make_pair(RRC, Cost);
1998 unsigned X86TargetLowering::getAddressSpace() const {
1999 if (Subtarget.is64Bit())
2000 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2004 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2005 // glibc has a special slot for the stack guard in tcbhead_t, use it instead
2006 // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
2007 if (!Subtarget.isTargetGlibc())
2008 return TargetLowering::getIRStackGuard(IRB);
2010 // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
2012 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2013 unsigned AddressSpace = getAddressSpace();
2014 return ConstantExpr::getIntToPtr(
2015 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2016 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
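// Editorial sketch (assumption, not from the original source): on x86-64
// glibc this builds the constant expression
//   inttoptr (i32 40 to i8 addrspace(257)*)
// i.e. %fs:0x28, which the stack-protector code then loads the guard value
// from; 32-bit targets use %gs:0x14 instead.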
2019 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2020 // The MSVC CRT provides functionality for stack protection.
2021 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2022 // MSVC CRT has a global variable holding security cookie.
2023 M.getOrInsertGlobal("__security_cookie",
2024 Type::getInt8PtrTy(M.getContext()));
2026 // MSVC CRT has a function to validate security cookie.
2027 auto *SecurityCheckCookie = cast<Function>(
2028 M.getOrInsertFunction("__security_check_cookie",
2029 Type::getVoidTy(M.getContext()),
2030 Type::getInt8PtrTy(M.getContext()), nullptr));
2031 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2032 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2035 // glibc has a special slot for the stack guard.
2036 if (Subtarget.isTargetGlibc())
2038 TargetLowering::insertSSPDeclarations(M);
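// Editorial sketch (assumption, not from the original source): on an MSVC
// target the declarations inserted above correspond to
//   @__security_cookie = external global i8*
//   declare x86_fastcallcc void @__security_check_cookie(i8* inreg)
// matching the CRT's stack-protector entry points.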
2041 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2042 // MSVC CRT has a global variable holding security cookie.
2043 if (Subtarget.getTargetTriple().isOSMSVCRT())
2044 return M.getGlobalVariable("__security_cookie");
2045 return TargetLowering::getSDagStackGuard(M);
2048 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2049 // MSVC CRT has a function to validate security cookie.
2050 if (Subtarget.getTargetTriple().isOSMSVCRT())
2051 return M.getFunction("__security_check_cookie");
2052 return TargetLowering::getSSPStackGuardCheck(M);
2055 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2056 if (Subtarget.getTargetTriple().isOSContiki())
2057 return getDefaultSafeStackPointerLocation(IRB, false);
2059 if (!Subtarget.isTargetAndroid())
2060 return TargetLowering::getSafeStackPointerLocation(IRB);
2062 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2063 // definition of TLS_SLOT_SAFESTACK in
2064 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2065 unsigned AddressSpace, Offset;
2067 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2069 Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2070 AddressSpace = getAddressSpace();
2071 return ConstantExpr::getIntToPtr(
2072 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2073 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
2076 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2077 unsigned DestAS) const {
2078 assert(SrcAS != DestAS && "Expected different address spaces!");
2080 return SrcAS < 256 && DestAS < 256;
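// Editorial example (illustrative, not from the original source): a cast
// between generic address spaces (say 0 and 1) is treated as a no-op here,
// whereas any cast involving the GS/FS segment spaces (256/257) is not,
// since those imply a segment-register override.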
2083 //===----------------------------------------------------------------------===//
2084 // Return Value Calling Convention Implementation
2085 //===----------------------------------------------------------------------===//
2087 #include "X86GenCallingConv.inc"
2089 bool X86TargetLowering::CanLowerReturn(
2090 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2091 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2092 SmallVector<CCValAssign, 16> RVLocs;
2093 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2094 return CCInfo.CheckReturn(Outs, RetCC_X86);
2097 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2098 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2102 /// Lowers mask values (v*i1) to the local register values
2103 /// \returns DAG node after lowering to register type
2104 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2105 const SDLoc &Dl, SelectionDAG &DAG) {
2106 EVT ValVT = ValArg.getValueType();
2108 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2109 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2110 // Two stage lowering might be required
2111 // bitcast: v8i1 -> i8 / v16i1 -> i16
2112 // anyextend: i8 -> i32 / i16 -> i32
2113 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2114 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2115 if (ValLoc == MVT::i32)
2116 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2118 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2119 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2120 // One stage lowering is required
2121 // bitcast: v32i1 -> i32 / v64i1 -> i64
2122 return DAG.getBitcast(ValLoc, ValArg);
2124 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
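// Editorial example (illustrative, not from the original source): for
// ValVT == v16i1 with ValLoc == MVT::i32 this produces
//   (i32 (any_extend (i16 (bitcast ValArg))))
// while v32i1 -> i32 needs only the single bitcast taken in the branch above.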
2127 /// Breaks v64i1 value into two registers and adds the new node to the DAG
2128 static void Passv64i1ArgInRegs(
2129 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2130 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2131 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2132 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2133 "Expected AVX512BW or AVX512BMI target!");
2134 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2135 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2136 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2137 "The value should reside in two registers");
2139 // Before splitting the value we cast it to i64
2140 Arg = DAG.getBitcast(MVT::i64, Arg);
2142 // Splitting the value into two i32 types
2144 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2145 DAG.getConstant(0, Dl, MVT::i32));
2146 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2147 DAG.getConstant(1, Dl, MVT::i32));
2149 // Attach the two i32 values to their corresponding registers
2150 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2151 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
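// Editorial sketch (assumption, not from the original source): the resulting
// DAG splits the bitcast i64 with EXTRACT_ELEMENT into Lo/Hi i32 halves and
// records them against VA.getLocReg() and NextVA.getLocReg(); which GPR pair
// that is depends on the calling convention and is not assumed here.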
2155 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2157 const SmallVectorImpl<ISD::OutputArg> &Outs,
2158 const SmallVectorImpl<SDValue> &OutVals,
2159 const SDLoc &dl, SelectionDAG &DAG) const {
2160 MachineFunction &MF = DAG.getMachineFunction();
2161 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2163 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2164 report_fatal_error("X86 interrupts may not return any value");
2166 SmallVector<CCValAssign, 16> RVLocs;
2167 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2168 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2171 SmallVector<SDValue, 6> RetOps;
2172 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2173 // Operand #1 = Bytes To Pop
2174 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2177 // Copy the result values into the output registers.
2178 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2180 CCValAssign &VA = RVLocs[I];
2181 assert(VA.isRegLoc() && "Can only return in registers!");
2182 SDValue ValToCopy = OutVals[OutsIndex];
2183 EVT ValVT = ValToCopy.getValueType();
2185 // Promote values to the appropriate types.
2186 if (VA.getLocInfo() == CCValAssign::SExt)
2187 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2188 else if (VA.getLocInfo() == CCValAssign::ZExt)
2189 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2190 else if (VA.getLocInfo() == CCValAssign::AExt) {
2191 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2192 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2194 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2196 else if (VA.getLocInfo() == CCValAssign::BCvt)
2197 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2199 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2200 "Unexpected FP-extend for return value.");
2202 // If this is x86-64, and we disabled SSE, we can't return FP values,
2203 // or SSE or MMX vectors.
2204 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2205 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2206 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2207 report_fatal_error("SSE register return with SSE disabled");
2209 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2210 // llvm-gcc has never done it right and no one has noticed, so this
2211 // should be OK for now.
2212 if (ValVT == MVT::f64 &&
2213 (Subtarget.is64Bit() && !Subtarget.hasSSE2()))
2214 report_fatal_error("SSE2 register return with SSE2 disabled");
2216 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2217 // the RET instruction and handled by the FP Stackifier.
2218 if (VA.getLocReg() == X86::FP0 ||
2219 VA.getLocReg() == X86::FP1) {
2220 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2221 // change the value to the FP stack register class.
2222 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2223 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2224 RetOps.push_back(ValToCopy);
2225 // Don't emit a copytoreg.
2229 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2230 // which is returned in RAX / RDX.
2231 if (Subtarget.is64Bit()) {
2232 if (ValVT == MVT::x86mmx) {
2233 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2234 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2235 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2237 // If we don't have SSE2 available, convert to v4f32 so the generated
2238 // register is legal.
2239 if (!Subtarget.hasSSE2())
2240 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2245 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2247 if (VA.needsCustom()) {
2248 assert(VA.getValVT() == MVT::v64i1 &&
2249 "Currently the only custom case is when we split v64i1 to 2 regs");
2251 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2254 assert(2 == RegsToPass.size() &&
2255 "Expecting two registers after Pass64BitArgInRegs");
2257 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2260 // Add nodes to the DAG and add the values into the RetOps list
2261 for (auto &Reg : RegsToPass) {
2262 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2263 Flag = Chain.getValue(1);
2264 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2268 // The Swift calling convention does not require us to copy the sret argument
2269 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2271 // All x86 ABIs require that for returning structs by value we copy
2272 // the sret argument into %rax/%eax (depending on ABI) for the return.
2273 // We saved the argument into a virtual register in the entry block,
2274 // so now we copy the value out and into %rax/%eax.
2276 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2277 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2278 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2279 // either case FuncInfo->setSRetReturnReg() will have been called.
2280 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2281 // When we have both sret and another return value, we should use the
2282 // original Chain stored in RetOps[0], instead of the current Chain updated
2283 // in the above loop. If we only have sret, RetOps[0] equals Chain.
2285 // For the case of sret and another return value, we have
2286 // Chain_0 at the function entry
2287 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2288 // If we use Chain_1 in getCopyFromReg, we will have
2289 // Val = getCopyFromReg(Chain_1)
2290 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2292 // getCopyToReg(Chain_0) will be glued together with
2293 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2294 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2295 // Data dependency from Unit B to Unit A due to usage of Val in
2296 // getCopyToReg(Chain_1, Val)
2297 // Chain dependency from Unit A to Unit B
2299 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2300 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2301 getPointerTy(MF.getDataLayout()));
2304 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2305 X86::RAX : X86::EAX;
2306 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2307 Flag = Chain.getValue(1);
2309 // RAX/EAX now acts like a return value.
2311 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2314 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2315 const MCPhysReg *I =
2316 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2319 if (X86::GR64RegClass.contains(*I))
2320 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2322 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2326 RetOps[0] = Chain; // Update chain.
2328 // Add the flag if we have it.
2330 RetOps.push_back(Flag);
2332 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2333 if (CallConv == CallingConv::X86_INTR)
2334 opcode = X86ISD::IRET;
2335 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2338 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2339 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2342 SDValue TCChain = Chain;
2343 SDNode *Copy = *N->use_begin();
2344 if (Copy->getOpcode() == ISD::CopyToReg) {
2345 // If the copy has a glue operand, we conservatively assume it isn't safe to
2346 // perform a tail call.
2347 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2349 TCChain = Copy->getOperand(0);
2350 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2353 bool HasRet = false;
2354 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2356 if (UI->getOpcode() != X86ISD::RET_FLAG)
2358 // If we are returning more than one value, we can definitely
2359 // not make a tail call; see PR19530.
2360 if (UI->getNumOperands() > 4)
2362 if (UI->getNumOperands() == 4 &&
2363 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2375 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2376 ISD::NodeType ExtendKind) const {
2377 MVT ReturnMVT = MVT::i32;
2379 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2380 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2381 // The ABI does not require i1, i8 or i16 to be extended.
2383 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2384 // always extending i8/i16 return values, so keep doing that for now.
2386 ReturnMVT = MVT::i8;
2389 EVT MinVT = getRegisterType(Context, ReturnMVT);
2390 return VT.bitsLT(MinVT) ? MinVT : VT;
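// Editorial example (illustrative, not from the original source): for an i8
// return value on a non-Darwin target, ReturnMVT becomes i8 and the value is
// left unextended; on Darwin, MinVT stays i32, so the i8 result is widened to
// i32 to preserve Clang's historical behaviour.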
2393 /// Reads two 32-bit registers and creates a 64-bit mask value.
2394 /// \param VA The current 32-bit value that needs to be assigned.
2395 /// \param NextVA The next 32-bit value that needs to be assigned.
2396 /// \param Root The parent DAG node.
2397 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
2398 ///                        glue purposes. In case the DAG is already using a
2399 ///                        physical register instead of a virtual one, we should
2400 ///                        glue our new SDValue to the InFlag SDValue.
2401 /// \return a new 64-bit SDValue.
2402 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2403 SDValue &Root, SelectionDAG &DAG,
2404 const SDLoc &Dl, const X86Subtarget &Subtarget,
2405 SDValue *InFlag = nullptr) {
2406 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2407 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2408 assert(VA.getValVT() == MVT::v64i1 &&
2409 "Expecting first location of 64 bit width type");
2410 assert(NextVA.getValVT() == VA.getValVT() &&
2411 "The locations should have the same type");
2412 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2413 "The values should reside in two registers");
2417 SDValue ArgValueLo, ArgValueHi;
2419 MachineFunction &MF = DAG.getMachineFunction();
2420 const TargetRegisterClass *RC = &X86::GR32RegClass;
2422 // Read a 32-bit value from the registers
2423 if (nullptr == InFlag) {
2424 // When no physical register is present,
2425 // create an intermediate virtual register
2426 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2427 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2428 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2429 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2431 // When a physical register is available read the value from it and glue
2432 // the reads together.
2434 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2435 *InFlag = ArgValueLo.getValue(2);
2437 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2438 *InFlag = ArgValueHi.getValue(2);
2441 // Convert the i32 type into v32i1 type
2442 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2444 // Convert the i32 type into v32i1 type
2445 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2447 // Concatenate the two values together
2448 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
2451 /// The function will lower a register of various sizes (8/16/32/64)
2452 /// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
2453 /// \returns a DAG node containing the operand after lowering to mask type.
2454 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2455 const EVT &ValLoc, const SDLoc &Dl,
2456 SelectionDAG &DAG) {
2457 SDValue ValReturned = ValArg;
2459 if (ValVT == MVT::v64i1) {
2460 // On a 32-bit machine, this case is handled by getv64i1Argument
2461 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2462 // On a 64-bit machine, there is no need to truncate the value; only bitcast it
2465 switch (ValVT.getSimpleVT().SimpleTy) {
2476 llvm_unreachable("Expecting a vector of i1 types");
2479 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2482 return DAG.getBitcast(ValVT, ValReturned);
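// Editorial example (illustrative, not from the original source): for
// ValVT == v8i1 with ValLoc == MVT::i32, the incoming value is truncated to i8
// and then bitcast to v8i1; for v64i1 on a 64-bit target the i64 is bitcast
// directly without truncation.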
2485 /// Lower the result values of a call into the
2486 /// appropriate copies out of appropriate physical registers.
2488 SDValue X86TargetLowering::LowerCallResult(
2489 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2490 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2491 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2493 // Assign locations to each value returned by this call.
2494 SmallVector<CCValAssign, 16> RVLocs;
2495 bool Is64Bit = Subtarget.is64Bit();
2496 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2498 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2500 // Copy all of the result registers out of their specified physreg.
2501 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2503 CCValAssign &VA = RVLocs[I];
2504 EVT CopyVT = VA.getLocVT();
2506 // If this is x86-64, and we disabled SSE, we can't return FP values
2507 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2508 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2509 report_fatal_error("SSE register return with SSE disabled");
2512 // If we prefer to use the value in xmm registers, copy it out as f80 and
2513 // use a truncate to move it from fp stack reg to xmm reg.
2514 bool RoundAfterCopy = false;
2515 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2516 isScalarFPTypeInSSEReg(VA.getValVT())) {
2517 if (!Subtarget.hasX87())
2518 report_fatal_error("X87 register return with X87 disabled");
2520 RoundAfterCopy = (CopyVT != VA.getLocVT());
2524 if (VA.needsCustom()) {
2525 assert(VA.getValVT() == MVT::v64i1 &&
2526 "Currently the only custom case is when we split v64i1 to 2 regs");
2528 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2530 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2532 Val = Chain.getValue(0);
2533 InFlag = Chain.getValue(2);
2537 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2538 // This truncation won't change the value.
2539 DAG.getIntPtrConstant(1, dl));
2541 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2542 if (VA.getValVT().isVector() &&
2543 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2544 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2545 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2546 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2548 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2551 InVals.push_back(Val);
2557 //===----------------------------------------------------------------------===//
2558 // C & StdCall & Fast Calling Convention implementation
2559 //===----------------------------------------------------------------------===//
2560 // The StdCall calling convention seems to be standard for many Windows API
2561 // routines. It differs from the C calling convention just a little: the
2562 // callee should clean up the stack, not the caller. Symbols should also be
2563 // decorated in some fancy way :) It doesn't support any vector arguments.
2564 // For info on the fast calling convention see the Fast Calling Convention
2565 // (tail call) implementation, LowerX86_32FastCCCallTo.
2567 /// CallIsStructReturn - Determines whether a call uses struct return semantics.
2569 enum StructReturnType { NotStructReturn, RegStructReturn, StackStructReturn };
2574 static StructReturnType
2575 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2577 return NotStructReturn;
2579 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2580 if (!Flags.isSRet())
2581 return NotStructReturn;
2582 if (Flags.isInReg() || IsMCU)
2583 return RegStructReturn;
2584 return StackStructReturn;
2587 /// Determines whether a function uses struct return semantics.
2588 static StructReturnType
2589 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2591 return NotStructReturn;
2593 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2594 if (!Flags.isSRet())
2595 return NotStructReturn;
2596 if (Flags.isInReg() || IsMCU)
2597 return RegStructReturn;
2598 return StackStructReturn;
2601 /// Make a copy of an aggregate at address specified by "Src" to address
2602 /// "Dst" with size and alignment information specified by the specific
2603 /// parameter attribute. The copy will be passed as a byval function parameter.
2604 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2605 SDValue Chain, ISD::ArgFlagsTy Flags,
2606 SelectionDAG &DAG, const SDLoc &dl) {
2607 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2609 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2610 /*isVolatile*/false, /*AlwaysInline=*/true,
2611 /*isTailCall*/false,
2612 MachinePointerInfo(), MachinePointerInfo());
2615 /// Return true if the calling convention is one that we can guarantee TCO for.
2616 static bool canGuaranteeTCO(CallingConv::ID CC) {
2617 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2618 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2619 CC == CallingConv::HHVM);
2622 /// Return true if we might ever do TCO for calls with this calling convention.
2623 static bool mayTailCallThisCC(CallingConv::ID CC) {
2625 // C calling conventions:
2626 case CallingConv::C:
2627 case CallingConv::X86_64_Win64:
2628 case CallingConv::X86_64_SysV:
2629 // Callee pop conventions:
2630 case CallingConv::X86_ThisCall:
2631 case CallingConv::X86_StdCall:
2632 case CallingConv::X86_VectorCall:
2633 case CallingConv::X86_FastCall:
2636 return canGuaranteeTCO(CC);
2640 /// Return true if the function is being made into a tailcall target by
2641 /// changing its ABI.
2642 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2643 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2646 bool X86TargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
2648 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2649 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2653 CallingConv::ID CalleeCC = CS.getCallingConv();
2654 if (!mayTailCallThisCC(CalleeCC))
2661 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2662 const SmallVectorImpl<ISD::InputArg> &Ins,
2663 const SDLoc &dl, SelectionDAG &DAG,
2664 const CCValAssign &VA,
2665 MachineFrameInfo &MFI, unsigned i) const {
2666 // Create the nodes corresponding to a load from this parameter slot.
2667 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2668 bool AlwaysUseMutable = shouldGuaranteeTCO(
2669 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2670 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2673 // If the value is passed by pointer, we have the address passed instead of
2674 // the value itself. No need to extend if the mask value and location share the same bit width.
2676 bool ExtendedInMem =
2677 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2678 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2680 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2681 ValVT = VA.getLocVT();
2683 ValVT = VA.getValVT();
2685 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2686 // taken by a return address.
2688 if (CallConv == CallingConv::X86_INTR) {
2689 const X86Subtarget& Subtarget =
2690 static_cast<const X86Subtarget&>(DAG.getSubtarget());
2691 // X86 interrupts may take one or two arguments.
2692 // On the stack there will be no return address as in a regular call.
2693 // The offset of the last argument needs to be set to -4/-8 bytes.
2694 // The offset of the first argument (out of two) should be set to 0 bytes.
2695 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2698 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2699 // changed with more analysis.
2700 // In case of tail call optimization, mark all arguments mutable, since they
2701 // could be overwritten by the lowering of arguments in case of a tail call.
2702 if (Flags.isByVal()) {
2703 unsigned Bytes = Flags.getByValSize();
2704 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2705 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2706 // Adjust SP offset of interrupt parameter.
2707 if (CallConv == CallingConv::X86_INTR) {
2708 MFI.setObjectOffset(FI, Offset);
2710 return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2712 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
2713 VA.getLocMemOffset(), isImmutable);
2715 // Set SExt or ZExt flag.
2716 if (VA.getLocInfo() == CCValAssign::ZExt) {
2717 MFI.setObjectZExt(FI, true);
2718 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2719 MFI.setObjectSExt(FI, true);
2722 // Adjust SP offset of interrupt parameter.
2723 if (CallConv == CallingConv::X86_INTR) {
2724 MFI.setObjectOffset(FI, Offset);
2727 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
2728 SDValue Val = DAG.getLoad(
2729 ValVT, dl, Chain, FIN,
2730 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2731 return ExtendedInMem ?
2732 DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
2736 // FIXME: Get this from tablegen.
2737 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2738 const X86Subtarget &Subtarget) {
2739 assert(Subtarget.is64Bit());
2741 if (Subtarget.isCallingConvWin64(CallConv)) {
2742 static const MCPhysReg GPR64ArgRegsWin64[] = {
2743 X86::RCX, X86::RDX, X86::R8, X86::R9
2745 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2748 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2749 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2751 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2754 // FIXME: Get this from tablegen.
2755 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2756 CallingConv::ID CallConv,
2757 const X86Subtarget &Subtarget) {
2758 assert(Subtarget.is64Bit());
2759 if (Subtarget.isCallingConvWin64(CallConv)) {
2760 // The XMM registers which might contain var arg parameters are shadowed
2761 // in their paired GPR. So we only need to save the GPR to their home slots.
2763 // TODO: __vectorcall will change this.
2767 const Function *Fn = MF.getFunction();
2768 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2769 bool isSoftFloat = Subtarget.useSoftFloat();
2770 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2771 "SSE register cannot be used when SSE is disabled!");
2772 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2773 // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2777 static const MCPhysReg XMMArgRegs64Bit[] = {
2778 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2779 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2781 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2784 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2785 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2786 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2787 return A.getValNo() < B.getValNo();
2791 SDValue X86TargetLowering::LowerFormalArguments(
2792 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2793 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2794 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2795 MachineFunction &MF = DAG.getMachineFunction();
2796 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2797 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2799 const Function *Fn = MF.getFunction();
2800 if (Fn->hasExternalLinkage() &&
2801 Subtarget.isTargetCygMing() &&
2802 Fn->getName() == "main")
2803 FuncInfo->setForceFramePointer(true);
2805 MachineFrameInfo &MFI = MF.getFrameInfo();
2806 bool Is64Bit = Subtarget.is64Bit();
2807 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2810 !(isVarArg && canGuaranteeTCO(CallConv)) &&
2811 "Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
2813 if (CallConv == CallingConv::X86_INTR) {
2814 bool isLegal = Ins.size() == 1 ||
2815 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2816 (!Is64Bit && Ins[1].VT == MVT::i32)));
2818 report_fatal_error("X86 interrupts may take one or two arguments");
2821 // Assign locations to all of the incoming arguments.
2822 SmallVector<CCValAssign, 16> ArgLocs;
2823 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2825 // Allocate shadow area for Win64.
2827 CCInfo.AllocateStack(32, 8);
2829 CCInfo.AnalyzeArguments(Ins, CC_X86);
2831 // In the vectorcall calling convention a second pass is required for the HVA (Homogeneous Vector Aggregate) arguments.
2833 if (CallingConv::X86_VectorCall == CallConv) {
2834 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2837 // The next loop assumes that the locations are in the same order as the specified arguments.
2839 if (!isSortedByValueNo(ArgLocs))
2840 llvm_unreachable("Argument Location list must be sorted before lowering");
2843 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2845 assert(InsIndex < Ins.size() && "Invalid Ins index");
2846 CCValAssign &VA = ArgLocs[I];
2848 if (VA.isRegLoc()) {
2849 EVT RegVT = VA.getLocVT();
2850 if (VA.needsCustom()) {
2852 VA.getValVT() == MVT::v64i1 &&
2853 "Currently the only custom case is when we split v64i1 to 2 regs");
2855 // In the regcall calling convention, v64i1 values that are
2856 // compiled for a 32-bit arch are split up into two registers.
2858 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2860 const TargetRegisterClass *RC;
2861 if (RegVT == MVT::i32)
2862 RC = &X86::GR32RegClass;
2863 else if (Is64Bit && RegVT == MVT::i64)
2864 RC = &X86::GR64RegClass;
2865 else if (RegVT == MVT::f32)
2866 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2867 else if (RegVT == MVT::f64)
2868 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2869 else if (RegVT == MVT::f80)
2870 RC = &X86::RFP80RegClass;
2871 else if (RegVT == MVT::f128)
2872 RC = &X86::FR128RegClass;
2873 else if (RegVT.is512BitVector())
2874 RC = &X86::VR512RegClass;
2875 else if (RegVT.is256BitVector())
2876 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2877 else if (RegVT.is128BitVector())
2878 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2879 else if (RegVT == MVT::x86mmx)
2880 RC = &X86::VR64RegClass;
2881 else if (RegVT == MVT::i1)
2882 RC = &X86::VK1RegClass;
2883 else if (RegVT == MVT::v8i1)
2884 RC = &X86::VK8RegClass;
2885 else if (RegVT == MVT::v16i1)
2886 RC = &X86::VK16RegClass;
2887 else if (RegVT == MVT::v32i1)
2888 RC = &X86::VK32RegClass;
2889 else if (RegVT == MVT::v64i1)
2890 RC = &X86::VK64RegClass;
2892 llvm_unreachable("Unknown argument type!");
2894 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2895 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2898 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2899 // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
2901 if (VA.getLocInfo() == CCValAssign::SExt)
2902 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2903 DAG.getValueType(VA.getValVT()));
2904 else if (VA.getLocInfo() == CCValAssign::ZExt)
2905 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
2906 DAG.getValueType(VA.getValVT()));
2907 else if (VA.getLocInfo() == CCValAssign::BCvt)
2908 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
2910 if (VA.isExtInLoc()) {
2911 // Handle MMX values passed in XMM regs.
2912 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
2913 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
2914 else if (VA.getValVT().isVector() &&
2915 VA.getValVT().getScalarType() == MVT::i1 &&
2916 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2917 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2918 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2919 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
2921 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
2924 assert(VA.isMemLoc());
2926 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
2929 // If value is passed via pointer - do a load.
2930 if (VA.getLocInfo() == CCValAssign::Indirect)
2932 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
2934 InVals.push_back(ArgValue);
2937 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
2938 // The Swift calling convention does not require us to copy the sret argument
2939 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
2940 if (CallConv == CallingConv::Swift)
2943 // All x86 ABIs require that for returning structs by value we copy the
2944 // sret argument into %rax/%eax (depending on ABI) for the return. Save
2945 // the argument into a virtual register so that we can access it from the return points.
2947 if (Ins[I].Flags.isSRet()) {
2948 unsigned Reg = FuncInfo->getSRetReturnReg();
2950 MVT PtrTy = getPointerTy(DAG.getDataLayout());
2951 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
2952 FuncInfo->setSRetReturnReg(Reg);
2954 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
2955 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
2960 unsigned StackSize = CCInfo.getNextStackOffset();
2961 // Align stack specially for tail calls.
2962 if (shouldGuaranteeTCO(CallConv,
2963 MF.getTarget().Options.GuaranteedTailCallOpt))
2964 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
2966 // If the function takes variable number of arguments, make a frame index for
2967 // the start of the first vararg value... for expansion of llvm.va_start. We
2968 // can skip this if there are no va_start calls.
2969 if (MFI.hasVAStart() &&
2970 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
2971 CallConv != CallingConv::X86_ThisCall))) {
2972 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
2975 // Figure out if XMM registers are in use.
2976 assert(!(Subtarget.useSoftFloat() &&
2977 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
2978 "SSE register cannot be used when SSE is disabled!");
2980 // 64-bit calling conventions support varargs and register parameters, so we
2981 // have to do extra work to spill them in the prologue.
2982 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
2983 // Find the first unallocated argument registers.
2984 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
2985 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
2986 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
2987 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
2988 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
2989 "SSE register cannot be used when SSE is disabled!");
2991 // Gather all the live in physical registers.
2992 SmallVector<SDValue, 6> LiveGPRs;
2993 SmallVector<SDValue, 8> LiveXMMRegs;
2995 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
2996 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
2998 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3000 if (!ArgXMMs.empty()) {
3001 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3002 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3003 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3004 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3005 LiveXMMRegs.push_back(
3006 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3011 // Get to the caller-allocated home save location. Add 8 to account
3012 // for the return address.
3013 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3014 FuncInfo->setRegSaveFrameIndex(
3015 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3016 // Fixup to set vararg frame on shadow area (4 x i64).
3018 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3020 // For X86-64, if there are vararg parameters that are passed via
3021 // registers, then we must store them to their spots on the stack so
3022 // they may be loaded by dereferencing the result of va_next.
3023 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3024 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3025 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3026 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
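// Note: for the SysV AMD64 ABI this is the usual 176-byte register save area:
// 6 GPRs x 8 bytes = 48 bytes followed by 8 XMM registers x 16 bytes = 128
// bytes, with the GP/FP offsets recorded above pointing at the first
// unallocated slot in each region.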
3029 // Store the integer parameter registers.
3030 SmallVector<SDValue, 8> MemOps;
3031 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3032 getPointerTy(DAG.getDataLayout()));
3033 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3034 for (SDValue Val : LiveGPRs) {
3035 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3036 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3038 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3039 MachinePointerInfo::getFixedStack(
3040 DAG.getMachineFunction(),
3041 FuncInfo->getRegSaveFrameIndex(), Offset));
3042 MemOps.push_back(Store);
3046 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3047 // Now store the XMM (fp + vector) parameter registers.
3048 SmallVector<SDValue, 12> SaveXMMOps;
3049 SaveXMMOps.push_back(Chain);
3050 SaveXMMOps.push_back(ALVal);
3051 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3052 FuncInfo->getRegSaveFrameIndex(), dl));
3053 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3054 FuncInfo->getVarArgsFPOffset(), dl));
3055 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3057 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3058 MVT::Other, SaveXMMOps));
3061 if (!MemOps.empty())
3062 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3065 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3066 // Find the largest legal vector type.
3067 MVT VecVT = MVT::Other;
3068 // FIXME: Only some x86_32 calling conventions support AVX512.
3069 if (Subtarget.hasAVX512() &&
3070 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3071 CallConv == CallingConv::Intel_OCL_BI)))
3072 VecVT = MVT::v16f32;
3073 else if (Subtarget.hasAVX())
3075 else if (Subtarget.hasSSE2())
3078 // We forward some GPRs and some vector types.
3079 SmallVector<MVT, 2> RegParmTypes;
3080 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3081 RegParmTypes.push_back(IntVT);
3082 if (VecVT != MVT::Other)
3083 RegParmTypes.push_back(VecVT);
3085 // Compute the set of forwarded registers. The rest are scratch.
3086 SmallVectorImpl<ForwardedRegister> &Forwards =
3087 FuncInfo->getForwardedMustTailRegParms();
3088 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3090 // Conservatively forward AL on x86_64, since it might be used for varargs.
3091 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3092 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3093 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3096 // Copy all forwards from physical to virtual registers.
3097 for (ForwardedRegister &F : Forwards) {
3098 // FIXME: Can we use a less constrained schedule?
3099 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3100 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3101 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3105 // Some CCs need callee pop.
3106 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3107 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3108 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3109 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3110 // X86 interrupts must pop the error code if present
3111 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
3113 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3114 // If this is an sret function, the return should pop the hidden pointer.
3115 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3116 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3117 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3118 FuncInfo->setBytesToPopOnReturn(4);
3122 // RegSaveFrameIndex is X86-64 only.
3123 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3124 if (CallConv == CallingConv::X86_FastCall ||
3125 CallConv == CallingConv::X86_ThisCall)
3126 // fastcall and thiscall functions can't have varargs.
3127 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3130 FuncInfo->setArgumentStackSize(StackSize);
3132 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3133 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3134 if (Personality == EHPersonality::CoreCLR) {
3136 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3137 // that we'd prefer this slot be allocated towards the bottom of the frame
3138 // (i.e. near the stack pointer after allocating the frame). Every
3139 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3140 // offset from the bottom of this and each funclet's frame must be the
3141 // same, so the size of funclets' (mostly empty) frames is dictated by
3142 // how far this slot is from the bottom (since they allocate just enough
3143 // space to accommodate holding this slot at the correct offset).
3144 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3145 EHInfo->PSPSymFrameIdx = PSPSymFI;
3152 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3153 SDValue Arg, const SDLoc &dl,
3155 const CCValAssign &VA,
3156 ISD::ArgFlagsTy Flags) const {
3157 unsigned LocMemOffset = VA.getLocMemOffset();
3158 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3159 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3161 if (Flags.isByVal())
3162 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3164 return DAG.getStore(
3165 Chain, dl, Arg, PtrOff,
3166 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3169 /// Emit a load of return address if tail call
3170 /// optimization is performed and it is required.
3171 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3172 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3173 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3174 // Adjust the Return address stack slot.
3175 EVT VT = getPointerTy(DAG.getDataLayout());
3176 OutRetAddr = getReturnAddressFrameIndex(DAG);
3178 // Load the "old" Return address.
3179 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3180 return SDValue(OutRetAddr.getNode(), 1);
3183 /// Emit a store of the return address if tail call
3184 /// optimization is performed and it is required (FPDiff!=0).
3185 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3186 SDValue Chain, SDValue RetAddrFrIdx,
3187 EVT PtrVT, unsigned SlotSize,
3188 int FPDiff, const SDLoc &dl) {
3189 // Store the return address to the appropriate stack slot.
3190 if (!FPDiff) return Chain;
3191 // Calculate the new stack slot for the return address.
3192 int NewReturnAddrFI =
3193 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3195 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3196 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3197 MachinePointerInfo::getFixedStack(
3198 DAG.getMachineFunction(), NewReturnAddrFI));
3202 /// Returns a vector_shuffle mask for a movs{s|d} or movd
3203 /// operation of the specified width.
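/// For example, with a 4-element type the returned mask is <4, 1, 2, 3>:
/// lane 0 is taken from V2 and the remaining lanes from V1, matching the
/// movss/movsd semantics.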
3204 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3206 unsigned NumElems = VT.getVectorNumElements();
3207 SmallVector<int, 8> Mask;
3208 Mask.push_back(NumElems);
3209 for (unsigned i = 1; i != NumElems; ++i)
3211 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3215 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3216 SmallVectorImpl<SDValue> &InVals) const {
3217 SelectionDAG &DAG = CLI.DAG;
3219 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3220 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3221 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3222 SDValue Chain = CLI.Chain;
3223 SDValue Callee = CLI.Callee;
3224 CallingConv::ID CallConv = CLI.CallConv;
3225 bool &isTailCall = CLI.IsTailCall;
3226 bool isVarArg = CLI.IsVarArg;
3228 MachineFunction &MF = DAG.getMachineFunction();
3229 bool Is64Bit = Subtarget.is64Bit();
3230 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3231 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3232 bool IsSibcall = false;
3233 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3234 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3236 if (CallConv == CallingConv::X86_INTR)
3237 report_fatal_error("X86 interrupts may not be called directly");
3239 if (Attr.getValueAsString() == "true")
3242 if (Subtarget.isPICStyleGOT() &&
3243 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3244 // If we are using a GOT, disable tail calls to external symbols with
3245 // default visibility. Tail calling such a symbol requires using a GOT
3246 // relocation, which forces early binding of the symbol. This breaks code
3247 // that requires lazy function symbol resolution. Using musttail or
3248 // GuaranteedTailCallOpt will override this.
3249 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3250 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3251 G->getGlobal()->hasDefaultVisibility()))
3255 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3257 // Force this to be a tail call. The verifier rules are enough to ensure
3258 // that we can lower this successfully without moving the return address around.
3261 } else if (isTailCall) {
3262 // Check if it's really possible to do a tail call.
3263 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3264 isVarArg, SR != NotStructReturn,
3265 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3266 Outs, OutVals, Ins, DAG);
3268 // Sibcalls are automatically detected tail calls which do not require ABI changes.
3270 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3277 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3278 "Var args not supported with calling convention fastcc, ghc or hipe");
3280 // Analyze operands of the call, assigning locations to each operand.
3281 SmallVector<CCValAssign, 16> ArgLocs;
3282 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3284 // Allocate shadow area for Win64.
3286 CCInfo.AllocateStack(32, 8);
3288 CCInfo.AnalyzeArguments(Outs, CC_X86);
3290 // In the vectorcall calling convention a second pass is required for the HVA (Homogeneous Vector Aggregate) arguments.
3292 if (CallingConv::X86_VectorCall == CallConv) {
3293 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3296 // Get a count of how many bytes are to be pushed on the stack.
3297 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3299 // This is a sibcall. The memory operands are available in the caller's
3300 // own caller's stack frame.
3302 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3303 canGuaranteeTCO(CallConv))
3304 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3307 if (isTailCall && !IsSibcall && !IsMustTail) {
3308 // Lower arguments at fp - stackoffset + fpdiff.
3309 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3311 FPDiff = NumBytesCallerPushed - NumBytes;
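// A negative FPDiff means the callee requires more argument stack space than
// the caller provides, so the return address slot must be moved down by that
// amount (see EmitTailCallStoreRetAddr below).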
3313 // Record the delta of movement of the return address stack slot,
3314 // but only if this delta moves it further than any previously recorded one.
3315 if (FPDiff < X86Info->getTCReturnAddrDelta())
3316 X86Info->setTCReturnAddrDelta(FPDiff);
3319 unsigned NumBytesToPush = NumBytes;
3320 unsigned NumBytesToPop = NumBytes;
3322 // If we have an inalloca argument, all stack space has already been allocated
3323 // for us and will be right at the top of the stack. We don't support multiple
3324 // arguments passed in memory when using inalloca.
3325 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3327 if (!ArgLocs.back().isMemLoc())
3328 report_fatal_error("cannot use inalloca attribute on a register "
3330 if (ArgLocs.back().getLocMemOffset() != 0)
3331 report_fatal_error("any parameter with the inalloca attribute must be "
3332 "the only memory argument");
3336 Chain = DAG.getCALLSEQ_START(
3337 Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl);
3339 SDValue RetAddrFrIdx;
3340 // Load return address for tail calls.
3341 if (isTailCall && FPDiff)
3342 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3343 Is64Bit, FPDiff, dl);
3345 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3346 SmallVector<SDValue, 8> MemOpChains;
3349 // The next loop assumes that the locations are in the same order as the specified arguments.
3351 if (!isSortedByValueNo(ArgLocs))
3352 llvm_unreachable("Argument Location list must be sorted before lowering");
3354 // Walk the register/memloc assignments, inserting copies/loads. In the case
3355 // of tail call optimization, arguments are handled later.
3356 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3357 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3359 assert(OutIndex < Outs.size() && "Invalid Out index");
3360 // Skip inalloca arguments, they have already been written.
3361 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3362 if (Flags.isInAlloca())
3365 CCValAssign &VA = ArgLocs[I];
3366 EVT RegVT = VA.getLocVT();
3367 SDValue Arg = OutVals[OutIndex];
3368 bool isByVal = Flags.isByVal();
3370 // Promote the value if needed.
3371 switch (VA.getLocInfo()) {
3372 default: llvm_unreachable("Unknown loc info!");
3373 case CCValAssign::Full: break;
3374 case CCValAssign::SExt:
3375 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3377 case CCValAssign::ZExt:
3378 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3380 case CCValAssign::AExt:
3381 if (Arg.getValueType().isVector() &&
3382 Arg.getValueType().getVectorElementType() == MVT::i1)
3383 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3384 else if (RegVT.is128BitVector()) {
3385 // Special case: passing MMX values in XMM registers.
3386 Arg = DAG.getBitcast(MVT::i64, Arg);
3387 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3388 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3390 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3392 case CCValAssign::BCvt:
3393 Arg = DAG.getBitcast(RegVT, Arg);
3395 case CCValAssign::Indirect: {
3396 // Store the argument.
3397 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3398 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3399 Chain = DAG.getStore(
3400 Chain, dl, Arg, SpillSlot,
3401 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3407 if (VA.needsCustom()) {
3408 assert(VA.getValVT() == MVT::v64i1 &&
3409 "Currently the only custom case is when we split v64i1 to 2 regs");
3410 // Split v64i1 value into two registers
3411 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3413 } else if (VA.isRegLoc()) {
3414 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3415 if (isVarArg && IsWin64) {
3416 // The Win64 ABI requires an argument XMM register to be copied to the
3417 // corresponding shadow GPR if the callee is a varargs function.
3418 unsigned ShadowReg = 0;
3419 switch (VA.getLocReg()) {
3420 case X86::XMM0: ShadowReg = X86::RCX; break;
3421 case X86::XMM1: ShadowReg = X86::RDX; break;
3422 case X86::XMM2: ShadowReg = X86::R8; break;
3423 case X86::XMM3: ShadowReg = X86::R9; break;
3426 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3428 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3429 assert(VA.isMemLoc());
3430 if (!StackPtr.getNode())
3431 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3432 getPointerTy(DAG.getDataLayout()));
3433 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3434 dl, DAG, VA, Flags));
3438 if (!MemOpChains.empty())
3439 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3441 if (Subtarget.isPICStyleGOT()) {
3442 // ELF / PIC requires GOT in the EBX register before function calls via PLT GOT pointer.
3445 RegsToPass.push_back(std::make_pair(
3446 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3447 getPointerTy(DAG.getDataLayout()))));
3449 // If we are tail calling and generating PIC/GOT style code load the
3450 // address of the callee into ECX. The value in ecx is used as target of
3451 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3452 // for tail calls on PIC/GOT architectures. Normally we would just put the
3453 // address of GOT into ebx and then call target@PLT. But for tail calls
3454 // ebx would be restored (since ebx is callee saved) before jumping to the callee.
3457 // Note: The actual moving to ECX is done further down.
3458 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3459 if (G && !G->getGlobal()->hasLocalLinkage() &&
3460 G->getGlobal()->hasDefaultVisibility())
3461 Callee = LowerGlobalAddress(Callee, DAG);
3462 else if (isa<ExternalSymbolSDNode>(Callee))
3463 Callee = LowerExternalSymbol(Callee, DAG);
3467 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3468 // From AMD64 ABI document:
3469 // For calls that may call functions that use varargs or stdargs
3470 // (prototype-less calls or calls to functions containing ellipsis (...) in
3471 // the declaration) %al is used as hidden argument to specify the number
3472 // of SSE registers used. The contents of %al do not need to match exactly
3473 // the number of registers, but must be an upper bound on the number of SSE
3474 // registers used and is in the range 0 - 8 inclusive.
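// For example, a printf-style call passing a single double in %xmm0 leaves
// NumXMMRegs == 1 below, so %al is set to 1 before the call.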
3476 // Count the number of XMM registers allocated.
3477 static const MCPhysReg XMMArgRegs[] = {
3478 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3479 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3481 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3482 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3483 && "SSE registers cannot be used when SSE is disabled");
3485 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3486 DAG.getConstant(NumXMMRegs, dl,
3490 if (isVarArg && IsMustTail) {
3491 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3492 for (const auto &F : Forwards) {
3493 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3494 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3498 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3499 // don't need this because the eligibility check rejects calls that require
3500 // shuffling arguments passed in memory.
3501 if (!IsSibcall && isTailCall) {
3502 // Force all the incoming stack arguments to be loaded from the stack
3503 // before any new outgoing arguments are stored to the stack, because the
3504 // outgoing stack slots may alias the incoming argument stack slots, and
3505 // the alias isn't otherwise explicit. This is slightly more conservative
3506 // than necessary, because it means that each store effectively depends
3507 // on every argument instead of just those arguments it would clobber.
3508 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3510 SmallVector<SDValue, 8> MemOpChains2;
3513 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3515 CCValAssign &VA = ArgLocs[I];
3517 if (VA.isRegLoc()) {
3518 if (VA.needsCustom()) {
3519 assert((CallConv == CallingConv::X86_RegCall) &&
3520 "Expecting custome case only in regcall calling convention");
3521 // This means that we are in a special case where one argument was
3522 // passed through two register locations - skip the next location.
3529 assert(VA.isMemLoc());
3530 SDValue Arg = OutVals[OutsIndex];
3531 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3532 // Skip inalloca arguments. They don't require any work.
3533 if (Flags.isInAlloca())
3535 // Create frame index.
3536 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3537 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3538 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3539 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3541 if (Flags.isByVal()) {
3542 // Copy relative to framepointer.
3543 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3544 if (!StackPtr.getNode())
3545 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3546 getPointerTy(DAG.getDataLayout()));
3547 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3550 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3554 // Store relative to framepointer.
3555 MemOpChains2.push_back(DAG.getStore(
3556 ArgChain, dl, Arg, FIN,
3557 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3561 if (!MemOpChains2.empty())
3562 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3564 // Store the return address to the appropriate stack slot.
3565 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3566 getPointerTy(DAG.getDataLayout()),
3567 RegInfo->getSlotSize(), FPDiff, dl);
3570 // Build a sequence of copy-to-reg nodes chained together with token chain
3571 // and flag operands which copy the outgoing args into registers.
3573 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3574 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3575 RegsToPass[i].second, InFlag);
3576 InFlag = Chain.getValue(1);
3579 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3580 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3581 // In the 64-bit large code model, we have to make all calls
3582 // through a register, since the call instruction's 32-bit
3583 // pc-relative offset may not be large enough to hold the whole address.
3585 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3586 // If the callee is a GlobalAddress node (quite common, every direct call
3587 // is) turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3589 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3591 // We should use extra load for direct calls to dllimported functions in non-JIT mode.
3593 const GlobalValue *GV = G->getGlobal();
3594 if (!GV->hasDLLImportStorageClass()) {
3595 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3597 Callee = DAG.getTargetGlobalAddress(
3598 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3600 if (OpFlags == X86II::MO_GOTPCREL) {
3602 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3603 getPointerTy(DAG.getDataLayout()), Callee);
3604 // Add extra indirection
3605 Callee = DAG.getLoad(
3606 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3607 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3610 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3611 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3612 unsigned char OpFlags =
3613 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3615 Callee = DAG.getTargetExternalSymbol(
3616 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3617 } else if (Subtarget.isTarget64BitILP32() &&
3618 Callee->getValueType(0) == MVT::i32) {
3619 // Zero-extend the 32-bit Callee address into a 64-bit one according to the x32 ABI.
3620 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3623 // Returns a chain & a flag for retval copy to use.
3624 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3625 SmallVector<SDValue, 8> Ops;
3627 if (!IsSibcall && isTailCall) {
3628 Chain = DAG.getCALLSEQ_END(Chain,
3629 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3630 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3631 InFlag = Chain.getValue(1);
3634 Ops.push_back(Chain);
3635 Ops.push_back(Callee);
3638 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3640 // Add argument registers to the end of the list so that they are known live into the call.
3642 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3643 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3644 RegsToPass[i].second.getValueType()));
3646 // Add a register mask operand representing the call-preserved registers.
3647 const uint32_t *Mask = RegInfo->getCallPreservedMask(MF, CallConv);
3648 assert(Mask && "Missing call preserved mask for calling convention");
3650 // If this is an invoke in a 32-bit function using a funclet-based
3651 // personality, assume the function clobbers all registers. If an exception
3652 // is thrown, the runtime will not restore CSRs.
3653 // FIXME: Model this more precisely so that we can register allocate across
3654 // the normal edge and spill and fill across the exceptional edge.
3655 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3656 const Function *CallerFn = MF.getFunction();
3657 EHPersonality Pers =
3658 CallerFn->hasPersonalityFn()
3659 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3660 : EHPersonality::Unknown;
3661 if (isFuncletEHPersonality(Pers))
3662 Mask = RegInfo->getNoPreservedMask();
3665 Ops.push_back(DAG.getRegisterMask(Mask));
3667 if (InFlag.getNode())
3668 Ops.push_back(InFlag);
3672 //// If this is the first return lowered for this function, add the regs
3673 //// to the liveout set for the function.
3674 // This isn't right, although it's probably harmless on x86; liveouts
3675 // should be computed from returns not tail calls. Consider a void
3676 // function making a tail call to a function returning int.
3677 MF.getFrameInfo().setHasTailCall();
3678 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3681 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3682 InFlag = Chain.getValue(1);
3684 // Create the CALLSEQ_END node.
3685 unsigned NumBytesForCalleeToPop;
3686 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3687 DAG.getTarget().Options.GuaranteedTailCallOpt))
3688 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3689 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3690 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3691 SR == StackStructReturn)
3692 // If this is a call to a struct-return function, the callee
3693 // pops the hidden struct pointer, so we have to push it back.
3694 // This is common for Darwin/X86, Linux & Mingw32 targets.
3695 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3696 NumBytesForCalleeToPop = 4;
3698 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3700 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3701 // No need to reset the stack after the call if the call doesn't return. To
3702 // keep the MI verifier happy, we'll pretend the callee does it for us.
3703 NumBytesForCalleeToPop = NumBytes;
3706 // Returns a flag for retval copy to use.
3708 Chain = DAG.getCALLSEQ_END(Chain,
3709 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3710 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3713 InFlag = Chain.getValue(1);
3716 // Handle result values, copying them out of physregs into vregs that we return.
3718 return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
3719 Ins, dl, DAG, InVals);
3722 //===----------------------------------------------------------------------===//
3723 // Fast Calling Convention (tail call) implementation
3724 //===----------------------------------------------------------------------===//
3726 // Like stdcall, the callee cleans the arguments, except that ECX is
3727 // reserved for storing the tail called function address. Only 2 registers are
3728 // free for argument passing (inreg). Tail call optimization is performed
3729 // provided:
3730 // * tailcallopt is enabled
3731 // * caller/callee are fastcc
3732 // On X86_64 architecture with GOT-style position independent code only local
3733 // (within module) calls are supported at the moment.
3734 // To keep the stack aligned according to the platform ABI, the function
3735 // GetAlignedArgumentStackSize ensures that the argument delta is always a
3736 // multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example.)
3737 // If a tail-called function (callee) has more arguments than the caller, the
3738 // caller needs to make sure that there is room to move the RETADDR to. This is
3739 // achieved by reserving an area the size of the argument delta right after the
3740 // original RETADDR, but before the saved framepointer or the spilled registers,
3741 // e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
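// An illustrative stack layout for that case (higher addresses on top):
//   arg1
//   arg2
//   RETADDR
//   [ new RETADDR
//     move area ]
//   (possible EBP)
//   ESI
//   EDI
//   EBX
//   EBP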
3753 /// Make the stack size aligned, e.g. 16n + 12, for a 16-byte alignment requirement.
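/// As an illustrative example, assuming a 16-byte stack alignment and a 4-byte
/// slot size: a StackSize of 20 or 28 is rounded up to 28, and 29 is rounded up
/// to 44, so the result always has the form 16n + 12.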
3756 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3757 SelectionDAG& DAG) const {
3758 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3759 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3760 unsigned StackAlignment = TFI.getStackAlignment();
3761 uint64_t AlignMask = StackAlignment - 1;
3762 int64_t Offset = StackSize;
3763 unsigned SlotSize = RegInfo->getSlotSize();
3764 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3765 // The offset's low bits are at most StackAlignment - SlotSize (e.g. 12), so just add the difference.
3766 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3768 // Mask out the lower bits, then add the stack alignment once plus StackAlignment - SlotSize bytes.
3769 Offset = ((~AlignMask) & Offset) + StackAlignment +
3770 (StackAlignment-SlotSize);
3775 /// Return true if the given stack call argument is already available in the
3776 /// same position (relatively) of the caller's incoming argument stack.
3778 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3779 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3780 const X86InstrInfo *TII, const CCValAssign &VA) {
3781 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3784 // Look through nodes that don't alter the bits of the incoming value.
3785 unsigned Op = Arg.getOpcode();
3786 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3787 Arg = Arg.getOperand(0);
3790 if (Op == ISD::TRUNCATE) {
3791 const SDValue &TruncInput = Arg.getOperand(0);
3792 if (TruncInput.getOpcode() == ISD::AssertZext &&
3793 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3794 Arg.getValueType()) {
3795 Arg = TruncInput.getOperand(0);
3803 if (Arg.getOpcode() == ISD::CopyFromReg) {
3804 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3805 if (!TargetRegisterInfo::isVirtualRegister(VR))
3807 MachineInstr *Def = MRI->getVRegDef(VR);
3810 if (!Flags.isByVal()) {
3811 if (!TII->isLoadFromStackSlot(*Def, FI))
3814 unsigned Opcode = Def->getOpcode();
3815 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3816 Opcode == X86::LEA64_32r) &&
3817 Def->getOperand(1).isFI()) {
3818 FI = Def->getOperand(1).getIndex();
3819 Bytes = Flags.getByValSize();
3823 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3824 if (Flags.isByVal())
3825 // ByVal argument is passed in as a pointer but it's now being
3826 // dereferenced. e.g.
3827 // define @foo(%struct.X* %A) {
3828 // tail call @bar(%struct.X* byval %A)
3831 SDValue Ptr = Ld->getBasePtr();
3832 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3835 FI = FINode->getIndex();
3836 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3837 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3838 FI = FINode->getIndex();
3839 Bytes = Flags.getByValSize();
3843 assert(FI != INT_MAX);
3844 if (!MFI.isFixedObjectIndex(FI))
3847 if (Offset != MFI.getObjectOffset(FI))
3850 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3851 // If the argument location is wider than the argument type, check that any
3852 // extension flags match.
3853 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
3854 Flags.isSExt() != MFI.isObjectSExt(FI)) {
3859 return Bytes == MFI.getObjectSize(FI);
3862 /// Check whether the call is eligible for tail call optimization. Targets
3863 /// that want to do tail call optimization should implement this function.
3864 bool X86TargetLowering::IsEligibleForTailCallOptimization(
3865 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
3866 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
3867 const SmallVectorImpl<ISD::OutputArg> &Outs,
3868 const SmallVectorImpl<SDValue> &OutVals,
3869 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
3870 if (!mayTailCallThisCC(CalleeCC))
3873 // If -tailcallopt is specified, make fastcc functions tail-callable.
3874 MachineFunction &MF = DAG.getMachineFunction();
3875 const Function *CallerF = MF.getFunction();
3877 // If the function return type is x86_fp80 and the callee return type is not,
3878 // then the FP_EXTEND of the call result is not a nop. It's not safe to
3879 // perform a tailcall optimization here.
3880 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
3883 CallingConv::ID CallerCC = CallerF->getCallingConv();
3884 bool CCMatch = CallerCC == CalleeCC;
3885 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
3886 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
3888 // Win64 functions have extra shadow space for argument homing. Don't do the
3889 // sibcall if the caller and callee have mismatched expectations for this space.
3891 if (IsCalleeWin64 != IsCallerWin64)
3894 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
3895 if (canGuaranteeTCO(CalleeCC) && CCMatch)
3900 // Look for obvious safe cases to perform tail call optimization that do not
3901 // require ABI changes. This is what gcc calls sibcall.
3903 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
3904 // emit a special epilogue.
3905 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3906 if (RegInfo->needsStackRealignment(MF))
3909 // Also avoid sibcall optimization if either caller or callee uses struct
3910 // return semantics.
3911 if (isCalleeStructRet || isCallerStructRet)
3914 // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
3916 LLVMContext &C = *DAG.getContext();
3917 if (isVarArg && !Outs.empty()) {
3918 // Optimizing for varargs on Win64 is unlikely to be safe without
3919 // additional testing.
3920 if (IsCalleeWin64 || IsCallerWin64)
3923 SmallVector<CCValAssign, 16> ArgLocs;
3924 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3926 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3927 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
3928 if (!ArgLocs[i].isRegLoc())
3932 // If the call result is in ST0 / ST1, it needs to be popped off the x87
3933 // stack. Therefore, if it's not used by the call it is not safe to optimize
3934 // this into a sibcall.
3935 bool Unused = false;
3936 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
3943 SmallVector<CCValAssign, 16> RVLocs;
3944 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
3945 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
3946 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
3947 CCValAssign &VA = RVLocs[i];
3948 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
3953 // Check that the call results are passed in the same way.
3954 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
3955 RetCC_X86, RetCC_X86))
3957 // The callee has to preserve all registers the caller needs to preserve.
3958 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
3959 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
3961 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
3962 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
3966 unsigned StackArgsSize = 0;
3968 // If the callee takes no arguments then go on to check the results of the call.
3970 if (!Outs.empty()) {
3971 // Check if stack adjustment is needed. For now, do not do this if any
3972 // argument is passed on the stack.
3973 SmallVector<CCValAssign, 16> ArgLocs;
3974 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
3976 // Allocate shadow area for Win64
3978 CCInfo.AllocateStack(32, 8);
3980 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
3981 StackArgsSize = CCInfo.getNextStackOffset();
3983 if (CCInfo.getNextStackOffset()) {
3984 // Check whether the arguments are already laid out in the same way as
3985 // the caller's fixed stack objects.
3986 MachineFrameInfo &MFI = MF.getFrameInfo();
3987 const MachineRegisterInfo *MRI = &MF.getRegInfo();
3988 const X86InstrInfo *TII = Subtarget.getInstrInfo();
3989 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
3990 CCValAssign &VA = ArgLocs[i];
3991 SDValue Arg = OutVals[i];
3992 ISD::ArgFlagsTy Flags = Outs[i].Flags;
3993 if (VA.getLocInfo() == CCValAssign::Indirect)
3995 if (!VA.isRegLoc()) {
3996 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4003 bool PositionIndependent = isPositionIndependent();
4004 // If the tailcall address may be in a register, then make sure it's
4005 // possible to register allocate for it. In 32-bit, the call address can
4006 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4007 // callee-saved registers are restored. These happen to be the same
4008 // registers used to pass 'inreg' arguments so watch out for those.
4009 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4010 !isa<ExternalSymbolSDNode>(Callee)) ||
4011 PositionIndependent)) {
4012 unsigned NumInRegs = 0;
4013 // In PIC we need an extra register to formulate the address computation for the callee.
4015 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4017 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4018 CCValAssign &VA = ArgLocs[i];
4021 unsigned Reg = VA.getLocReg();
4024 case X86::EAX: case X86::EDX: case X86::ECX:
4025 if (++NumInRegs == MaxInRegs)
4032 const MachineRegisterInfo &MRI = MF.getRegInfo();
4033 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4037 bool CalleeWillPop =
4038 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4039 MF.getTarget().Options.GuaranteedTailCallOpt);
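// For example, a stdcall caller that must pop 12 bytes on return can only tail
// call a callee that itself pops exactly those 12 bytes of stack arguments;
// the checks below enforce this.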
4041 if (unsigned BytesToPop =
4042 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4043 // If we have bytes to pop, the callee must pop them.
4044 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4045 if (!CalleePopMatches)
4047 } else if (CalleeWillPop && StackArgsSize > 0) {
4048 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4056 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4057 const TargetLibraryInfo *libInfo) const {
4058 return X86::createFastISel(funcInfo, libInfo);
4061 //===----------------------------------------------------------------------===//
4062 // Other Lowering Hooks
4063 //===----------------------------------------------------------------------===//
4065 static bool MayFoldLoad(SDValue Op) {
4066 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4069 static bool MayFoldIntoStore(SDValue Op) {
4070 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4073 static bool MayFoldIntoZeroExtend(SDValue Op) {
4074 if (Op.hasOneUse()) {
4075 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4076 return (ISD::ZERO_EXTEND == Opcode);
4081 static bool isTargetShuffle(unsigned Opcode) {
4083 default: return false;
4084 case X86ISD::BLENDI:
4085 case X86ISD::PSHUFB:
4086 case X86ISD::PSHUFD:
4087 case X86ISD::PSHUFHW:
4088 case X86ISD::PSHUFLW:
4090 case X86ISD::INSERTPS:
4091 case X86ISD::PALIGNR:
4092 case X86ISD::VSHLDQ:
4093 case X86ISD::VSRLDQ:
4094 case X86ISD::MOVLHPS:
4095 case X86ISD::MOVLHPD:
4096 case X86ISD::MOVHLPS:
4097 case X86ISD::MOVLPS:
4098 case X86ISD::MOVLPD:
4099 case X86ISD::MOVSHDUP:
4100 case X86ISD::MOVSLDUP:
4101 case X86ISD::MOVDDUP:
4104 case X86ISD::UNPCKL:
4105 case X86ISD::UNPCKH:
4106 case X86ISD::VBROADCAST:
4107 case X86ISD::VPERMILPI:
4108 case X86ISD::VPERMILPV:
4109 case X86ISD::VPERM2X128:
4110 case X86ISD::VPERMIL2:
4111 case X86ISD::VPERMI:
4112 case X86ISD::VPPERM:
4113 case X86ISD::VPERMV:
4114 case X86ISD::VPERMV3:
4115 case X86ISD::VPERMIV3:
4116 case X86ISD::VZEXT_MOVL:
4121 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4123 default: return false;
4125 case X86ISD::PSHUFB:
4126 case X86ISD::VPERMILPV:
4127 case X86ISD::VPERMIL2:
4128 case X86ISD::VPPERM:
4129 case X86ISD::VPERMV:
4130 case X86ISD::VPERMV3:
4131 case X86ISD::VPERMIV3:
4133 // 'Faux' Target Shuffles.
4139 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4140 MachineFunction &MF = DAG.getMachineFunction();
4141 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4142 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4143 int ReturnAddrIndex = FuncInfo->getRAIndex();
4145 if (ReturnAddrIndex == 0) {
4146 // Set up a frame object for the return address.
4147 unsigned SlotSize = RegInfo->getSlotSize();
4148 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4151 FuncInfo->setRAIndex(ReturnAddrIndex);
4154 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4157 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4158 bool hasSymbolicDisplacement) {
4159 // Offset should fit into 32 bit immediate field.
4160 if (!isInt<32>(Offset))
4163 // If we don't have a symbolic displacement - we don't have any extra restrictions.
4165 if (!hasSymbolicDisplacement)
4168 // FIXME: Some tweaks might be needed for medium code model.
4169 if (M != CodeModel::Small && M != CodeModel::Kernel)
4172 // For the small code model, we assume that the last object is within 16MB of
4173 // the end of the 31-bit boundary. We may also accept fairly large negative
4174 // constants, knowing that all objects are in the positive half of the address space.
4175 if (M == CodeModel::Small && Offset < 16*1024*1024)
4178 // For the kernel code model, we know that all objects reside in the negative
4179 // half of the 32-bit address space. We must not accept negative offsets, since
4180 // they may push us out of range, but we may accept fairly large positive ones.
4181 if (M == CodeModel::Kernel && Offset >= 0)
4187 /// Determines whether the callee is required to pop its own arguments.
4188 /// Callee pop is necessary to support tail calls.
4189 bool X86::isCalleePop(CallingConv::ID CallingConv,
4190 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4191 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4192 // can guarantee TCO.
4193 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4196 switch (CallingConv) {
4199 case CallingConv::X86_StdCall:
4200 case CallingConv::X86_FastCall:
4201 case CallingConv::X86_ThisCall:
4202 case CallingConv::X86_VectorCall:
4207 /// \brief Return true if the condition is an unsigned comparison operation.
4208 static bool isX86CCUnsigned(unsigned X86CC) {
4211 llvm_unreachable("Invalid integer condition!");
4227 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4228 switch (SetCCOpcode) {
4229 default: llvm_unreachable("Invalid integer condition!");
4230 case ISD::SETEQ: return X86::COND_E;
4231 case ISD::SETGT: return X86::COND_G;
4232 case ISD::SETGE: return X86::COND_GE;
4233 case ISD::SETLT: return X86::COND_L;
4234 case ISD::SETLE: return X86::COND_LE;
4235 case ISD::SETNE: return X86::COND_NE;
4236 case ISD::SETULT: return X86::COND_B;
4237 case ISD::SETUGT: return X86::COND_A;
4238 case ISD::SETULE: return X86::COND_BE;
4239 case ISD::SETUGE: return X86::COND_AE;
4243 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4244 /// condition code, returning the condition code and the LHS/RHS of the
4245 /// comparison to make.
4246 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4247 bool isFP, SDValue &LHS, SDValue &RHS,
4248 SelectionDAG &DAG) {
4250 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4251 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4252 // X > -1 -> X == 0, jump !sign.
4253 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4254 return X86::COND_NS;
4256 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4257 // X < 0 -> X == 0, jump on sign.
4260 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
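// X < 1 -> X <= 0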
4262 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4263 return X86::COND_LE;
4267 return TranslateIntegerX86CC(SetCCOpcode);
4270 // First determine if it is required or is profitable to flip the operands.
4272 // If LHS is a foldable load, but RHS is not, flip the condition.
4273 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4274 !ISD::isNON_EXTLoad(RHS.getNode())) {
4275 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4276 std::swap(LHS, RHS);
4279 switch (SetCCOpcode) {
4285 std::swap(LHS, RHS);
4289 // On a floating point condition, the flags are set as follows:
4290 //  ZF | PF | CF | op
4291 //   0 |  0 |  0 | X > Y
4292 //   0 |  0 |  1 | X < Y
4293 //   1 |  0 |  0 | X == Y
4294 //   1 |  1 |  1 | unordered
4295 switch (SetCCOpcode) {
4296 default: llvm_unreachable("Condcode should be pre-legalized away");
4298 case ISD::SETEQ: return X86::COND_E;
4299 case ISD::SETOLT: // flipped
4301 case ISD::SETGT: return X86::COND_A;
4302 case ISD::SETOLE: // flipped
4304 case ISD::SETGE: return X86::COND_AE;
4305 case ISD::SETUGT: // flipped
4307 case ISD::SETLT: return X86::COND_B;
4308 case ISD::SETUGE: // flipped
4310 case ISD::SETLE: return X86::COND_BE;
4312 case ISD::SETNE: return X86::COND_NE;
4313 case ISD::SETUO: return X86::COND_P;
4314 case ISD::SETO: return X86::COND_NP;
4316 case ISD::SETUNE: return X86::COND_INVALID;
4320 /// Is there a floating point cmov for the specific X86 condition code?
4321 /// Current x86 isa includes the following FP cmov instructions:
4322 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4323 static bool hasFPCMov(unsigned X86CC) {
4340 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4342 unsigned Intrinsic) const {
4344 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4348 Info.opc = ISD::INTRINSIC_W_CHAIN;
4349 Info.readMem = false;
4350 Info.writeMem = false;
4354 switch (IntrData->Type) {
4355 case EXPAND_FROM_MEM: {
4356 Info.ptrVal = I.getArgOperand(0);
4357 Info.memVT = MVT::getVT(I.getType());
4359 Info.readMem = true;
4362 case COMPRESS_TO_MEM: {
4363 Info.ptrVal = I.getArgOperand(0);
4364 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4366 Info.writeMem = true;
4369 case TRUNCATE_TO_MEM_VI8:
4370 case TRUNCATE_TO_MEM_VI16:
4371 case TRUNCATE_TO_MEM_VI32: {
4372 Info.ptrVal = I.getArgOperand(0);
4373 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4374 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4375 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4377 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4378 ScalarVT = MVT::i16;
4379 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4380 ScalarVT = MVT::i32;
4382 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4384 Info.writeMem = true;
4394 /// Returns true if the target can instruction select the
4395 /// specified FP immediate natively. If false, the legalizer will
4396 /// materialize the FP immediate as a load from a constant pool.
4397 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4398 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4399 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4405 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4406 ISD::LoadExtType ExtTy,
4408 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4409 // relocations target a movq or addq instruction: don't let the load shrink.
4410 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4411 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4412 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4413 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4417 /// \brief Returns true if it is beneficial to convert a load of a constant
4418 /// to just the constant itself.
4419 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4421 assert(Ty->isIntegerTy());
4423 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4424 if (BitSize == 0 || BitSize > 64)
4429 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4430 unsigned Index) const {
4431 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4434 return (Index == 0 || Index == ResVT.getVectorNumElements());
4437 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4438 // Speculate cttz only if we can directly use TZCNT.
4439 return Subtarget.hasBMI();
4442 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4443 // Speculate ctlz only if we can directly use LZCNT.
4444 return Subtarget.hasLZCNT();
4447 bool X86TargetLowering::isCtlzFast() const {
4448 return Subtarget.hasFastLZCNT();
4451 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4452 if (!Subtarget.hasBMI())
4455 // There are only 32-bit and 64-bit forms for 'andn'.
4456 EVT VT = Y.getValueType();
4457 if (VT != MVT::i32 && VT != MVT::i64)
4463 /// Val is the undef sentinel value or equal to the specified value.
4464 static bool isUndefOrEqual(int Val, int CmpVal) {
4465 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4468 /// Val is either the undef or zero sentinel value.
4469 static bool isUndefOrZero(int Val) {
4470 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4473 /// Return true if every element in Mask, beginning
4474 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4475 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4476 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4477 if (Mask[i] != SM_SentinelUndef)
4482 /// Return true if Val is undef or if its value falls within the
4483 /// specified range [Low, Hi).
4484 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4485 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4488 /// Return true if every element in Mask is undef or if its value
4489 /// falls within the specified range [Low, Hi).
4490 static bool isUndefOrInRange(ArrayRef<int> Mask,
4493 if (!isUndefOrInRange(M, Low, Hi))
4498 /// Return true if Val is undef, zero or if its value falls within the
4499 /// specified range [Low, Hi).
4500 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4501 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4504 /// Return true if every element in Mask is undef, zero or if its value
4505 /// falls within the specified range [Low, Hi).
4506 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4508 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4513 /// Return true if every element in Mask, beginning
4514 /// from position Pos and ending in Pos+Size, falls within the specified
4515 /// sequential range [Low, Low+Size), or is undef.
4516 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4517 unsigned Pos, unsigned Size, int Low) {
4518 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4519 if (!isUndefOrEqual(Mask[i], Low))
4524 /// Return true if every element in Mask, beginning
4525 /// from position Pos and ending in Pos+Size, falls within the specified
4526 /// sequential range [Low, Low+Size), or is undef or zero.
4527 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4528 unsigned Size, int Low) {
4529 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4530 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4535 /// Return true if every element in Mask, beginning
4536 /// from position Pos and ending in Pos+Size is undef or is zero.
4537 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4539 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4540 if (!isUndefOrZero(Mask[i]))
4545 /// \brief Helper function to test whether a shuffle mask could be
4546 /// simplified by widening the elements being shuffled.
4548 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4549 /// leaves it in an unspecified state.
4551 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4552 /// shuffle masks. The latter have the special property of a '-2' representing
4553 /// a zeroed lane of a vector.
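// For example, the v8i16 mask <0,1,2,3,-1,-1,6,7> widens to the v4i32 mask
// <0,1,-1,3>, while a mask starting <0,2,...> cannot be widened because 0 and 2
// do not form an adjacent, even-aligned pair.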
4554 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4555 SmallVectorImpl<int> &WidenedMask) {
4556 WidenedMask.assign(Mask.size() / 2, 0);
4557 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4558 // If both elements are undef, it's trivial.
4559 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
4560 WidenedMask[i / 2] = SM_SentinelUndef;
4564 // Check for an undef mask and a mask value properly aligned to fit with
4565 // a pair of values. If we find such a case, use the non-undef mask's value.
4566 if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
4567 Mask[i + 1] % 2 == 1) {
4568 WidenedMask[i / 2] = Mask[i + 1] / 2;
4571 if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
4572 WidenedMask[i / 2] = Mask[i] / 2;
4576 // When zeroing, we need to spread the zeroing across both lanes to widen.
4577 if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
4578 if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
4579 (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
4580 WidenedMask[i / 2] = SM_SentinelZero;
4586 // Finally, check if the two mask values are adjacent and aligned with a pair.
4588 if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
4589 Mask[i] + 1 == Mask[i + 1]) {
4590 WidenedMask[i / 2] = Mask[i] / 2;
4594 // Otherwise we can't safely widen the elements used in this shuffle.
4597 assert(WidenedMask.size() == Mask.size() / 2 &&
4598 "Incorrect size of mask after widening the elements!");
4603 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4604 /// mask index with the scaled sequential indices for an equivalent narrowed
4605 /// mask. This is the reverse process to canWidenShuffleElements, but can always
/// succeed.
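// For example, scaling the mask <1,-1,0> by 2 produces <2,3,-1,-1,0,1>.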
4607 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4608 SmallVectorImpl<int> &ScaledMask) {
4609 assert(0 < Scale && "Unexpected scaling factor");
4610 int NumElts = Mask.size();
4611 ScaledMask.assign(NumElts * Scale, -1);
4613 for (int i = 0; i != NumElts; ++i) {
4616 // Repeat sentinel values in every mask element.
4618 for (int s = 0; s != Scale; ++s)
4619 ScaledMask[(Scale * i) + s] = M;
4623 // Scale mask element and increment across each mask element.
4624 for (int s = 0; s != Scale; ++s)
4625 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
4629 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4630 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4631 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4632 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4633 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4636 // The index should be aligned on a vecWidth-bit boundary.
4638 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4640 MVT VT = N->getSimpleValueType(0);
4641 unsigned ElSize = VT.getScalarSizeInBits();
4642 bool Result = (Index * ElSize) % vecWidth == 0;
4647 /// Return true if the specified INSERT_SUBVECTOR
4648 /// operand specifies a subvector insert that is suitable for inserting
4649 /// a 128- or 256-bit subvector.
4650 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4651 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4652 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4654 // The index should be aligned on a vecWidth-bit boundary.
4656 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4658 MVT VT = N->getSimpleValueType(0);
4659 unsigned ElSize = VT.getScalarSizeInBits();
4660 bool Result = (Index * ElSize) % vecWidth == 0;
4665 bool X86::isVINSERT128Index(SDNode *N) {
4666 return isVINSERTIndex(N, 128);
4669 bool X86::isVINSERT256Index(SDNode *N) {
4670 return isVINSERTIndex(N, 256);
4673 bool X86::isVEXTRACT128Index(SDNode *N) {
4674 return isVEXTRACTIndex(N, 128);
4677 bool X86::isVEXTRACT256Index(SDNode *N) {
4678 return isVEXTRACTIndex(N, 256);
4681 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4682 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4683 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4684 "Illegal extract subvector for VEXTRACT");
4687 cast<ConstantSDNode>(N->getOperand(1).getNode())->getZExtValue();
4689 MVT VecVT = N->getOperand(0).getSimpleValueType();
4690 MVT ElVT = VecVT.getVectorElementType();
4692 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4693 return Index / NumElemsPerChunk;
4696 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4697 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4698 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4699 "Illegal insert subvector for VINSERT");
4702 cast<ConstantSDNode>(N->getOperand(2).getNode())->getZExtValue();
4704 MVT VecVT = N->getSimpleValueType(0);
4705 MVT ElVT = VecVT.getVectorElementType();
4707 unsigned NumElemsPerChunk = vecWidth / ElVT.getSizeInBits();
4708 return Index / NumElemsPerChunk;
4711 /// Return the appropriate immediate to extract the specified
4712 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4713 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4714 return getExtractVEXTRACTImmediate(N, 128);
4717 /// Return the appropriate immediate to extract the specified
4718 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4719 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4720 return getExtractVEXTRACTImmediate(N, 256);
4723 /// Return the appropriate immediate to insert at the specified
4724 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
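// e.g. inserting a v4i32 subvector at element index 4 of a v8i32 yields an
// immediate of 4 / 4 == 1, i.e. the upper 128-bit half.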
4725 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4726 return getInsertVINSERTImmediate(N, 128);
4729 /// Return the appropriate immediate to insert at the specified
4730 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4731 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4732 return getInsertVINSERTImmediate(N, 256);
4735 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4736 bool X86::isZeroNode(SDValue Elt) {
4737 return isNullConstant(Elt) || isNullFPConstant(Elt);
4740 // Build a vector of constants
4741 // Use an UNDEF node if MaskElt == -1.
4742 // Split 64-bit constants in 32-bit mode.
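// e.g. (a sketch) for MVT::v4i32, Values <0,-1,2,3> with IsMask=true becomes
// build_vector <i32 0, undef, i32 2, i32 3>; in 32-bit mode an i64 element type
// is rebuilt with twice as many i32 elements and the result is bitcast back to VT.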
4743 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4744 const SDLoc &dl, bool IsMask = false) {
4746 SmallVector<SDValue, 32> Ops;
4749 MVT ConstVecVT = VT;
4750 unsigned NumElts = VT.getVectorNumElements();
4751 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4752 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4753 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4757 MVT EltVT = ConstVecVT.getVectorElementType();
4758 for (unsigned i = 0; i < NumElts; ++i) {
4759 bool IsUndef = Values[i] < 0 && IsMask;
4760 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4761 DAG.getConstant(Values[i], dl, EltVT);
4762 Ops.push_back(OpNode);
4764 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4765 DAG.getConstant(0, dl, EltVT));
4767 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4769 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4773 static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
4774 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4775 assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
4776 SmallVector<SDValue, 32> Ops;
4779 MVT ConstVecVT = VT;
4780 unsigned NumElts = VT.getVectorNumElements();
4781 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4782 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4783 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4787 MVT EltVT = ConstVecVT.getVectorElementType();
4788 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4790 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4793 const APInt &V = Bits[i];
4794 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4796 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4797 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4798 } else if (EltVT == MVT::f32) {
4799 APFloat FV(APFloat::IEEEsingle(), V);
4800 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4801 } else if (EltVT == MVT::f64) {
4802 APFloat FV(APFloat::IEEEdouble(), V);
4803 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4805 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4809 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4810 return DAG.getBitcast(VT, ConstsNode);
4813 /// Returns a vector of specified type with all zero elements.
4814 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4815 SelectionDAG &DAG, const SDLoc &dl) {
4816 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4817 VT.getVectorElementType() == MVT::i1) &&
4818 "Unexpected vector type");
4820 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4821 // type. This ensures they get CSE'd. But if the integer type is not
4822 // available, use a floating-point +0.0 instead.
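// e.g. a v4f64 zero is materialized as a v8i32 zero constant and bitcast back
// to v4f64, so all 256-bit zero vectors CSE to the same node.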
4824 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4825 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4826 } else if (VT.getVectorElementType() == MVT::i1) {
4827 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4828 "Unexpected vector type");
4829 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4830 "Unexpected vector type");
4831 Vec = DAG.getConstant(0, dl, VT);
4833 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4834 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4836 return DAG.getBitcast(VT, Vec);
4839 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4840 const SDLoc &dl, unsigned vectorWidth) {
4841 EVT VT = Vec.getValueType();
4842 EVT ElVT = VT.getVectorElementType();
4843 unsigned Factor = VT.getSizeInBits()/vectorWidth;
4844 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
4845 VT.getVectorNumElements()/Factor);
4847 // Extract from UNDEF is UNDEF.
4849 return DAG.getUNDEF(ResultVT);
4851 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR node.
4852 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
4853 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4855 // This is the index of the first element of the vectorWidth-bit chunk
4856 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4857 IdxVal &= ~(ElemsPerChunk - 1);
4859 // If the input is a buildvector just emit a smaller one.
4860 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
4861 return DAG.getNode(ISD::BUILD_VECTOR, dl, ResultVT,
4862 makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
4864 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4865 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
4868 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
4869 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
4870 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
4871 /// instructions or a simple subregister reference. Idx is an index in the
4872 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
4873 /// lowering EXTRACT_VECTOR_ELT operations easier.
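// e.g. extracting at IdxVal == 6 from a v8i32 returns the upper v4i32 half;
// extractSubVector rounds the index down to the chunk boundary (6 -> 4).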
4874 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
4875 SelectionDAG &DAG, const SDLoc &dl) {
4876 assert((Vec.getValueType().is256BitVector() ||
4877 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
4878 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
4881 /// Generate a DAG to grab 256-bits from a 512-bit vector.
4882 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
4883 SelectionDAG &DAG, const SDLoc &dl) {
4884 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
4885 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
4888 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4889 SelectionDAG &DAG, const SDLoc &dl,
4890 unsigned vectorWidth) {
4891 assert((vectorWidth == 128 || vectorWidth == 256) &&
4892 "Unsupported vector width");
4893 // Inserting an UNDEF subvector is a no-op: Result is returned unchanged.
4896 EVT VT = Vec.getValueType();
4897 EVT ElVT = VT.getVectorElementType();
4898 EVT ResultVT = Result.getValueType();
4900 // Insert the relevant vectorWidth bits.
4901 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
4902 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
4904 // This is the index of the first element of the vectorWidth-bit chunk
4905 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
4906 IdxVal &= ~(ElemsPerChunk - 1);
4908 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
4909 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
4912 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
4913 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
4914 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
4915 /// simple superregister reference. Idx is an index in the 128 bits
4916 /// we want. It need not be aligned to a 128-bit boundary. That makes
4917 /// lowering INSERT_VECTOR_ELT operations easier.
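// e.g. inserting a v4f32 at index 0 of a non-undef v8f32 is emitted below as a
// vblendps with mask 0x0f instead of a vinsertf128.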
4918 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4919 SelectionDAG &DAG, const SDLoc &dl) {
4920 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
4922 // For insertion into the zero index (low half) of a 256-bit vector, it is
4923 // more efficient to generate a blend with immediate instead of an insert*128.
4924 // We are still creating an INSERT_SUBVECTOR below with an undef node to
4925 // extend the subvector to the size of the result vector. Make sure that
4926 // we are not recursing on that node by checking for undef here.
4927 if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
4928 !Result.isUndef()) {
4929 EVT ResultVT = Result.getValueType();
4930 SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
4931 SDValue Undef = DAG.getUNDEF(ResultVT);
4932 SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
4935 // The blend instruction, and therefore its mask, depend on the data type.
4936 MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
4937 if (ScalarType.isFloatingPoint()) {
4938 // Choose either vblendps (float) or vblendpd (double).
4939 unsigned ScalarSize = ScalarType.getSizeInBits();
4940 assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
4941 unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
4942 SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
4943 return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
4946 const X86Subtarget &Subtarget =
4947 static_cast<const X86Subtarget &>(DAG.getSubtarget());
4949 // AVX2 is needed for 256-bit integer blend support.
4950 // Integers must be cast to 32-bit because there is only vpblendd;
4951 // vpblendw can't be used for this because it has a handicapped mask.
4953 // If we don't have AVX2, then cast to float. Using a wrong domain blend
4954 // is still more efficient than using the wrong domain vinsertf128 that
4955 // will be created by InsertSubVector().
4956 MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
4958 SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
4959 Result = DAG.getBitcast(CastVT, Result);
4960 Vec256 = DAG.getBitcast(CastVT, Vec256);
4961 Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
4962 return DAG.getBitcast(ResultVT, Vec256);
4965 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
4968 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
4969 SelectionDAG &DAG, const SDLoc &dl) {
4970 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
4971 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
4974 /// Insert i1-subvector to i1-vector.
4975 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
4976 const X86Subtarget &Subtarget) {
4979 SDValue Vec = Op.getOperand(0);
4980 SDValue SubVec = Op.getOperand(1);
4981 SDValue Idx = Op.getOperand(2);
4983 if (!isa<ConstantSDNode>(Idx))
4986 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
4987 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
4990 MVT OpVT = Op.getSimpleValueType();
4991 MVT SubVecVT = SubVec.getSimpleValueType();
4992 unsigned NumElems = OpVT.getVectorNumElements();
4993 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
4995 assert(IdxVal + SubVecNumElems <= NumElems &&
4996 IdxVal % SubVecVT.getSizeInBits() == 0 &&
4997 "Unexpected index value in INSERT_SUBVECTOR");
4999 // There are 3 possible cases:
5000 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5001 // 2. Subvector should be inserted in the upper part
5002 // (IdxVal + SubVecNumElems == NumElems)
5003 // 3. Subvector should be inserted in the middle (for example v2i1
5004 // to v16i1, index 2)
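// e.g. (a sketch) inserting a v8i1 at index 8 of a v16i1 is case 2: the widened
// subvector is shifted left by IdxVal, the old upper bits of Vec are cleared
// with a shift-left/shift-right pair, and the two are ORed together.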
5006 // extend to natively supported kshift
5007 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5008 MVT WideOpVT = OpVT;
5009 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5012 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5013 SDValue Undef = DAG.getUNDEF(WideOpVT);
5014 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5015 Undef, SubVec, ZeroIdx);
5017 // Extract the sub-vector if required.
5018 auto ExtractSubVec = [&](SDValue V) {
5019 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5023 if (Vec.isUndef()) {
5025 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5026 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec, ShiftBits);
5028 return ExtractSubVec(WideSubVec);
5031 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5032 NumElems = WideOpVT.getVectorNumElements();
5033 unsigned ShiftLeft = NumElems - SubVecNumElems;
5034 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5035 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5036 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5037 Vec = ShiftRight ? DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec,
5038 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5039 return ExtractSubVec(Vec);
5043 // Zero lower bits of the Vec
5044 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5045 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5046 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5047 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5048 // Merge them together, SubVec should be zero extended.
5049 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5050 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5052 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5053 return ExtractSubVec(Vec);
5056 // Simple case when we put subvector in the upper part
5057 if (IdxVal + SubVecNumElems == NumElems) {
5058 // Zero upper bits of the Vec
5059 WideSubVec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, WideSubVec,
5060 DAG.getConstant(IdxVal, dl, MVT::i8));
5061 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5062 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5063 Vec = DAG.getNode(X86ISD::VSHLI, dl, WideOpVT, Vec, ShiftBits);
5064 Vec = DAG.getNode(X86ISD::VSRLI, dl, WideOpVT, Vec, ShiftBits);
5065 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5066 return ExtractSubVec(Vec);
5068 // Subvector should be inserted in the middle - use shuffle
5069 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5071 SmallVector<int, 64> Mask;
5072 for (unsigned i = 0; i < NumElems; ++i)
5073 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5075 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5078 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5079 /// instructions. This is used because creating CONCAT_VECTOR nodes of
5080 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5081 /// large BUILD_VECTORS.
5082 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5083 unsigned NumElems, SelectionDAG &DAG,
5085 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5086 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5089 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5090 unsigned NumElems, SelectionDAG &DAG,
5092 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5093 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5096 /// Returns a vector of specified type with all bits set.
5097 /// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
5098 /// no AVX2 support, use two <4 x i32> vectors inserted into an <8 x i32> appropriately.
5099 /// Then bitcast to their original type, ensuring they get CSE'd.
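// e.g. an all-ones v4i64 without AVX2 is built as a v4i32 splat of -1
// concatenated into a v8i32 and then bitcast to v4i64.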
5100 static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
5101 SelectionDAG &DAG, const SDLoc &dl) {
5102 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5103 "Expected a 128/256/512-bit vector type");
5105 APInt Ones = APInt::getAllOnesValue(32);
5106 unsigned NumElts = VT.getSizeInBits() / 32;
5108 if (!Subtarget.hasInt256() && NumElts == 8) {
5109 Vec = DAG.getConstant(Ones, dl, MVT::v4i32);
5110 Vec = concat128BitVectors(Vec, Vec, MVT::v8i32, 8, DAG, dl);
5112 Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5114 return DAG.getBitcast(VT, Vec);
5117 /// Generate unpacklo/unpackhi shuffle mask.
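// e.g. for v8i32, Lo && !Unary produces <0,8,1,9,4,12,5,13>, i.e. vpunpckldq
// applied per 128-bit lane.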
5118 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5120 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5121 int NumElts = VT.getVectorNumElements();
5122 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5124 for (int i = 0; i < NumElts; ++i) {
5125 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5126 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5127 Pos += (Unary ? 0 : NumElts * (i % 2));
5128 Pos += (Lo ? 0 : NumEltsInLane / 2);
5129 Mask.push_back(Pos);
5133 /// Returns a vector_shuffle node for an unpackl operation.
5134 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5135 SDValue V1, SDValue V2) {
5136 SmallVector<int, 8> Mask;
5137 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5138 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5141 /// Returns a vector_shuffle node for an unpackh operation.
5142 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5143 SDValue V1, SDValue V2) {
5144 SmallVector<int, 8> Mask;
5145 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5146 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5149 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5150 /// This produces a shuffle where the low element of V2 is swizzled into the
5151 /// zero/undef vector, landing at element Idx.
5152 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5153 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5155 const X86Subtarget &Subtarget,
5156 SelectionDAG &DAG) {
5157 MVT VT = V2.getSimpleValueType();
5159 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5160 int NumElems = VT.getVectorNumElements();
5161 SmallVector<int, 16> MaskVec(NumElems);
5162 for (int i = 0; i != NumElems; ++i)
5163 // If this is the insertion idx, put the low elt of V2 here.
5164 MaskVec[i] = (i == Idx) ? NumElems : i;
5165 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5168 static SDValue peekThroughBitcasts(SDValue V) {
5169 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5170 V = V.getOperand(0);
5174 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5175 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5176 V.getOperand(0).hasOneUse())
5177 V = V.getOperand(0);
5181 static const Constant *getTargetConstantFromNode(SDValue Op) {
5182 Op = peekThroughBitcasts(Op);
5184 auto *Load = dyn_cast<LoadSDNode>(Op);
5188 SDValue Ptr = Load->getBasePtr();
5189 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5190 Ptr->getOpcode() == X86ISD::WrapperRIP)
5191 Ptr = Ptr->getOperand(0);
5193 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5194 if (!CNode || CNode->isMachineConstantPoolEntry())
5197 return dyn_cast<Constant>(CNode->getConstVal());
5200 // Extract raw constant bits from constant pools.
5201 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5202 SmallBitVector &UndefElts,
5203 SmallVectorImpl<APInt> &EltBits) {
5204 assert(UndefElts.empty() && "Expected an empty UndefElts vector");
5205 assert(EltBits.empty() && "Expected an empty EltBits vector");
5207 Op = peekThroughBitcasts(Op);
5209 EVT VT = Op.getValueType();
5210 unsigned SizeInBits = VT.getSizeInBits();
5211 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5212 unsigned NumElts = SizeInBits / EltSizeInBits;
5214 // Extract all the undef/constant element data and pack into single bitsets.
5215 APInt UndefBits(SizeInBits, 0);
5216 APInt MaskBits(SizeInBits, 0);
5218 // Split the undef/constant single bitset data into the target elements.
5219 auto SplitBitData = [&]() {
5220 UndefElts = SmallBitVector(NumElts, false);
5221 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5223 for (unsigned i = 0; i != NumElts; ++i) {
5224 APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
5225 UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
5227 // Only treat an element as UNDEF if all bits are UNDEF, otherwise
5228 // treat it as zero.
5229 if (UndefEltBits.isAllOnesValue()) {
5230 UndefElts[i] = true;
5234 APInt Bits = MaskBits.lshr(i * EltSizeInBits);
5235 Bits = Bits.zextOrTrunc(EltSizeInBits);
5236 EltBits[i] = Bits.getZExtValue();
5241 auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
5245 unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
5246 if (isa<UndefValue>(Cst)) {
5247 Mask = APInt::getNullValue(SizeInBits);
5248 Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
5251 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5252 Mask = CInt->getValue().zextOrTrunc(SizeInBits);
5253 Undefs = APInt::getNullValue(SizeInBits);
5256 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5257 Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
5258 Undefs = APInt::getNullValue(SizeInBits);
5264 // Extract constant bits from constant pool vector.
5265 if (auto *Cst = getTargetConstantFromNode(Op)) {
5266 Type *CstTy = Cst->getType();
5267 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5270 unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
5271 for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
5273 if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
5275 MaskBits |= Bits.shl(i * CstEltSizeInBits);
5276 UndefBits |= Undefs.shl(i * CstEltSizeInBits);
5279 return SplitBitData();
5282 // Extract constant bits from a broadcasted constant pool scalar.
5283 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5284 EltSizeInBits <= Op.getScalarValueSizeInBits()) {
5285 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5287 if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
5288 unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
5289 unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
5290 for (unsigned i = 0; i != NumBroadcastElts; ++i) {
5291 MaskBits |= Bits.shl(i * NumBroadcastBits);
5292 UndefBits |= Undefs.shl(i * NumBroadcastBits);
5294 return SplitBitData();
5302 // TODO: Merge more of this with getTargetConstantBitsFromNode.
5303 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5304 unsigned MaskEltSizeInBits,
5305 SmallVectorImpl<uint64_t> &RawMask) {
5306 MaskNode = peekThroughBitcasts(MaskNode);
5308 MVT VT = MaskNode.getSimpleValueType();
5309 assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
5310 unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
5312 // Split an APInt element into MaskEltSizeInBits sized pieces and
5313 // insert into the shuffle mask.
5314 auto SplitElementToMask = [&](APInt Element) {
5315 // Note that this is x86 and so always little endian: the low byte is
5316 // the first byte of the mask.
5317 int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5318 for (int i = 0; i < Split; ++i) {
5319 APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
5320 Element = Element.lshr(MaskEltSizeInBits);
5321 RawMask.push_back(RawElt.getZExtValue());
5325 if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
5326 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5327 // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
5328 if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
5330 if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
5331 const APInt &MaskElement = CN->getAPIntValue();
5332 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
5333 APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
5334 RawMask.push_back(RawElt.getZExtValue());
5340 if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
5341 MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
5342 SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
5343 if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
5344 if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
5345 RawMask.push_back(CN->getZExtValue());
5346 RawMask.append(NumMaskElts - 1, 0);
5350 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
5351 unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
5352 SplitElementToMask(CN->getAPIntValue());
5353 RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
5360 if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
5363 // We can always decode if the buildvector is all zero constants,
5364 // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
5365 if (all_of(MaskNode->ops(), X86::isZeroNode)) {
5366 RawMask.append(NumMaskElts, 0);
5370 // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
5371 if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
5374 for (SDValue Op : MaskNode->ops()) {
5375 if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
5376 SplitElementToMask(CN->getAPIntValue());
5377 else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
5378 SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
5386 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5387 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5388 /// operands in \p Ops, and returns true.
5389 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5390 /// IsUnary for shuffles which use a single input multiple times, and in those
5391 /// cases it will adjust the mask to only have indices within that single input.
5392 /// It is an error to call this with non-empty Mask/Ops vectors.
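// e.g. a v4f32 X86ISD::SHUFP with immediate 0x1b decodes to the mask <3,2,5,4>,
// where indices 0-3 refer to the first operand and 4-7 to the second.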
5393 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5394 SmallVectorImpl<SDValue> &Ops,
5395 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5396 unsigned NumElems = VT.getVectorNumElements();
5399 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5400 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5403 bool IsFakeUnary = false;
5404 switch(N->getOpcode()) {
5405 case X86ISD::BLENDI:
5406 ImmN = N->getOperand(N->getNumOperands()-1);
5407 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5410 ImmN = N->getOperand(N->getNumOperands()-1);
5411 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5412 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5414 case X86ISD::INSERTPS:
5415 ImmN = N->getOperand(N->getNumOperands()-1);
5416 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5417 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5419 case X86ISD::UNPCKH:
5420 DecodeUNPCKHMask(VT, Mask);
5421 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5423 case X86ISD::UNPCKL:
5424 DecodeUNPCKLMask(VT, Mask);
5425 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5427 case X86ISD::MOVHLPS:
5428 DecodeMOVHLPSMask(NumElems, Mask);
5429 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5431 case X86ISD::MOVLHPS:
5432 DecodeMOVLHPSMask(NumElems, Mask);
5433 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5435 case X86ISD::PALIGNR:
5436 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5437 ImmN = N->getOperand(N->getNumOperands()-1);
5438 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5439 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5440 Ops.push_back(N->getOperand(1));
5441 Ops.push_back(N->getOperand(0));
5443 case X86ISD::VSHLDQ:
5444 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5445 ImmN = N->getOperand(N->getNumOperands() - 1);
5446 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5449 case X86ISD::VSRLDQ:
5450 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5451 ImmN = N->getOperand(N->getNumOperands() - 1);
5452 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5455 case X86ISD::PSHUFD:
5456 case X86ISD::VPERMILPI:
5457 ImmN = N->getOperand(N->getNumOperands()-1);
5458 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5461 case X86ISD::PSHUFHW:
5462 ImmN = N->getOperand(N->getNumOperands()-1);
5463 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5466 case X86ISD::PSHUFLW:
5467 ImmN = N->getOperand(N->getNumOperands()-1);
5468 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5471 case X86ISD::VZEXT_MOVL:
5472 DecodeZeroMoveLowMask(VT, Mask);
5475 case X86ISD::VBROADCAST: {
5476 // We only decode broadcasts of same-sized vectors at the moment.
5477 if (N->getOperand(0).getValueType() == VT) {
5478 DecodeVectorBroadcast(VT, Mask);
5484 case X86ISD::VPERMILPV: {
5486 SDValue MaskNode = N->getOperand(1);
5487 unsigned MaskEltSize = VT.getScalarSizeInBits();
5488 SmallVector<uint64_t, 32> RawMask;
5489 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5490 DecodeVPERMILPMask(VT, RawMask, Mask);
5493 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5494 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5499 case X86ISD::PSHUFB: {
5501 SDValue MaskNode = N->getOperand(1);
5502 SmallVector<uint64_t, 32> RawMask;
5503 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5504 DecodePSHUFBMask(RawMask, Mask);
5507 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5508 DecodePSHUFBMask(C, Mask);
5513 case X86ISD::VPERMI:
5514 ImmN = N->getOperand(N->getNumOperands()-1);
5515 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5520 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5522 case X86ISD::VPERM2X128:
5523 ImmN = N->getOperand(N->getNumOperands()-1);
5524 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5525 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5527 case X86ISD::MOVSLDUP:
5528 DecodeMOVSLDUPMask(VT, Mask);
5531 case X86ISD::MOVSHDUP:
5532 DecodeMOVSHDUPMask(VT, Mask);
5535 case X86ISD::MOVDDUP:
5536 DecodeMOVDDUPMask(VT, Mask);
5539 case X86ISD::MOVLHPD:
5540 case X86ISD::MOVLPD:
5541 case X86ISD::MOVLPS:
5542 // Not yet implemented
5544 case X86ISD::VPERMIL2: {
5545 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5546 unsigned MaskEltSize = VT.getScalarSizeInBits();
5547 SDValue MaskNode = N->getOperand(2);
5548 SDValue CtrlNode = N->getOperand(3);
5549 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5550 unsigned CtrlImm = CtrlOp->getZExtValue();
5551 SmallVector<uint64_t, 32> RawMask;
5552 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5553 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5556 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5557 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5563 case X86ISD::VPPERM: {
5564 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5565 SDValue MaskNode = N->getOperand(2);
5566 SmallVector<uint64_t, 32> RawMask;
5567 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5568 DecodeVPPERMMask(RawMask, Mask);
5571 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5572 DecodeVPPERMMask(C, Mask);
5577 case X86ISD::VPERMV: {
5579 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5580 Ops.push_back(N->getOperand(1));
5581 SDValue MaskNode = N->getOperand(0);
5582 SmallVector<uint64_t, 32> RawMask;
5583 unsigned MaskEltSize = VT.getScalarSizeInBits();
5584 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5585 DecodeVPERMVMask(RawMask, Mask);
5588 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5589 DecodeVPERMVMask(C, MaskEltSize, Mask);
5594 case X86ISD::VPERMV3: {
5595 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5596 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5597 Ops.push_back(N->getOperand(0));
5598 Ops.push_back(N->getOperand(2));
5599 SDValue MaskNode = N->getOperand(1);
5600 unsigned MaskEltSize = VT.getScalarSizeInBits();
5601 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5602 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5607 case X86ISD::VPERMIV3: {
5608 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5609 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5610 Ops.push_back(N->getOperand(1));
5611 Ops.push_back(N->getOperand(2));
5612 SDValue MaskNode = N->getOperand(0);
5613 unsigned MaskEltSize = VT.getScalarSizeInBits();
5614 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5615 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5620 default: llvm_unreachable("unknown target shuffle node");
5623 // Empty mask indicates the decode failed.
5627 // Check if we're getting a shuffle mask with zeroed elements.
5628 if (!AllowSentinelZero)
5629 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5632 // If we have a fake unary shuffle, the shuffle mask is spread across two
5633 // inputs that are actually the same node. Re-map the mask to always point
5634 // into the first input.
5637 if (M >= (int)Mask.size())
5640 // If we didn't already add operands in the opcode-specific code, default to
5641 // adding 1 or 2 operands starting at 0.
5643 Ops.push_back(N->getOperand(0));
5644 if (!IsUnary || IsFakeUnary)
5645 Ops.push_back(N->getOperand(1));
5651 /// Check a target shuffle mask's inputs to see if we can set any values to
5652 /// SM_SentinelZero - this is for elements that are known to be zero
5653 /// (not just zeroable) from their inputs.
5654 /// Returns true if the target shuffle mask was decoded.
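// e.g. if a mask element reads lane 1 of an input that is
// build_vector <x, 0, y, z>, that mask element is rewritten to SM_SentinelZero.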
5655 static bool setTargetShuffleZeroElements(SDValue N,
5656 SmallVectorImpl<int> &Mask,
5657 SmallVectorImpl<SDValue> &Ops) {
5659 if (!isTargetShuffle(N.getOpcode()))
5662 MVT VT = N.getSimpleValueType();
5663 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5666 SDValue V1 = Ops[0];
5667 SDValue V2 = IsUnary ? V1 : Ops[1];
5669 V1 = peekThroughBitcasts(V1);
5670 V2 = peekThroughBitcasts(V2);
5672 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5675 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5679 // Determine shuffle input and normalize the mask.
5680 SDValue V = M < Size ? V1 : V2;
5683 // We are referencing an UNDEF input.
5685 Mask[i] = SM_SentinelUndef;
5689 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
5690 if (V.getOpcode() != ISD::BUILD_VECTOR)
5693 // If the BUILD_VECTOR has fewer elements than the (larger) source
5694 // element must be UNDEF/ZERO.
5695 // TODO: Is it worth testing the individual bits of a constant?
5696 if ((Size % V.getNumOperands()) == 0) {
5697 int Scale = Size / V->getNumOperands();
5698 SDValue Op = V.getOperand(M / Scale);
5700 Mask[i] = SM_SentinelUndef;
5701 else if (X86::isZeroNode(Op))
5702 Mask[i] = SM_SentinelZero;
5706 // If the BUILD_VECTOR has more elements than all the (smaller) source
5707 // elements must be all UNDEF or all ZERO.
5708 if ((V.getNumOperands() % Size) == 0) {
5709 int Scale = V->getNumOperands() / Size;
5710 bool AllUndef = true;
5711 bool AllZero = true;
5712 for (int j = 0; j < Scale; ++j) {
5713 SDValue Op = V.getOperand((M * Scale) + j);
5714 AllUndef &= Op.isUndef();
5715 AllZero &= X86::isZeroNode(Op);
5718 Mask[i] = SM_SentinelUndef;
5720 Mask[i] = SM_SentinelZero;
5725 assert(VT.getVectorNumElements() == Mask.size() &&
5726 "Different mask size from vector size!");
5730 // Attempt to decode ops that could be represented as a shuffle mask.
5731 // The decoded shuffle mask may contain a different number of elements to the
5732 // destination value type.
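// e.g. (X86ISD::VSRLI v2i64 V, 32) is decoded below as the v16i8 shuffle
// <4,5,6,7,Z,Z,Z,Z,12,13,14,15,Z,Z,Z,Z> where Z is SM_SentinelZero.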
5733 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5734 SmallVectorImpl<SDValue> &Ops) {
5738 MVT VT = N.getSimpleValueType();
5739 unsigned NumElts = VT.getVectorNumElements();
5740 unsigned NumSizeInBits = VT.getSizeInBits();
5741 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5742 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5743 "Expected byte aligned value types");
5745 unsigned Opcode = N.getOpcode();
5748 // Attempt to decode as a per-byte mask.
5749 SmallBitVector UndefElts;
5750 SmallVector<APInt, 32> EltBits;
5751 if (!getTargetConstantBitsFromNode(N.getOperand(1), 8, UndefElts, EltBits))
5753 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5755 Mask.push_back(SM_SentinelUndef);
5758 uint64_t ByteBits = EltBits[i].getZExtValue();
5759 if (ByteBits != 0 && ByteBits != 255)
5761 Mask.push_back(ByteBits == 0 ? SM_SentinelZero : i);
5763 Ops.push_back(N.getOperand(0));
5767 case X86ISD::VSRLI: {
5768 uint64_t ShiftVal = N.getConstantOperandVal(1);
5769 // Out of range bit shifts are guaranteed to be zero.
5770 if (NumBitsPerElt <= ShiftVal) {
5771 Mask.append(NumElts, SM_SentinelZero);
5775 // We can only decode 'whole byte' bit shifts as shuffles.
5776 if ((ShiftVal % 8) != 0)
5779 uint64_t ByteShift = ShiftVal / 8;
5780 unsigned NumBytes = NumSizeInBits / 8;
5781 unsigned NumBytesPerElt = NumBitsPerElt / 8;
5782 Ops.push_back(N.getOperand(0));
5784 // Clear mask to all zeros and insert the shifted byte indices.
5785 Mask.append(NumBytes, SM_SentinelZero);
5787 if (X86ISD::VSHLI == Opcode) {
5788 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5789 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5790 Mask[i + j] = i + j - ByteShift;
5792 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
5793 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
5794 Mask[i + j - ByteShift] = i + j;
5798 case X86ISD::VZEXT: {
5799 // TODO - add support for VPMOVZX with smaller input vector types.
5800 SDValue Src = N.getOperand(0);
5801 MVT SrcVT = Src.getSimpleValueType();
5802 if (NumSizeInBits != SrcVT.getSizeInBits())
5804 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
5813 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
5814 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
5815 /// remaining input indices in case we now have a unary shuffle and adjust the
5816 /// Op0/Op1 inputs accordingly.
5817 /// Returns true if the target shuffle mask was decoded.
5818 static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
5819 SmallVectorImpl<int> &Mask) {
5820 SmallVector<SDValue, 2> Ops;
5821 if (!setTargetShuffleZeroElements(Op, Mask, Ops))
5822 if (!getFauxShuffleMask(Op, Mask, Ops))
5825 int NumElts = Mask.size();
5826 bool Op0InUse = any_of(Mask, [NumElts](int Idx) {
5827 return 0 <= Idx && Idx < NumElts;
5829 bool Op1InUse = any_of(Mask, [NumElts](int Idx) { return NumElts <= Idx; });
5831 Op0 = Op0InUse ? Ops[0] : SDValue();
5832 Op1 = Op1InUse ? Ops[1] : SDValue();
5834 // We're only using Op1 - commute the mask and inputs.
5835 if (!Op0InUse && Op1InUse) {
5846 /// Returns the scalar element that will make up the ith
5847 /// element of the result of the vector shuffle.
5848 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
5851 return SDValue(); // Limit search depth.
5853 SDValue V = SDValue(N, 0);
5854 EVT VT = V.getValueType();
5855 unsigned Opcode = V.getOpcode();
5857 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
5858 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
5859 int Elt = SV->getMaskElt(Index);
5862 return DAG.getUNDEF(VT.getVectorElementType());
5864 unsigned NumElems = VT.getVectorNumElements();
5865 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
5866 : SV->getOperand(1);
5867 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
5870 // Recurse into target specific vector shuffles to find scalars.
5871 if (isTargetShuffle(Opcode)) {
5872 MVT ShufVT = V.getSimpleValueType();
5873 MVT ShufSVT = ShufVT.getVectorElementType();
5874 int NumElems = (int)ShufVT.getVectorNumElements();
5875 SmallVector<int, 16> ShuffleMask;
5876 SmallVector<SDValue, 16> ShuffleOps;
5879 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
5882 int Elt = ShuffleMask[Index];
5883 if (Elt == SM_SentinelZero)
5884 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
5885 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
5886 if (Elt == SM_SentinelUndef)
5887 return DAG.getUNDEF(ShufSVT);
5889 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
5890 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
5891 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
5895 // Actual nodes that may contain scalar elements
5896 if (Opcode == ISD::BITCAST) {
5897 V = V.getOperand(0);
5898 EVT SrcVT = V.getValueType();
5899 unsigned NumElems = VT.getVectorNumElements();
5901 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
5905 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
5906 return (Index == 0) ? V.getOperand(0)
5907 : DAG.getUNDEF(VT.getVectorElementType());
5909 if (V.getOpcode() == ISD::BUILD_VECTOR)
5910 return V.getOperand(Index);
5915 /// Custom lower build_vector of v16i8.
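// e.g. (a sketch) with SSE4.1, a v16i8 build_vector whose only non-zero operands
// are at indices 2 and 9 becomes two byte insertions (PINSRB-style
// INSERT_VECTOR_ELT nodes) into a zero vector.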
5916 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
5917 unsigned NumNonZero, unsigned NumZero,
5919 const X86Subtarget &Subtarget,
5920 const TargetLowering &TLI) {
5928 // SSE4.1 - use PINSRB to insert each byte directly.
5929 if (Subtarget.hasSSE41()) {
5930 for (unsigned i = 0; i < 16; ++i) {
5931 bool isNonZero = (NonZeros & (1 << i)) != 0;
5935 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
5937 V = DAG.getUNDEF(MVT::v16i8);
5940 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
5941 MVT::v16i8, V, Op.getOperand(i),
5942 DAG.getIntPtrConstant(i, dl));
5949 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
5950 for (unsigned i = 0; i < 16; ++i) {
5951 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
5952 if (ThisIsNonZero && First) {
5954 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
5956 V = DAG.getUNDEF(MVT::v8i16);
5961 SDValue ThisElt, LastElt;
5962 bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
5963 if (LastIsNonZero) {
5964 LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
5965 MVT::i16, Op.getOperand(i-1));
5967 if (ThisIsNonZero) {
5968 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
5969 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
5970 ThisElt, DAG.getConstant(8, dl, MVT::i8));
5972 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
5976 if (ThisElt.getNode())
5977 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
5978 DAG.getIntPtrConstant(i/2, dl));
5982 return DAG.getBitcast(MVT::v16i8, V);
5985 /// Custom lower build_vector of v8i16.
5986 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
5987 unsigned NumNonZero, unsigned NumZero,
5989 const X86Subtarget &Subtarget,
5990 const TargetLowering &TLI) {
5997 for (unsigned i = 0; i < 8; ++i) {
5998 bool isNonZero = (NonZeros & (1 << i)) != 0;
6002 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6004 V = DAG.getUNDEF(MVT::v8i16);
6007 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
6008 MVT::v8i16, V, Op.getOperand(i),
6009 DAG.getIntPtrConstant(i, dl));
6016 /// Custom lower build_vector of v4i32 or v4f32.
6017 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6018 const X86Subtarget &Subtarget,
6019 const TargetLowering &TLI) {
6020 // Find all zeroable elements.
6021 std::bitset<4> Zeroable;
6022 for (int i=0; i < 4; ++i) {
6023 SDValue Elt = Op->getOperand(i);
6024 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6026 assert(Zeroable.size() - Zeroable.count() > 1 &&
6027 "We expect at least two non-zero elements!");
6029 // We only know how to deal with build_vector nodes where elements are either
6030 // zeroable or extract_vector_elt with constant index.
6031 SDValue FirstNonZero;
6032 unsigned FirstNonZeroIdx;
6033 for (unsigned i=0; i < 4; ++i) {
6036 SDValue Elt = Op->getOperand(i);
6037 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6038 !isa<ConstantSDNode>(Elt.getOperand(1)))
6040 // Make sure that this node is extracting from a 128-bit vector.
6041 MVT VT = Elt.getOperand(0).getSimpleValueType();
6042 if (!VT.is128BitVector())
6044 if (!FirstNonZero.getNode()) {
6046 FirstNonZeroIdx = i;
6050 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6051 SDValue V1 = FirstNonZero.getOperand(0);
6052 MVT VT = V1.getSimpleValueType();
6054 // See if this build_vector can be lowered as a blend with zero.
6056 unsigned EltMaskIdx, EltIdx;
6058 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6059 if (Zeroable[EltIdx]) {
6060 // The zero vector will be on the right hand side.
6061 Mask[EltIdx] = EltIdx+4;
6065 Elt = Op->getOperand(EltIdx);
6066 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6067 EltMaskIdx = cast<ConstantSDNode>(Elt.getOperand(1))->getZExtValue();
6068 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6070 Mask[EltIdx] = EltIdx;
6074 // Let the shuffle legalizer deal with blend operations.
6075 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6076 if (V1.getSimpleValueType() != VT)
6077 V1 = DAG.getBitcast(VT, V1);
6078 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6081 // See if we can lower this build_vector to a INSERTPS.
6082 if (!Subtarget.hasSSE41())
6085 SDValue V2 = Elt.getOperand(0);
6086 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6089 bool CanFold = true;
6090 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6094 SDValue Current = Op->getOperand(i);
6095 SDValue SrcVector = Current->getOperand(0);
6098 CanFold = SrcVector == V1 &&
6099 cast<ConstantSDNode>(Current.getOperand(1))->getZExtValue() == i;
6105 assert(V1.getNode() && "Expected at least two non-zero elements!");
6106 if (V1.getSimpleValueType() != MVT::v4f32)
6107 V1 = DAG.getBitcast(MVT::v4f32, V1);
6108 if (V2.getSimpleValueType() != MVT::v4f32)
6109 V2 = DAG.getBitcast(MVT::v4f32, V2);
6111 // Ok, we can emit an INSERTPS instruction.
6112 unsigned ZMask = Zeroable.to_ulong();
6114 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6115 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6117 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6118 DAG.getIntPtrConstant(InsertPSMask, DL));
6119 return DAG.getBitcast(VT, Result);
6122 /// Return a vector logical shift node.
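// e.g. a 64-bit left shift of a v2i64 value is emitted as X86ISD::VSHLDQ
// (pslldq) by 8 bytes on the v16i8 bitcast, then bitcast back.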
6123 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6124 SelectionDAG &DAG, const TargetLowering &TLI,
6126 assert(VT.is128BitVector() && "Unknown type for VShift");
6127 MVT ShVT = MVT::v16i8;
6128 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6129 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6130 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6131 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6132 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6133 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6136 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6137 SelectionDAG &DAG) {
6139 // Check if the scalar load can be widened into a vector load. And if
6140 // the address is "base + cst" see if the cst can be "absorbed" into
6141 // the shuffle mask.
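// For example, a scalar f32 load from a 16-byte-aligned stack object at
// offset +8 can be rewritten as a v4f32 load of the whole object followed by
// a splat shuffle of element 2; the constant offset is "absorbed" into the
// shuffle mask rather than the address.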
6142 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6143 SDValue Ptr = LD->getBasePtr();
6144 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6146 EVT PVT = LD->getValueType(0);
6147 if (PVT != MVT::i32 && PVT != MVT::f32)
6152 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6153 FI = FINode->getIndex();
6155 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6156 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6157 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6158 Offset = Ptr.getConstantOperandVal(1);
6159 Ptr = Ptr.getOperand(0);
6164 // FIXME: 256-bit vector instructions don't require strict alignment;
6165 // improve this code to support them better.
6166 unsigned RequiredAlign = VT.getSizeInBits()/8;
6167 SDValue Chain = LD->getChain();
6168 // Make sure the stack object alignment is at least 16 or 32.
6169 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6170 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6171 if (MFI.isFixedObjectIndex(FI)) {
6172 // Can't change the alignment. FIXME: It's possible to compute
6173 // the exact stack offset and reference FI + adjust offset instead,
6174 // if someone *really* cares about this; that's the way to implement it.
6177 MFI.setObjectAlignment(FI, RequiredAlign);
6181 // (Offset % 16 or 32) must be a multiple of 4. The address used is then
6182 // Ptr + (Offset & ~(RequiredAlign - 1)).
6185 if ((Offset % RequiredAlign) & 3)
6187 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6190 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6191 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6194 int EltNo = (Offset - StartOffset) >> 2;
6195 unsigned NumElems = VT.getVectorNumElements();
6197 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6198 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6199 LD->getPointerInfo().getWithOffset(StartOffset));
6201 SmallVector<int, 8> Mask(NumElems, EltNo);
6203 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6209 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6210 /// elements can be replaced by a single large load which has the same value as
6211 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6213 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6214 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6215 SDLoc &DL, SelectionDAG &DAG,
6216 bool isAfterLegalize) {
6217 unsigned NumElems = Elts.size();
6219 int LastLoadedElt = -1;
6220 SmallBitVector LoadMask(NumElems, false);
6221 SmallBitVector ZeroMask(NumElems, false);
6222 SmallBitVector UndefMask(NumElems, false);
6224 // For each element in the initializer, see if we've found a load, zero or an
6225 // undef.
6226 for (unsigned i = 0; i < NumElems; ++i) {
6227 SDValue Elt = peekThroughBitcasts(Elts[i]);
6232 UndefMask[i] = true;
6233 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6235 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6238 // Each loaded element must be the correct fractional portion of the
6239 // requested vector load.
6240 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6245 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6246 "Incomplete element masks");
6248 // Handle Special Cases - all undef or undef/zero.
6249 if (UndefMask.count() == NumElems)
6250 return DAG.getUNDEF(VT);
6252 // FIXME: Should we return this as a BUILD_VECTOR instead?
6253 if ((ZeroMask | UndefMask).count() == NumElems)
6254 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6255 : DAG.getConstantFP(0.0, DL, VT);
6257 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6258 int FirstLoadedElt = LoadMask.find_first();
6259 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6260 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6261 EVT LDBaseVT = EltBase.getValueType();
6263 // Consecutive loads can contain UNDEF but not ZERO elements.
6264 // Consecutive loads with UNDEF and ZERO elements require an
6265 // additional shuffle stage to clear the ZERO elements.
6266 bool IsConsecutiveLoad = true;
6267 bool IsConsecutiveLoadWithZeros = true;
6268 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6270 SDValue Elt = peekThroughBitcasts(Elts[i]);
6271 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6272 if (!DAG.areNonVolatileConsecutiveLoads(
6273 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6274 i - FirstLoadedElt)) {
6275 IsConsecutiveLoad = false;
6276 IsConsecutiveLoadWithZeros = false;
6279 } else if (ZeroMask[i]) {
6280 IsConsecutiveLoad = false;
6284 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6285 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6286 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6287 "Cannot merge volatile loads.");
6288 SDValue NewLd =
6289 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6290 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6292 if (LDBase->hasAnyUseOfValue(1)) {
6293 SDValue NewChain =
6294 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6295 SDValue(NewLd.getNode(), 1));
6296 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6297 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6298 SDValue(NewLd.getNode(), 1));
6304 // LOAD - all consecutive load/undefs (must start/end with a load).
6305 // If we have found an entire vector of loads and undefs, then return a large
6306 // load of the entire vector width starting at the base pointer.
6307 // If the vector contains zeros, then attempt to shuffle those elements.
6308 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6309 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6310 assert(LDBase && "Did not find base load for merging consecutive loads");
6311 EVT EltVT = LDBase->getValueType(0);
6312 // Ensure that the input vector size for the merged loads matches the
6313 // cumulative size of the input elements.
6314 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6317 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6320 if (IsConsecutiveLoad)
6321 return CreateLoad(VT, LDBase);
6323 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6324 // vector and a zero vector to clear out the zero elements.
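// For example, for a v4i32 with elements <load, zero, load, load> (the loads
// consecutive in memory), the clear mask built below is roughly <0, 5, 2, 3>:
// load elements select from the merged load, zero elements select from the
// zero vector.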
6325 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6326 SmallVector<int, 4> ClearMask(NumElems, -1);
6327 for (unsigned i = 0; i < NumElems; ++i) {
6329 ClearMask[i] = i + NumElems;
6330 else if (LoadMask[i])
6333 SDValue V = CreateLoad(VT, LDBase);
6334 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6335 : DAG.getConstantFP(0.0, DL, VT);
6336 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6341 unsigned LoadSize = (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6343 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
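// For example, a v4i32 build_vector <load i32 *p, 0, 0, 0> becomes a single
// VZEXT_LOAD of 32 bits, which typically selects to a movd that zero-fills
// the upper lanes (similarly movq for the 64-bit case).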
6344 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6345 (LoadSize == 32 || LoadSize == 64) &&
6346 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6347 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6348 : MVT::getIntegerVT(LoadSize);
6349 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6350 if (TLI.isTypeLegal(VecVT)) {
6351 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6352 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6353 SDValue ResNode =
6354 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6355 LDBase->getPointerInfo(),
6356 LDBase->getAlignment(),
6357 false/*isVolatile*/, true/*ReadMem*/,
6360 // Make sure the newly-created LOAD is in the same position as LDBase in
6361 // terms of dependency. We create a TokenFactor for LDBase and ResNode,
6362 // and update uses of LDBase's output chain to use the TokenFactor.
6363 if (LDBase->hasAnyUseOfValue(1)) {
6364 SDValue NewChain =
6365 DAG.getNode(ISD::TokenFactor, DL, MVT::Other, SDValue(LDBase, 1),
6366 SDValue(ResNode.getNode(), 1));
6367 DAG.ReplaceAllUsesOfValueWith(SDValue(LDBase, 1), NewChain);
6368 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(LDBase, 1),
6369 SDValue(ResNode.getNode(), 1));
6372 return DAG.getBitcast(VT, ResNode);
6379 static Constant *getConstantVector(MVT VT, APInt SplatValue,
6380 unsigned SplatBitSize, LLVMContext &C) {
6381 unsigned ScalarSize = VT.getScalarSizeInBits();
6382 unsigned NumElm = SplatBitSize / ScalarSize;
6384 SmallVector<Constant *, 32> ConstantVec;
6385 for (unsigned i = 0; i < NumElm; i++) {
6386 APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
6387 Constant *Const;
6388 if (VT.isFloatingPoint()) {
6389 assert((ScalarSize == 32 || ScalarSize == 64) &&
6390 "Unsupported floating point scalar size");
6391 if (ScalarSize == 32)
6392 Const = ConstantFP::get(Type::getFloatTy(C), Val.bitsToFloat());
6394 Const = ConstantFP::get(Type::getDoubleTy(C), Val.bitsToDouble());
6396 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6397 ConstantVec.push_back(Const);
6399 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6402 static bool isUseOfShuffle(SDNode *N) {
6403 for (auto *U : N->uses()) {
6404 if (isTargetShuffle(U->getOpcode()))
6406 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6407 return isUseOfShuffle(U);
6412 /// Attempt to use the vbroadcast instruction to generate a splat value for the
6413 /// following cases:
6414 /// 1. A splat BUILD_VECTOR which uses:
6415 /// a. A single scalar load, or a constant.
6416 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6417 /// 2. A splat shuffle which uses a scalar_to_vector node which comes from
6418 /// a scalar load, or a constant.
6420 /// The VBROADCAST node is returned when a pattern is found,
6421 /// or SDValue() otherwise.
6422 static SDValue LowerVectorBroadcast(BuildVectorSDNode *BVOp, const X86Subtarget &Subtarget,
6423 SelectionDAG &DAG) {
6424 // VBROADCAST requires AVX.
6425 // TODO: Splats could be generated for non-AVX CPUs using SSE
6426 // instructions, but there's less potential gain for only 128-bit vectors.
6427 if (!Subtarget.hasAVX())
6430 MVT VT = BVOp->getSimpleValueType(0);
6433 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6434 "Unsupported vector type for broadcast.");
6436 BitVector UndefElements;
6437 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6439 // We need a splat of a single value to use broadcast, and it doesn't
6440 // make any sense if the value is only in one element of the vector.
6441 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6442 APInt SplatValue, Undef;
6443 unsigned SplatBitSize;
6444 bool HasUndef;
6445 // Check if this is a repeated constant pattern suitable for broadcasting.
6446 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6447 SplatBitSize > VT.getScalarSizeInBits() &&
6448 SplatBitSize < VT.getSizeInBits()) {
6449 // Avoid replacing with broadcast when it's a use of a shuffle
6450 // instruction to preserve the present custom lowering of shuffles.
6451 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6453 // replace BUILD_VECTOR with broadcast of the repeated constants.
6454 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6455 LLVMContext *Ctx = DAG.getContext();
6456 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6457 if (Subtarget.hasAVX()) {
6458 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6459 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6460 // Splatted value can fit in one INTEGER constant in constant pool.
6461 // Load the constant and broadcast it.
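// For example, a v8i32 build_vector repeating the pattern <1, 2> has a
// 64-bit splat value of 0x0000000200000001; it is loaded once from the
// constant pool and broadcast as a v4i64, then bitcast back to v8i32
// (typically a vpbroadcastq from memory under AVX2).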
6462 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6463 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6464 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6465 SDValue CP = DAG.getConstantPool(C, PVT);
6466 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6468 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6470 CVT, dl, DAG.getEntryNode(), CP,
6471 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6473 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6474 MVT::getVectorVT(CVT, Repeat), Ld);
6475 return DAG.getBitcast(VT, Brdcst);
6476 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6477 // Splatted value can fit in one FLOAT constant in constant pool.
6478 // Load the constant and broadcast it.
6479 // AVX has support for 32- and 64-bit broadcasts of floats only.
6480 // There is no 64-bit integer broadcast on 32-bit subtargets.
6481 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6482 Constant *C = SplatBitSize == 32
6483 ? ConstantFP::get(Type::getFloatTy(*Ctx),
6484 SplatValue.bitsToFloat())
6485 : ConstantFP::get(Type::getDoubleTy(*Ctx),
6486 SplatValue.bitsToDouble());
6487 SDValue CP = DAG.getConstantPool(C, PVT);
6488 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6490 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6492 CVT, dl, DAG.getEntryNode(), CP,
6493 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6495 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6496 MVT::getVectorVT(CVT, Repeat), Ld);
6497 return DAG.getBitcast(VT, Brdcst);
6498 } else if (SplatBitSize > 64) {
6499 // Load the vector of constants and broadcast it.
6500 MVT CVT = VT.getScalarType();
6501 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6503 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6504 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6505 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6507 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6508 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6510 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6511 return DAG.getBitcast(VT, Brdcst);
6518 bool ConstSplatVal =
6519 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6521 // Make sure that all of the users of a non-constant load are from the
6522 // BUILD_VECTOR node.
6523 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6526 unsigned ScalarSize = Ld.getValueSizeInBits();
6527 bool IsGE256 = (VT.getSizeInBits() >= 256);
6529 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6530 // instruction to save 8 or more bytes of constant pool data.
6531 // TODO: If multiple splats are generated to load the same constant,
6532 // it may be detrimental to overall size. There needs to be a way to detect
6533 // that condition to know if this is truly a size win.
6534 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6536 // Handle broadcasting a single constant scalar from the constant pool
6537 // into a vector.
6538 // On Sandybridge (no AVX2), it is still better to load a constant vector
6539 // from the constant pool and not to broadcast it from a scalar.
6540 // But override that restriction when optimizing for size.
6541 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6542 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6543 EVT CVT = Ld.getValueType();
6544 assert(!CVT.isVector() && "Must not broadcast a vector type");
6546 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6547 // For size optimization, also splat v2f64 and v2i64, and for size opt
6548 // with AVX2, also splat i8 and i16.
6549 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6550 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6551 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6552 const Constant *C = nullptr;
6553 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6554 C = CI->getConstantIntValue();
6555 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6556 C = CF->getConstantFPValue();
6558 assert(C && "Invalid constant type");
6560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6562 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6563 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6565 CVT, dl, DAG.getEntryNode(), CP,
6566 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6569 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6573 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6575 // Handle AVX2 in-register broadcasts.
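// AVX1 only provides memory-source forms of vbroadcastss/vbroadcastsd; AVX2
// adds register-source forms (and vpbroadcastb/w/d/q), so a splat of a value
// that is already in a register can be broadcast directly here.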
6576 if (!IsLoad && Subtarget.hasInt256() &&
6577 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6578 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6580 // The scalar source must be a normal load.
6581 if (!IsLoad)
6582 return SDValue();
6584 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6585 (Subtarget.hasVLX() && ScalarSize == 64))
6586 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6588 // The integer check is needed for the 64-bit into 128-bit case, so that it
6589 // doesn't match f64, since there is no vbroadcastsd with an xmm destination.
6590 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6591 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6592 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6595 // Unsupported broadcast.
6599 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6600 /// underlying vector and index.
6602 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6603 /// index.
6604 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6606 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6607 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6610 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6611 // lowered this:
6612 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6613 // to:
6614 // (extract_vector_elt (vector_shuffle<2,u,u,u>
6615 // (extract_subvector (v8f32 %vreg0), Constant<4>),
6616 // undef)
6617 // Constant<0>)
6618 // In this case the vector is the extract_subvector expression and the index
6619 // is 2, as specified by the shuffle.
6620 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6621 SDValue ShuffleVec = SVOp->getOperand(0);
6622 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6623 assert(ShuffleVecVT.getVectorElementType() ==
6624 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6626 int ShuffleIdx = SVOp->getMaskElt(Idx);
6627 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6628 ExtractedFromVec = ShuffleVec;
6634 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6635 MVT VT = Op.getSimpleValueType();
6637 // Skip if insert_vec_elt is not supported.
6638 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6639 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6643 unsigned NumElems = Op.getNumOperands();
6645 SDValue VecIn1;
6646 SDValue VecIn2;
6647 SmallVector<unsigned, 4> InsertIndices;
6648 SmallVector<int, 8> Mask(NumElems, -1);
6650 for (unsigned i = 0; i != NumElems; ++i) {
6651 unsigned Opc = Op.getOperand(i).getOpcode();
6653 if (Opc == ISD::UNDEF)
6656 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6657 // Quit if more than one element needs inserting.
6658 if (InsertIndices.size() > 1)
6661 InsertIndices.push_back(i);
6665 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6666 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6667 // Quit if non-constant index.
6668 if (!isa<ConstantSDNode>(ExtIdx))
6670 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6672 // Quit if extracted from vector of different type.
6673 if (ExtractedFromVec.getValueType() != VT)
6676 if (!VecIn1.getNode())
6677 VecIn1 = ExtractedFromVec;
6678 else if (VecIn1 != ExtractedFromVec) {
6679 if (!VecIn2.getNode())
6680 VecIn2 = ExtractedFromVec;
6681 else if (VecIn2 != ExtractedFromVec)
6682 // Quit if there are more than two vectors to shuffle.
6686 if (ExtractedFromVec == VecIn1)
6687 Mask[i] = Idx;
6688 else if (ExtractedFromVec == VecIn2)
6689 Mask[i] = Idx + NumElems;
6692 if (!VecIn1.getNode())
6695 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6696 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6697 for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
6698 unsigned Idx = InsertIndices[i];
6699 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6700 DAG.getIntPtrConstant(Idx, DL));
6706 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6707 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6708 Op.getScalarValueSizeInBits() == 1 &&
6709 "Can not convert non-constant vector");
6710 uint64_t Immediate = 0;
6711 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6712 SDValue In = Op.getOperand(idx);
6713 if (!In.isUndef())
6714 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6717 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
6718 return DAG.getConstant(Immediate, dl, VT);
6720 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
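// The constant elements are packed into an integer immediate (element i in
// bit i), materialized and bitcast to the mask type; any remaining
// non-constant elements are then inserted one at a time.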
6722 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
6724 MVT VT = Op.getSimpleValueType();
6725 assert((VT.getVectorElementType() == MVT::i1) &&
6726 "Unexpected type in LowerBUILD_VECTORvXi1!");
6729 if (ISD::isBuildVectorAllZeros(Op.getNode()))
6730 return DAG.getTargetConstant(0, dl, VT);
6732 if (ISD::isBuildVectorAllOnes(Op.getNode()))
6733 return DAG.getTargetConstant(1, dl, VT);
6735 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
6736 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
6737 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6738 return DAG.getBitcast(VT, Imm);
6739 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6740 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6741 DAG.getIntPtrConstant(0, dl));
6744 // Vector has one or more non-const elements
6745 uint64_t Immediate = 0;
6746 SmallVector<unsigned, 16> NonConstIdx;
6747 bool IsSplat = true;
6748 bool HasConstElts = false;
6749 int SplatIdx = -1;
6750 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
6751 SDValue In = Op.getOperand(idx);
6754 if (!isa<ConstantSDNode>(In))
6755 NonConstIdx.push_back(idx);
6756 else {
6757 Immediate |= cast<ConstantSDNode>(In)->getZExtValue() << idx;
6758 HasConstElts = true;
6759 }
6760 if (SplatIdx < 0)
6761 SplatIdx = idx;
6762 else if (In != Op.getOperand(SplatIdx))
6763 IsSplat = false;
6764 }
6766 // For a splat, use (select i1 splat_elt, all-ones, all-zeros).
6767 if (IsSplat)
6768 return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx),
6769 DAG.getConstant(1, dl, VT),
6770 DAG.getConstant(0, dl, VT));
6772 // Insert the non-constant elements one by one.
6773 SDValue DstVec;
6774 SDValue Imm;
6775 if (Immediate) {
6776 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
6777 Imm = DAG.getConstant(Immediate, dl, ImmVT);
6779 else if (HasConstElts)
6780 Imm = DAG.getConstant(0, dl, VT);
6781 else
6782 Imm = DAG.getUNDEF(VT);
6783 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
6784 DstVec = DAG.getBitcast(VT, Imm);
6785 else {
6786 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
6787 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
6788 DAG.getIntPtrConstant(0, dl));
6791 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
6792 unsigned InsertIdx = NonConstIdx[i];
6793 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
6794 Op.getOperand(InsertIdx),
6795 DAG.getIntPtrConstant(InsertIdx, dl));
6800 /// \brief Return true if \p N implements a horizontal binop and return the
6801 /// operands for the horizontal binop into V0 and V1.
6803 /// This is a helper function of LowerToHorizontalOp().
6804 /// This function checks that the build_vector \p N in input implements a
6805 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
6806 /// operation to match.
6807 /// For example, if \p Opcode is equal to ISD::ADD, then this function
6808 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
6809 /// is equal to ISD::SUB, then this function checks if this is a horizontal
6810 /// arithmetic sub.
6812 /// This function only analyzes elements of \p N whose indices are
6813 /// in range [BaseIdx, LastIdx).
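/// For example, with BaseIdx = 0 and LastIdx = 4, a v4f32 build_vector
/// <A0+A1, A2+A3, B0+B1, B2+B3> (each element an FADD of two
/// extract_vector_elts) matches with V0 = A and V1 = B, which is exactly the
/// lane layout produced by haddps.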
6814 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
6816 unsigned BaseIdx, unsigned LastIdx,
6817 SDValue &V0, SDValue &V1) {
6818 EVT VT = N->getValueType(0);
6820 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
6821 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
6822 "Invalid Vector in input!");
6824 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
6825 bool CanFold = true;
6826 unsigned ExpectedVExtractIdx = BaseIdx;
6827 unsigned NumElts = LastIdx - BaseIdx;
6828 V0 = DAG.getUNDEF(VT);
6829 V1 = DAG.getUNDEF(VT);
6831 // Check if N implements a horizontal binop.
6832 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
6833 SDValue Op = N->getOperand(i + BaseIdx);
6836 if (Op->isUndef()) {
6837 // Update the expected vector extract index.
6838 if (i * 2 == NumElts)
6839 ExpectedVExtractIdx = BaseIdx;
6840 ExpectedVExtractIdx += 2;
6844 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
6849 SDValue Op0 = Op.getOperand(0);
6850 SDValue Op1 = Op.getOperand(1);
6852 // Try to match the following pattern:
6853 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
6854 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6855 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
6856 Op0.getOperand(0) == Op1.getOperand(0) &&
6857 isa<ConstantSDNode>(Op0.getOperand(1)) &&
6858 isa<ConstantSDNode>(Op1.getOperand(1)));
6862 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
6863 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
6865 if (i * 2 < NumElts) {
6867 V0 = Op0.getOperand(0);
6868 if (V0.getValueType() != VT)
6873 V1 = Op0.getOperand(0);
6874 if (V1.getValueType() != VT)
6877 if (i * 2 == NumElts)
6878 ExpectedVExtractIdx = BaseIdx;
6881 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
6882 if (I0 == ExpectedVExtractIdx)
6883 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
6884 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
6885 // Try to match the following dag sequence:
6886 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
6887 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
6891 ExpectedVExtractIdx += 2;
6897 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
6898 /// a concat_vector.
6900 /// This is a helper function of LowerToHorizontalOp().
6901 /// This function expects two 256-bit vectors called V0 and V1.
6902 /// At first, each vector is split into two separate 128-bit vectors.
6903 /// Then, the resulting 128-bit vectors are used to implement two
6904 /// horizontal binary operations.
6906 /// The kind of horizontal binary operation is defined by \p X86Opcode.
6908 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
6909 /// the two new horizontal binop.
6910 /// When Mode is set, the first horizontal binop dag node would take as input
6911 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
6912 /// horizontal binop dag node would take as input the lower 128-bit of V1
6913 /// and the upper 128-bit of V1.
6915 /// HADD V0_LO, V0_HI
6916 /// HADD V1_LO, V1_HI
6918 /// Otherwise, the first horizontal binop dag node takes as input the lower
6919 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
6920 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
6922 /// HADD V0_LO, V1_LO
6923 /// HADD V0_HI, V1_HI
6925 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
6926 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
6927 /// the upper 128-bits of the result.
6928 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
6929 const SDLoc &DL, SelectionDAG &DAG,
6930 unsigned X86Opcode, bool Mode,
6931 bool isUndefLO, bool isUndefHI) {
6932 MVT VT = V0.getSimpleValueType();
6933 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
6934 "Invalid nodes in input!");
6936 unsigned NumElts = VT.getVectorNumElements();
6937 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
6938 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
6939 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
6940 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
6941 MVT NewVT = V0_LO.getSimpleValueType();
6943 SDValue LO = DAG.getUNDEF(NewVT);
6944 SDValue HI = DAG.getUNDEF(NewVT);
6946 if (Mode) {
6947 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6948 if (!isUndefLO && !V0->isUndef())
6949 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
6950 if (!isUndefHI && !V1->isUndef())
6951 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
6952 } else {
6953 // Don't emit a horizontal binop if the result is expected to be UNDEF.
6954 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
6955 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
6957 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
6958 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
6959 }
6961 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
6964 /// Returns true iff \p BV builds a vector with the result equivalent to
6965 /// the result of ADDSUB operation.
6966 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
6967 /// are written to the parameters \p Opnd0 and \p Opnd1.
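/// For example, a v4f32 build_vector <A0-B0, A1+B1, A2-B2, A3+B3>, built from
/// FSUB/FADD of matching extract_vector_elts of A and B, is equivalent to
/// addsubps A, B (subtract in the even lanes, add in the odd lanes).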
6968 static bool isAddSub(const BuildVectorSDNode *BV,
6969 const X86Subtarget &Subtarget, SelectionDAG &DAG,
6970 SDValue &Opnd0, SDValue &Opnd1) {
6972 MVT VT = BV->getSimpleValueType(0);
6973 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
6974 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
6975 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
6978 unsigned NumElts = VT.getVectorNumElements();
6979 SDValue InVec0 = DAG.getUNDEF(VT);
6980 SDValue InVec1 = DAG.getUNDEF(VT);
6982 // Odd-numbered elements in the input build vector are obtained from
6983 // adding two integer/float elements.
6984 // Even-numbered elements in the input build vector are obtained from
6985 // subtracting two integer/float elements.
6986 unsigned ExpectedOpcode = ISD::FSUB;
6987 unsigned NextExpectedOpcode = ISD::FADD;
6988 bool AddFound = false;
6989 bool SubFound = false;
6991 for (unsigned i = 0, e = NumElts; i != e; ++i) {
6992 SDValue Op = BV->getOperand(i);
6994 // Skip 'undef' values.
6995 unsigned Opcode = Op.getOpcode();
6996 if (Opcode == ISD::UNDEF) {
6997 std::swap(ExpectedOpcode, NextExpectedOpcode);
7001 // Early exit if we found an unexpected opcode.
7002 if (Opcode != ExpectedOpcode)
7005 SDValue Op0 = Op.getOperand(0);
7006 SDValue Op1 = Op.getOperand(1);
7008 // Try to match the following pattern:
7009 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7010 // Early exit if we cannot match that sequence.
7011 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7012 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7013 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7014 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7015 Op0.getOperand(1) != Op1.getOperand(1))
7018 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7022 // We found a valid add/sub node. Update the information accordingly.
7023 if (i & 1)
7024 AddFound = true;
7025 else
7026 SubFound = true;
7028 // Update InVec0 and InVec1.
7029 if (InVec0.isUndef()) {
7030 InVec0 = Op0.getOperand(0);
7031 if (InVec0.getSimpleValueType() != VT)
7034 if (InVec1.isUndef()) {
7035 InVec1 = Op1.getOperand(0);
7036 if (InVec1.getSimpleValueType() != VT)
7040 // Make sure that operands in input to each add/sub node always
7041 // come from a same pair of vectors.
7042 if (InVec0 != Op0.getOperand(0)) {
7043 if (ExpectedOpcode == ISD::FSUB)
7046 // FADD is commutable. Try to commute the operands
7047 // and then test again.
7048 std::swap(Op0, Op1);
7049 if (InVec0 != Op0.getOperand(0))
7053 if (InVec1 != Op1.getOperand(0))
7056 // Update the pair of expected opcodes.
7057 std::swap(ExpectedOpcode, NextExpectedOpcode);
7060 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7061 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7069 /// Returns true if it is possible to fold MUL and an idiom that has already been
7070 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7071 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7072 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7074 /// Prior to calling this function it should be known that there is some
7075 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7076 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7077 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7078 /// of \p Opnd0 uses is expected to be equal to 2.
7079 /// For example, this function may be called for the following IR:
7080 /// %AB = fmul fast <2 x double> %A, %B
7081 /// %Sub = fsub fast <2 x double> %AB, %C
7082 /// %Add = fadd fast <2 x double> %AB, %C
7083 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7084 /// <2 x i32> <i32 0, i32 3>
7085 /// There is a def for %Addsub here, which potentially can be replaced by
7086 /// X86ISD::ADDSUB operation:
7087 /// %Addsub = X86ISD::ADDSUB %AB, %C
7088 /// and such ADDSUB can further be replaced with FMADDSUB:
7089 /// %Addsub = FMADDSUB %A, %B, %C.
7091 /// The main reason why this method is called before the replacement of the
7092 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7093 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7094 /// FMADDSUB is.
7095 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7096 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7097 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7098 !Subtarget.hasAnyFMA())
7101 // FIXME: These checks must match the similar ones in
7102 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7103 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7104 // or MUL + ADDSUB to FMADDSUB.
7105 const TargetOptions &Options = DAG.getTarget().Options;
7106 bool AllowFusion =
7107 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7108 if (!AllowFusion)
7109 return false;
7112 Opnd1 = Opnd0.getOperand(1);
7113 Opnd0 = Opnd0.getOperand(0);
7118 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub'
7119 /// operation into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7120 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7121 const X86Subtarget &Subtarget,
7122 SelectionDAG &DAG) {
7123 SDValue Opnd0, Opnd1;
7124 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7127 MVT VT = BV->getSimpleValueType(0);
7130 // Try to generate X86ISD::FMADDSUB node here.
7131 SDValue Opnd2;
7132 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7133 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7135 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7136 // the ADDSUB idiom has been successfully recognized. There are no known
7137 // X86 targets with 512-bit ADDSUB instructions!
7138 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7139 // recognition.
7140 if (VT.is512BitVector())
7143 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7146 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7147 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7148 const X86Subtarget &Subtarget,
7149 SelectionDAG &DAG) {
7150 MVT VT = BV->getSimpleValueType(0);
7151 unsigned NumElts = VT.getVectorNumElements();
7152 unsigned NumUndefsLO = 0;
7153 unsigned NumUndefsHI = 0;
7154 unsigned Half = NumElts/2;
7156 // Count the number of UNDEF operands in the build_vector in input.
7157 for (unsigned i = 0, e = Half; i != e; ++i)
7158 if (BV->getOperand(i)->isUndef())
7159 NumUndefsLO++;
7161 for (unsigned i = Half, e = NumElts; i != e; ++i)
7162 if (BV->getOperand(i)->isUndef())
7163 NumUndefsHI++;
7165 // Early exit if this is either a build_vector of all UNDEFs or all the
7166 // operands but one are UNDEF.
7167 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7171 SDValue InVec0, InVec1;
7172 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7173 // Try to match an SSE3 float HADD/HSUB.
7174 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7175 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7177 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7178 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7179 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7180 // Try to match an SSSE3 integer HADD/HSUB.
7181 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7182 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7184 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7185 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7188 if (!Subtarget.hasAVX())
7191 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7192 // Try to match an AVX horizontal add/sub of packed single/double
7193 // precision floating point values from 256-bit vectors.
7194 SDValue InVec2, InVec3;
7195 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7196 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7197 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7198 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7199 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7201 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7202 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7203 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7204 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7205 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7206 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7207 // Try to match an AVX2 horizontal add/sub of signed integers.
7208 SDValue InVec2, InVec3;
7209 unsigned X86Opcode;
7210 bool CanFold = true;
7212 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7213 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7214 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7215 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7216 X86Opcode = X86ISD::HADD;
7217 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7218 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7219 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7220 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7221 X86Opcode = X86ISD::HSUB;
7222 else
7223 CanFold = false;
7225 if (CanFold) {
7226 // Fold this build_vector into a single horizontal add/sub.
7227 // Do this only if the target has AVX2.
7228 if (Subtarget.hasAVX2())
7229 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7231 // Do not try to expand this build_vector into a pair of horizontal
7232 // add/sub if we can emit a pair of scalar add/sub.
7233 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7236 // Convert this build_vector into a pair of horizontal binops followed by
7237 // a concat vector.
7238 bool isUndefLO = NumUndefsLO == Half;
7239 bool isUndefHI = NumUndefsHI == Half;
7240 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7241 isUndefLO, isUndefHI);
7245 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7246 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7247 unsigned X86Opcode;
7248 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7249 X86Opcode = X86ISD::HADD;
7250 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7251 X86Opcode = X86ISD::HSUB;
7252 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7253 X86Opcode = X86ISD::FHADD;
7254 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7255 X86Opcode = X86ISD::FHSUB;
7256 else
7257 return SDValue();
7259 // Don't try to expand this build_vector into a pair of horizontal add/sub
7260 // if we can simply emit a pair of scalar add/sub.
7261 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7264 // Convert this build_vector into two horizontal add/sub followed by
7265 // a concat vector.
7266 bool isUndefLO = NumUndefsLO == Half;
7267 bool isUndefHI = NumUndefsHI == Half;
7268 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7269 isUndefLO, isUndefHI);
7275 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7276 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7277 /// just apply the bit to the vectors.
7278 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7279 /// from this, but enough scalar bit operations are created from the later
7280 /// legalization + scalarization stages to need basic support.
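/// For example, a v4i32 build_vector <(x0 & 1), (x1 & 2), (x2 & 4), (x3 & 8)>
/// is lowered to (and (build_vector x0,x1,x2,x3), (build_vector 1,2,4,8)),
/// i.e. a single vector AND of two build_vectors.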
7281 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7282 SelectionDAG &DAG) {
7284 MVT VT = Op->getSimpleValueType(0);
7285 unsigned NumElems = VT.getVectorNumElements();
7286 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7288 // Check that all elements have the same opcode.
7289 // TODO: Should we allow UNDEFS and if so how many?
7290 unsigned Opcode = Op->getOperand(0).getOpcode();
7291 for (unsigned i = 1; i < NumElems; ++i)
7292 if (Opcode != Op->getOperand(i).getOpcode())
7295 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7296 switch (Opcode) {
7297 default:
7298 return SDValue();
7299 case ISD::AND:
7300 case ISD::OR:
7301 case ISD::XOR:
7302 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7307 SmallVector<SDValue, 4> LHSElts, RHSElts;
7308 for (SDValue Elt : Op->ops()) {
7309 SDValue LHS = Elt.getOperand(0);
7310 SDValue RHS = Elt.getOperand(1);
7312 // We expect the canonicalized RHS operand to be the constant.
7313 if (!isa<ConstantSDNode>(RHS))
7315 LHSElts.push_back(LHS);
7316 RHSElts.push_back(RHS);
7319 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7320 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7321 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7324 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7325 /// functionality to do this, so it's all zeros, all ones, or some derivation
7326 /// that is cheap to calculate.
7327 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7328 const X86Subtarget &Subtarget) {
7330 MVT VT = Op.getSimpleValueType();
7332 // Vectors containing all zeros can be matched by pxor and xorps.
7333 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7334 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7335 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7336 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7337 return Op;
7339 return getZeroVector(VT, Subtarget, DAG, DL);
7342 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7343 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7344 // vpcmpeqd on 256-bit vectors.
7345 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7346 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7347 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7348 return Op;
7350 return getOnesVector(VT, Subtarget, DAG, DL);
7357 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7360 MVT VT = Op.getSimpleValueType();
7361 MVT ExtVT = VT.getVectorElementType();
7362 unsigned NumElems = Op.getNumOperands();
7364 // Generate vectors for predicate vectors.
7365 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7366 return LowerBUILD_VECTORvXi1(Op, DAG);
7368 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7369 return VectorConstant;
7371 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7372 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7374 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7375 return HorizontalOp;
7376 if (SDValue Broadcast = LowerVectorBroadcast(BV, Subtarget, DAG))
7378 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7381 unsigned EVTBits = ExtVT.getSizeInBits();
7383 unsigned NumZero = 0;
7384 unsigned NumNonZero = 0;
7385 uint64_t NonZeros = 0;
7386 bool IsAllConstants = true;
7387 SmallSet<SDValue, 8> Values;
7388 for (unsigned i = 0; i < NumElems; ++i) {
7389 SDValue Elt = Op.getOperand(i);
7390 if (Elt.isUndef())
7391 continue;
7392 Values.insert(Elt);
7393 if (Elt.getOpcode() != ISD::Constant &&
7394 Elt.getOpcode() != ISD::ConstantFP)
7395 IsAllConstants = false;
7396 if (X86::isZeroNode(Elt))
7397 NumZero++;
7398 else {
7399 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7400 NonZeros |= ((uint64_t)1 << i);
7401 NumNonZero++;
7402 }
7405 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7406 if (NumNonZero == 0)
7407 return DAG.getUNDEF(VT);
7409 // Special case for single non-zero, non-undef, element.
7410 if (NumNonZero == 1) {
7411 unsigned Idx = countTrailingZeros(NonZeros);
7412 SDValue Item = Op.getOperand(Idx);
7414 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7415 // the value are obviously zero, truncate the value to i32 and do the
7416 // insertion that way. Only do this if the value is non-constant or if the
7417 // value is a constant being inserted into element 0. It is cheaper to do
7418 // a constant pool load than it is to do a movd + shuffle.
7419 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7420 (!IsAllConstants || Idx == 0)) {
7421 if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
7423 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7424 MVT VecVT = MVT::v4i32;
7426 // Truncate the value (which may itself be a constant) to i32, and
7427 // convert it to a vector with movd (S2V+shuffle to zero extend).
7428 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7429 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7430 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7431 Item, Idx * 2, true, Subtarget, DAG));
7435 // If we have a constant or non-constant insertion into the low element of
7436 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7437 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7438 // depending on what the source datatype is.
7439 if (Idx == 0) {
7440 if (NumZero == 0)
7441 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7443 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7444 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7445 assert((VT.is128BitVector() || VT.is256BitVector() ||
7446 VT.is512BitVector()) &&
7447 "Expected an SSE value type!");
7448 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7449 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7450 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7453 // We can't directly insert an i8 or i16 into a vector, so zero extend
7454 // it to i32 first.
7455 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7456 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7457 if (VT.getSizeInBits() >= 256) {
7458 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7459 if (Subtarget.hasAVX()) {
7460 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
7461 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7462 } else {
7463 // Without AVX, we need to extend to a 128-bit vector and then
7464 // insert into the 256-bit vector.
7465 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7466 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
7467 Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
7468 }
7469 } else {
7470 assert(VT.is128BitVector() && "Expected an SSE value type!");
7471 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7472 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7474 return DAG.getBitcast(VT, Item);
7478 // Is it a vector logical left shift?
7479 if (NumElems == 2 && Idx == 1 &&
7480 X86::isZeroNode(Op.getOperand(0)) &&
7481 !X86::isZeroNode(Op.getOperand(1))) {
7482 unsigned NumBits = VT.getSizeInBits();
7483 return getVShift(true, VT,
7484 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7485 VT, Op.getOperand(1)),
7486 NumBits/2, DAG, *this, dl);
7489 if (IsAllConstants) // Otherwise, it's better to do a constpool load.
7490 return SDValue();
7492 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7493 // is a non-constant being inserted into an element other than the low one,
7494 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
7495 // movd/movss) to move this into the low element, then shuffle it into
7496 // place.
7497 if (EVTBits == 32) {
7498 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7499 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7503 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7504 if (Values.size() == 1) {
7505 if (EVTBits == 32) {
7506 // Instead of a shuffle like this:
7507 // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
7508 // Check if it's possible to issue this instead.
7509 // shuffle (vload ptr)), undef, <1, 1, 1, 1>
7510 unsigned Idx = countTrailingZeros(NonZeros);
7511 SDValue Item = Op.getOperand(Idx);
7512 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7513 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
7518 // A vector full of immediates; various special cases are already
7519 // handled, so this is best done with a single constant-pool load.
7520 if (IsAllConstants)
7521 return SDValue();
7523 // See if we can use a vector load to get all of the elements.
7524 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7525 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7526 if (SDValue LD = EltsFromConsecutiveLoads(VT, Ops, dl, DAG, false))
7530 // For AVX-length vectors, build the individual 128-bit pieces and use
7531 // shuffles to put them in place.
7532 if (VT.is256BitVector() || VT.is512BitVector()) {
7533 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7535 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7537 // Build both the lower and upper subvector.
7538 SDValue Lower =
7539 DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7540 SDValue Upper = DAG.getBuildVector(
7541 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7543 // Recreate the wider vector with the lower and upper part.
7544 if (VT.is256BitVector())
7545 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7546 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7549 // Let legalizer expand 2-wide build_vectors.
7550 if (EVTBits == 64) {
7551 if (NumNonZero == 1) {
7552 // One half is zero or undef.
7553 unsigned Idx = countTrailingZeros(NonZeros);
7554 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7555 Op.getOperand(Idx));
7556 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7561 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7562 if (EVTBits == 8 && NumElems == 16)
7563 if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
7564 DAG, Subtarget, *this))
7567 if (EVTBits == 16 && NumElems == 8)
7568 if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
7569 DAG, Subtarget, *this))
7572 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7573 if (EVTBits == 32 && NumElems == 4)
7574 if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
7577 // If element VT is == 32 bits, turn it into a number of shuffles.
7578 if (NumElems == 4 && NumZero > 0) {
7579 SmallVector<SDValue, 8> Ops(NumElems);
7580 for (unsigned i = 0; i < 4; ++i) {
7581 bool isZero = !(NonZeros & (1ULL << i));
7582 if (isZero)
7583 Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
7584 else
7585 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7588 for (unsigned i = 0; i < 2; ++i) {
7589 switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
7590 default: break;
7591 case 0:
7592 Ops[i] = Ops[i*2]; // Must be a zero vector.
7593 break;
7594 case 1:
7595 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
7596 break;
7597 case 2:
7598 Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7599 break;
7600 case 3:
7601 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
7602 break;
7603 }
7604 }
7606 bool Reverse1 = (NonZeros & 0x3) == 2;
7607 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
7608 int MaskVec[] = {
7609 Reverse1 ? 1 : 0,
7610 Reverse1 ? 0 : 1,
7611 static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
7612 static_cast<int>(Reverse2 ? NumElems : NumElems+1)
7613 };
7614 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7617 if (Values.size() > 1 && VT.is128BitVector()) {
7618 // Check for a build vector from mostly shuffle plus few inserting.
7619 if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
7622 // For SSE 4.1, use insertps to put the high elements into the low element.
7623 if (Subtarget.hasSSE41()) {
7624 SDValue Result;
7625 if (!Op.getOperand(0).isUndef())
7626 Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
7627 else
7628 Result = DAG.getUNDEF(VT);
7630 for (unsigned i = 1; i < NumElems; ++i) {
7631 if (Op.getOperand(i).isUndef()) continue;
7632 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
7633 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
7634 }
7635 return Result;
7636 }
7638 // Otherwise, expand into a number of unpckl*, start by extending each of
7639 // our (non-undef) elements to the full vector width with the element in the
7640 // bottom slot of the vector (which generates no code for SSE).
7641 SmallVector<SDValue, 8> Ops(NumElems);
7642 for (unsigned i = 0; i < NumElems; ++i) {
7643 if (!Op.getOperand(i).isUndef())
7644 Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7646 Ops[i] = DAG.getUNDEF(VT);
7649 // Next, we iteratively mix elements, e.g. for v4f32:
7650 // Step 1: unpcklps 0, 2 ==> X: <?, ?, 2, 0>
7651 // : unpcklps 1, 3 ==> Y: <?, ?, 3, 1>
7652 // Step 2: unpcklps X, Y ==> <3, 2, 1, 0>
7653 unsigned EltStride = NumElems >> 1;
7654 while (EltStride != 0) {
7655 for (unsigned i = 0; i < EltStride; ++i) {
7656 // If Ops[i+EltStride] is undef and this is the first round of mixing,
7657 // then it is safe to just drop this shuffle: V[i] is already in the
7658 // right place, the one element (since it's the first round) being
7659 // inserted as undef can be dropped. This isn't safe for successive
7660 // rounds because they will permute elements within both vectors.
7661 if (Ops[i+EltStride].isUndef() &&
7662 EltStride == NumElems/2)
7663 continue;
7665 Ops[i] = getUnpackl(DAG, dl, VT, Ops[i], Ops[i + EltStride]);
7666 }
7667 EltStride >>= 1;
7668 }
7669 return Ops[0];
7674 // 256-bit AVX can use the vinsertf128 instruction
7675 // to create 256-bit vectors from two other 128-bit ones.
7676 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
7678 MVT ResVT = Op.getSimpleValueType();
7680 assert((ResVT.is256BitVector() ||
7681 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7683 SDValue V1 = Op.getOperand(0);
7684 SDValue V2 = Op.getOperand(1);
7685 unsigned NumElems = ResVT.getVectorNumElements();
7686 if (ResVT.is256BitVector())
7687 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7689 if (Op.getNumOperands() == 4) {
7690 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7691 ResVT.getVectorNumElements()/2);
7692 SDValue V3 = Op.getOperand(2);
7693 SDValue V4 = Op.getOperand(3);
7694 return concat256BitVectors(
7695 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
7696 concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
7697 NumElems, DAG, dl);
7698 }
7699 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7702 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
7703 const X86Subtarget &Subtarget,
7704 SelectionDAG & DAG) {
7706 MVT ResVT = Op.getSimpleValueType();
7707 unsigned NumOfOperands = Op.getNumOperands();
7709 assert(isPowerOf2_32(NumOfOperands) &&
7710 "Unexpected number of operands in CONCAT_VECTORS");
7712 SDValue Undef = DAG.getUNDEF(ResVT);
7713 if (NumOfOperands > 2) {
7714 // Specialize the cases when all, or all but one, of the operands are undef.
7715 unsigned NumOfDefinedOps = 0;
7716 unsigned OpIdx = 0;
7717 for (unsigned i = 0; i < NumOfOperands; i++)
7718 if (!Op.getOperand(i).isUndef()) {
7719 NumOfDefinedOps++;
7720 OpIdx = i;
7721 }
7722 if (NumOfDefinedOps == 0)
7723 return Undef;
7724 if (NumOfDefinedOps == 1) {
7725 unsigned SubVecNumElts =
7726 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
7727 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
7728 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
7729 Op.getOperand(OpIdx), IdxVal);
7732 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7733 ResVT.getVectorNumElements()/2);
7734 SmallVector<SDValue, 2> Ops;
7735 for (unsigned i = 0; i < NumOfOperands/2; i++)
7736 Ops.push_back(Op.getOperand(i));
7737 SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7738 Ops.clear();
7739 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
7740 Ops.push_back(Op.getOperand(i));
7741 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
7742 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
7746 SDValue V1 = Op.getOperand(0);
7747 SDValue V2 = Op.getOperand(1);
7748 unsigned NumElems = ResVT.getVectorNumElements();
7749 assert(V1.getValueType() == V2.getValueType() &&
7750 V1.getValueType().getVectorNumElements() == NumElems/2 &&
7751 "Unexpected operands in CONCAT_VECTORS");
7753 if (ResVT.getSizeInBits() >= 16)
7754 return Op; // The operation is legal with KUNPCK
7756 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
7757 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
7758 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
7759 if (IsZeroV1 && IsZeroV2)
7762 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
7764 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7766 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
7768 SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
7770 V2 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
7773 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
7775 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
7776 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
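// Illustrative sketch (not part of the lowering above): the index arithmetic
// behind building a vXi1 concat out of INSERT_SUBVECTOR nodes. Plain bool
// arrays stand in for the DAG values; the helper name exampleConcatMaskVectors
// is hypothetical.
static void exampleConcatMaskVectors() {
  bool V1[4] = {true, false, true, true};   // becomes the low half
  bool V2[4] = {false, false, true, false}; // becomes the high half
  bool Res[8] = {};
  // Equivalent of INSERT_SUBVECTOR(Undef, V1, 0) followed by
  // INSERT_SUBVECTOR(..., V2, NumElems / 2).
  for (int i = 0; i != 4; ++i)
    Res[i] = V1[i];
  for (int i = 0; i != 4; ++i)
    Res[4 + i] = V2[i];
  (void)Res; // Res now holds the 8-element concatenation <V1, V2>.
}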
7779 static SDValue LowerCONCAT_VECTORS(SDValue Op,
7780 const X86Subtarget &Subtarget,
7781 SelectionDAG &DAG) {
7782 MVT VT = Op.getSimpleValueType();
7783 if (VT.getVectorElementType() == MVT::i1)
7784 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
7786 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
7787 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
7788 Op.getNumOperands() == 4)));
7790 // AVX can use the vinsertf128 instruction to create 256-bit vectors
7791 // from two other 128-bit ones.
7793 // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
7794 return LowerAVXCONCAT_VECTORS(Op, DAG);
7797 //===----------------------------------------------------------------------===//
7798 // Vector shuffle lowering
7800 // This is an experimental code path for lowering vector shuffles on x86. It is
7801 // designed to handle arbitrary vector shuffles and blends, gracefully
7802 // degrading performance as necessary. It works hard to recognize idiomatic
7803 // shuffles and lower them to optimal instruction patterns without leaving
7804 /// a framework that allows reasonably efficient handling of all vector shuffle patterns.
7806 //===----------------------------------------------------------------------===//
7808 /// \brief Tiny helper function to identify a no-op mask.
7810 /// This is a somewhat boring predicate function. It checks whether the mask
7811 /// array input, which is assumed to be a single-input shuffle mask of the kind
7812 /// used by the X86 shuffle instructions (not a fully general
7813 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
7814 /// in-place shuffle are 'no-op's.
7815 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
7816 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
7817 assert(Mask[i] >= -1 && "Out of bound mask element!");
7818 if (Mask[i] >= 0 && Mask[i] != i)
7824 /// \brief Test whether there are elements crossing 128-bit lanes in this shuffle mask.
7827 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
7828 /// and we routinely test for these.
7829 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
7830 int LaneSize = 128 / VT.getScalarSizeInBits();
7831 int Size = Mask.size();
7832 for (int i = 0; i < Size; ++i)
7833 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
7838 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
7840 /// This checks a shuffle mask to see if it is performing the same
7841 /// lane-relative shuffle in each sub-lane. This trivially implies
7842 /// that it is also not lane-crossing. It may however involve a blend from the
7843 /// same lane of a second vector.
7845 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
7846 /// non-trivial to compute in the face of undef lanes. The representation is
7847 /// suitable for use with existing 128-bit shuffles as entries from the second
7848 /// vector have been remapped to [LaneSize, 2*LaneSize).
7849 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
7851 SmallVectorImpl<int> &RepeatedMask) {
7852 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7853 RepeatedMask.assign(LaneSize, -1);
7854 int Size = Mask.size();
7855 for (int i = 0; i < Size; ++i) {
7856 assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
7859 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7860 // This entry crosses lanes, so there is no way to model this shuffle.
7863 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7864 // Adjust second vector indices to start at LaneSize instead of Size.
7865 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
7866 : Mask[i] % LaneSize + LaneSize;
7867 if (RepeatedMask[i % LaneSize] < 0)
7868 // This is the first non-undef entry in this slot of a 128-bit lane.
7869 RepeatedMask[i % LaneSize] = LocalM;
7870 else if (RepeatedMask[i % LaneSize] != LocalM)
7871 // Found a mismatch with the repeated mask.
7877 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
7879 is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7880 SmallVectorImpl<int> &RepeatedMask) {
7881 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
7884 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
7886 is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
7887 SmallVectorImpl<int> &RepeatedMask) {
7888 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
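// Illustrative sketch (not part of the lowering code): what "repeated within
// each 128-bit lane" means for a v8i32 mask. The helper name is hypothetical
// and the lane-crossing check is omitted for brevity.
static bool exampleRepeatedLaneMask() {
  // v8i32 has two 128-bit lanes of four elements each. This mask performs the
  // same local shuffle <1, 0, 3, 2> in both lanes, so it is lane-repeated.
  int Mask[8] = {1, 0, 3, 2, 5, 4, 7, 6};
  int Repeated[4] = {-1, -1, -1, -1};
  for (int i = 0; i != 8; ++i) {
    // Remap entries from the second input to [LaneSize, 2*LaneSize).
    int Local = Mask[i] % 4 + (Mask[i] >= 8 ? 4 : 0);
    if (Repeated[i % 4] < 0)
      Repeated[i % 4] = Local;
    else if (Repeated[i % 4] != Local)
      return false;
  }
  return true; // Repeated == {1, 0, 3, 2}
}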
7891 /// Test whether a target shuffle mask is equivalent within each sub-lane.
7892 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
7893 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
7895 SmallVectorImpl<int> &RepeatedMask) {
7896 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
7897 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
7898 int Size = Mask.size();
7899 for (int i = 0; i < Size; ++i) {
7900 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
7901 if (Mask[i] == SM_SentinelUndef)
7903 if (Mask[i] == SM_SentinelZero) {
7904 if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
7906 RepeatedMask[i % LaneSize] = SM_SentinelZero;
7909 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
7910 // This entry crosses lanes, so there is no way to model this shuffle.
7913 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
7914 // Adjust second vector indices to start at LaneSize instead of Size.
7916 Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
7917 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
7918 // This is the first non-undef entry in this slot of a 128-bit lane.
7919 RepeatedMask[i % LaneSize] = LocalM;
7920 else if (RepeatedMask[i % LaneSize] != LocalM)
7921 // Found a mismatch with the repeated mask.
7927 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
7930 /// This is a fast way to test a shuffle mask against a fixed pattern:
7932 /// if (isShuffleEquivalent(Mask, 3, 2, {1, 0})) { ... }
7934 /// It returns true if the mask is exactly as wide as the argument list, and
7935 /// each element of the mask is either -1 (signifying undef) or the value given
7936 /// in the argument.
7937 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
7938 ArrayRef<int> ExpectedMask) {
7939 if (Mask.size() != ExpectedMask.size())
7942 int Size = Mask.size();
7944 // If the values are build vectors, we can look through them to find
7945 // equivalent inputs that make the shuffles equivalent.
7946 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
7947 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
7949 for (int i = 0; i < Size; ++i) {
7950 assert(Mask[i] >= -1 && "Out of bound mask element!");
7951 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
7952 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
7953 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
7954 if (!MaskBV || !ExpectedBV ||
7955 MaskBV->getOperand(Mask[i] % Size) !=
7956 ExpectedBV->getOperand(ExpectedMask[i] % Size))
7964 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
7966 /// The masks must be exactly the same width.
7968 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
7969 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
7971 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
7972 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
7973 ArrayRef<int> ExpectedMask) {
7974 int Size = Mask.size();
7975 if (Size != (int)ExpectedMask.size())
7978 for (int i = 0; i < Size; ++i)
7979 if (Mask[i] == SM_SentinelUndef)
7981 else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
7983 else if (Mask[i] != ExpectedMask[i])
7989 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
7991 /// This helper function produces an 8-bit shuffle immediate corresponding to
7992 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
7993 /// shuffling 4 lanes. It can be used with most of the PSHUF instructions for example.
7996 /// NB: We rely heavily on "undef" masks preserving the input lane.
7997 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
7998 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
7999 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8000 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8001 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8002 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
8005 Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8006 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8007 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
8008 Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
8012 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, SDLoc DL,
8013 SelectionDAG &DAG) {
8014 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
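// Illustrative sketch (not part of the lowering code): how the 2-bits-per-lane
// immediate assembled above encodes a mask. For Mask = {3, 1, 2, 0} the result
// is 0b00100111 == 0x27. The helper name is hypothetical.
static unsigned exampleV4ShuffleImm() {
  int Mask[4] = {3, 1, 2, 0};
  unsigned Imm = 0;
  for (int i = 0; i != 4; ++i)
    Imm |= (unsigned)Mask[i] << (2 * i); // destination lane i reads source lane Mask[i]
  return Imm; // 0x27
}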
8017 /// \brief Compute whether each element of a shuffle is zeroable.
8019 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8020 /// Either it is an undef element in the shuffle mask, the element of the input
8021 /// referenced is undef, or the element of the input referenced is known to be
8022 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
8023 /// as many lanes with this technique as possible to simplify the remaining shuffle.
8025 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
8026 SDValue V1, SDValue V2) {
8027 SmallBitVector Zeroable(Mask.size(), false);
8028 V1 = peekThroughBitcasts(V1);
8029 V2 = peekThroughBitcasts(V2);
8031 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8032 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8034 int VectorSizeInBits = V1.getValueSizeInBits();
8035 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8036 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8038 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8040 // Handle the easy cases.
8041 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8046 // Determine shuffle input and normalize the mask.
8047 SDValue V = M < Size ? V1 : V2;
8050 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8051 if (V.getOpcode() != ISD::BUILD_VECTOR)
8054 // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
8055 // the (larger) source element must be UNDEF/ZERO.
8056 if ((Size % V.getNumOperands()) == 0) {
8057 int Scale = Size / V->getNumOperands();
8058 SDValue Op = V.getOperand(M / Scale);
8059 if (Op.isUndef() || X86::isZeroNode(Op))
8061 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8062 APInt Val = Cst->getAPIntValue();
8063 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
8064 Val = Val.getLoBits(ScalarSizeInBits);
8065 Zeroable[i] = (Val == 0);
8066 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8067 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8068 Val = Val.lshr((M % Scale) * ScalarSizeInBits);
8069 Val = Val.getLoBits(ScalarSizeInBits);
8070 Zeroable[i] = (Val == 0);
8075 // If the BUILD_VECTOR has more elements, then all the (smaller) source
8076 // elements must be UNDEF or ZERO.
8077 if ((V.getNumOperands() % Size) == 0) {
8078 int Scale = V->getNumOperands() / Size;
8079 bool AllZeroable = true;
8080 for (int j = 0; j < Scale; ++j) {
8081 SDValue Op = V.getOperand((M * Scale) + j);
8082 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
8084 Zeroable[i] = AllZeroable;
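// Illustrative sketch (not part of the lowering code): the kind of result
// computeZeroableShuffleElements produces, with plain arrays standing in for
// the DAG nodes. The helper name is hypothetical.
static void exampleZeroableElements() {
  // Shuffle over two v4i32 inputs where V1 = <x, y, 0, 0> (elements 2 and 3
  // are known-zero constants) and V2 is entirely undef.
  int Mask[4] = {2, 3, -1, 0};
  bool V1KnownZero[4] = {false, false, true, true};
  bool Zeroable[4];
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    // Undef mask entries, references to known-zero V1 elements, and any
    // reference into the undef V2 are all zeroable.
    Zeroable[i] = (M < 0) || (M < 4 ? V1KnownZero[M] : true);
  }
  (void)Zeroable; // {true, true, true, false}
}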
8092 // The shuffle result is of the form:
8093 //   0*a[0] 0*a[1] ... 0*a[n], n >= 0, where the a[] elements appear in ascending order.
8094 // Each element of Zeroable corresponds to a particular Mask element,
8095 // as described in the computeZeroableShuffleElements function.
8097 // This function looks for a sub-mask whose nonzero elements are in
8098 // increasing order. If such a sub-mask exists, the function returns true.
8099 static bool isNonZeroElementsInOrder(const SmallBitVector Zeroable,
8100 ArrayRef<int> Mask,const EVT &VectorType,
8101 bool &IsZeroSideLeft) {
8102 int NextElement = -1;
8103 // Check if the Mask's nonzero elements are in increasing order.
8104 for (int i = 0, e = Zeroable.size(); i < e; i++) {
8105 // Check that the mask's zero elements are built from only zeros.
8110 // Find the lowest non-zero element.
8111 if (NextElement == -1) {
8112 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8113 IsZeroSideLeft = NextElement != 0;
8115 // Exit if the mask's non-zero elements are not in increasing order.
8116 if (NextElement != Mask[i])
8123 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8124 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8125 ArrayRef<int> Mask, SDValue V1,
8127 const SmallBitVector &Zeroable,
8128 const X86Subtarget &Subtarget,
8129 SelectionDAG &DAG) {
8130 int Size = Mask.size();
8131 int LaneSize = 128 / VT.getScalarSizeInBits();
8132 const int NumBytes = VT.getSizeInBits() / 8;
8133 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8135 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8136 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8137 (Subtarget.hasBWI() && VT.is512BitVector()));
8139 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8140 // Sign bit set in i8 mask means zero element.
8141 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8144 for (int i = 0; i < NumBytes; ++i) {
8145 int M = Mask[i / NumEltBytes];
8147 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8150 if (Zeroable[i / NumEltBytes]) {
8151 PSHUFBMask[i] = ZeroMask;
8155 // We can only use a single input of V1 or V2.
8156 SDValue SrcV = (M >= Size ? V2 : V1);
8162 // PSHUFB can't cross lanes, ensure this doesn't happen.
8163 if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
8167 M = M * NumEltBytes + (i % NumEltBytes);
8168 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8170 assert(V && "Failed to find a source input");
8172 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8173 return DAG.getBitcast(
8174 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8175 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
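// Illustrative sketch (not part of the lowering code): how a v4i32 element
// mask expands into the 16-entry PSHUFB byte mask built above. The helper name
// is hypothetical.
static void examplePSHUFBByteMask() {
  int Mask[4] = {0, 0, 2, 2}; // single-input v4i32 shuffle
  unsigned char ByteMask[16];
  for (int i = 0; i != 16; ++i) {
    int M = Mask[i / 4]; // NumEltBytes == 4 for 32-bit elements
    ByteMask[i] = (unsigned char)(M * 4 + (i % 4));
  }
  (void)ByteMask; // {0,1,2,3, 0,1,2,3, 8,9,10,11, 8,9,10,11}
}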
8178 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8179 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8182 // convertBitVectorToUnsiged - Takes a SmallBitVector as an argument
8183 // and converts it to an unsigned integer.
8184 // The output of the function is not(Zeroable), i.e. the complement of the zeroable bits.
8185 static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
8186 unsigned convertBit = 0;
8187 for (int i = 0, e = Zeroable.size(); i < e; i++)
8188 convertBit |= !(Zeroable[i]) << i;
8192 // X86 has a dedicated shuffle that can be lowered to VEXPAND.
8193 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8194 const SmallBitVector &Zeroable,
8195 ArrayRef<int> Mask, SDValue &V1,
8196 SDValue &V2, SelectionDAG &DAG,
8197 const X86Subtarget &Subtarget) {
8198 bool IsLeftZeroSide = true;
8199 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8202 unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
8204 MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8205 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8206 unsigned NumElts = VT.getVectorNumElements();
8207 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8208 "Unexpected number of vector elements");
8209 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8210 Subtarget, DAG, DL);
8211 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8212 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8213 return DAG.getNode(ISD::VSELECT, DL, VT, VMask,
8214 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8218 // X86 has dedicated unpack instructions that can handle specific blend
8219 // operations: UNPCKH and UNPCKL.
8220 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8221 ArrayRef<int> Mask, SDValue V1,
8222 SDValue V2, SelectionDAG &DAG) {
8223 SmallVector<int, 8> Unpckl;
8224 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8225 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8226 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8228 SmallVector<int, 8> Unpckh;
8229 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8230 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8231 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8233 // Commute and try again.
8234 ShuffleVectorSDNode::commuteMask(Unpckl);
8235 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8236 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8238 ShuffleVectorSDNode::commuteMask(Unpckh);
8239 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8240 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
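// Illustrative sketch (not part of the lowering code): the unpack mask shapes
// the equivalence checks above are matching, shown for v4i32. The helper name
// is hypothetical.
static void exampleUnpackMasks() {
  const int NumElts = 4;
  int Unpckl[NumElts], Unpckh[NumElts];
  for (int i = 0; i != NumElts / 2; ++i) {
    // UNPCKL interleaves the low halves of V1 and V2; UNPCKH the high halves.
    Unpckl[2 * i] = i;
    Unpckl[2 * i + 1] = i + NumElts;
    Unpckh[2 * i] = i + NumElts / 2;
    Unpckh[2 * i + 1] = i + NumElts / 2 + NumElts;
  }
  (void)Unpckl; // {0, 4, 1, 5}
  (void)Unpckh; // {2, 6, 3, 7}
}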
8245 /// \brief Try to emit a bitmask instruction for a shuffle.
8247 /// This handles cases where we can model a blend exactly as a bitmask due to
8248 /// one of the inputs being zeroable.
8249 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8250 SDValue V2, ArrayRef<int> Mask,
8251 const SmallBitVector &Zeroable,
8252 SelectionDAG &DAG) {
8253 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8254 MVT EltVT = VT.getVectorElementType();
8255 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8257 DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
8258 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
8260 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8263 if (Mask[i] % Size != i)
8264 return SDValue(); // Not a blend.
8266 V = Mask[i] < Size ? V1 : V2;
8267 else if (V != (Mask[i] < Size ? V1 : V2))
8268 return SDValue(); // Can only let one input through the mask.
8270 VMaskOps[i] = AllOnes;
8273 return SDValue(); // No non-zeroable elements!
8275 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8276 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8279 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8281 /// This is used as a fallback approach when first class blend instructions are
8282 /// unavailable. Currently it is only suitable for integer vectors, but could
8283 /// be generalized for floating point vectors if desirable.
8284 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8285 SDValue V2, ArrayRef<int> Mask,
8286 SelectionDAG &DAG) {
8287 assert(VT.isInteger() && "Only supports integer vector types!");
8288 MVT EltVT = VT.getVectorElementType();
8289 int NumEltBits = EltVT.getSizeInBits();
8290 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8291 SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
8293 SmallVector<SDValue, 16> MaskOps;
8294 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8295 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8296 return SDValue(); // Shuffled input!
8297 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8300 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8301 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8302 // We have to cast V2 around.
8303 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8304 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8305 DAG.getBitcast(MaskVT, V1Mask),
8306 DAG.getBitcast(MaskVT, V2)));
8307 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
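// Illustrative sketch (not part of the lowering code): the per-element identity
// the bit blend above implements, Result = (V1 & M) | (V2 & ~M), where M is
// all-ones for elements taken from V1 and zero for elements taken from V2. The
// helper name is hypothetical.
static unsigned exampleBitBlendElement(unsigned FromV1, unsigned FromV2,
                                       bool TakeV1) {
  unsigned M = TakeV1 ? ~0u : 0u;
  return (FromV1 & M) | (FromV2 & ~M);
}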
8310 /// \brief Try to emit a blend instruction for a shuffle.
8312 /// This doesn't do any checks for the availability of instructions for blending
8313 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8314 /// be matched in the backend with the type given. What it does check for is
8315 /// that the shuffle mask is a blend, or convertible into a blend with zero.
8316 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8317 SDValue V2, ArrayRef<int> Original,
8318 const SmallBitVector &Zeroable,
8319 const X86Subtarget &Subtarget,
8320 SelectionDAG &DAG) {
8321 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8322 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8323 SmallVector<int, 8> Mask(Original.begin(), Original.end());
8324 bool ForceV1Zero = false, ForceV2Zero = false;
8326 // Attempt to generate the binary blend mask. If an input is zero then
8327 // we can use any lane.
8328 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8329 unsigned BlendMask = 0;
8330 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8336 if (M == i + Size) {
8337 BlendMask |= 1u << i;
8348 BlendMask |= 1u << i;
8353 return SDValue(); // Shuffled input!
8356 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
8358 V1 = getZeroVector(VT, Subtarget, DAG, DL);
8360 V2 = getZeroVector(VT, Subtarget, DAG, DL);
8362 auto ScaleBlendMask = [](unsigned BlendMask, int Size, int Scale) {
8363 unsigned ScaledMask = 0;
8364 for (int i = 0; i != Size; ++i)
8365 if (BlendMask & (1u << i))
8366 for (int j = 0; j != Scale; ++j)
8367 ScaledMask |= 1u << (i * Scale + j);
8371 switch (VT.SimpleTy) {
8376 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8377 DAG.getConstant(BlendMask, DL, MVT::i8));
8381 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8385 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8386 // that instruction.
8387 if (Subtarget.hasAVX2()) {
8388 // Scale the blend by the number of 32-bit dwords per element.
8389 int Scale = VT.getScalarSizeInBits() / 32;
8390 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8391 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8392 V1 = DAG.getBitcast(BlendVT, V1);
8393 V2 = DAG.getBitcast(BlendVT, V2);
8394 return DAG.getBitcast(
8395 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8396 DAG.getConstant(BlendMask, DL, MVT::i8)));
8400 // For integer shuffles we need to expand the mask and cast the inputs to
8401 // v8i16s prior to blending.
8402 int Scale = 8 / VT.getVectorNumElements();
8403 BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
8404 V1 = DAG.getBitcast(MVT::v8i16, V1);
8405 V2 = DAG.getBitcast(MVT::v8i16, V2);
8406 return DAG.getBitcast(VT,
8407 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8408 DAG.getConstant(BlendMask, DL, MVT::i8)));
8412 assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8413 SmallVector<int, 8> RepeatedMask;
8414 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8415 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8416 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8418 for (int i = 0; i < 8; ++i)
8419 if (RepeatedMask[i] >= 8)
8420 BlendMask |= 1u << i;
8421 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8422 DAG.getConstant(BlendMask, DL, MVT::i8));
8428 assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8429 "256-bit byte-blends require AVX2 support!");
8431 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8432 if (SDValue Masked =
8433 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8436 // Scale the blend by the number of bytes per element.
8437 int Scale = VT.getScalarSizeInBits() / 8;
8439 // This form of blend is always done on bytes. Compute the byte vector type.
8441 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8443 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8444 // mix of LLVM's code generator and the x86 backend. We tell the code
8445 // generator that boolean values in the elements of an x86 vector register
8446 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8447 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8448 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8449 // of the element (the remaining are ignored) and 0 in that high bit would
8450 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8451 // the LLVM model for boolean values in vector elements gets the relevant
8452 // bit set, it is set backwards and over constrained relative to x86's actual model.
8454 SmallVector<SDValue, 32> VSELECTMask;
8455 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8456 for (int j = 0; j < Scale; ++j)
8457 VSELECTMask.push_back(
8458 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8459 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8462 V1 = DAG.getBitcast(BlendVT, V1);
8463 V2 = DAG.getBitcast(BlendVT, V2);
8464 return DAG.getBitcast(
8465 VT, DAG.getNode(ISD::VSELECT, DL, BlendVT,
8466 DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2));
8470 llvm_unreachable("Not a supported integer vector type!");
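// Illustrative sketch (not part of the lowering code): the blend-mask scaling
// done by the ScaleBlendMask lambda above. A v4i64 blend mask is widened to
// the v8i32 mask VPBLENDD expects by repeating each bit Scale times. The
// helper name is hypothetical.
static unsigned exampleScaleBlendMask() {
  unsigned BlendMask = 0x5; // 0b0101: take elements 0 and 2 of a v4i64 from V2
  int Size = 4, Scale = 2;  // each i64 element covers two i32 dwords
  unsigned Scaled = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1u << i))
      for (int j = 0; j != Scale; ++j)
        Scaled |= 1u << (i * Scale + j);
  return Scaled; // 0x33 == 0b00110011
}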
8474 /// \brief Try to lower as a blend of elements from two inputs followed by
8475 /// a single-input permutation.
8477 /// This matches the pattern where we can blend elements from two inputs and
8478 /// then reduce the shuffle to a single-input permutation.
8479 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8480 SDValue V1, SDValue V2,
8482 SelectionDAG &DAG) {
8483 // We build up the blend mask while checking whether a blend is a viable way
8484 // to reduce the shuffle.
8485 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8486 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
8488 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8492 assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
8494 if (BlendMask[Mask[i] % Size] < 0)
8495 BlendMask[Mask[i] % Size] = Mask[i];
8496 else if (BlendMask[Mask[i] % Size] != Mask[i])
8497 return SDValue(); // Can't blend in the needed input!
8499 PermuteMask[i] = Mask[i] % Size;
8502 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8503 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
8506 /// \brief Generic routine to decompose a shuffle and blend into independent
8507 /// blends and permutes.
8509 /// This matches the extremely common pattern for handling combined
8510 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
8511 /// operations. It will try to pick the best arrangement of shuffles and blends.
8513 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
8517 SelectionDAG &DAG) {
8518 // Shuffle the input elements into the desired positions in V1 and V2 and
8519 // blend them together.
8520 SmallVector<int, 32> V1Mask(Mask.size(), -1);
8521 SmallVector<int, 32> V2Mask(Mask.size(), -1);
8522 SmallVector<int, 32> BlendMask(Mask.size(), -1);
8523 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8524 if (Mask[i] >= 0 && Mask[i] < Size) {
8525 V1Mask[i] = Mask[i];
8527 } else if (Mask[i] >= Size) {
8528 V2Mask[i] = Mask[i] - Size;
8529 BlendMask[i] = i + Size;
8532 // Try to lower with the simpler initial blend strategy unless one of the
8533 // input shuffles would be a no-op. We prefer to shuffle inputs as the
8534 // shuffle may be able to fold with a load or other benefit. However, when
8535 // we'll have to do 2x as many shuffles in order to achieve this, blending
8536 // first is a better strategy.
8537 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
8538 if (SDValue BlendPerm =
8539 lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
8542 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
8543 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
8544 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
8547 /// \brief Try to lower a vector shuffle as a rotation.
8549 /// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
8550 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
8551 ArrayRef<int> Mask) {
8552 int NumElts = Mask.size();
8554 // We need to detect various ways of spelling a rotation:
8555 // [11, 12, 13, 14, 15, 0, 1, 2]
8556 // [-1, 12, 13, 14, -1, -1, 1, -1]
8557 // [-1, -1, -1, -1, -1, -1, 1, 2]
8558 // [ 3, 4, 5, 6, 7, 8, 9, 10]
8559 // [-1, 4, 5, 6, -1, -1, 9, -1]
8560 // [-1, 4, 5, 6, -1, -1, -1, -1]
8563 for (int i = 0; i < NumElts; ++i) {
8565 assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
8566 "Unexpected mask index.");
8570 // Determine where a rotated vector would have started.
8571 int StartIdx = i - (M % NumElts);
8573 // The identity rotation isn't interesting, stop.
8576 // If we found the tail of a vector the rotation must be the missing
8577 // front. If we found the head of a vector, it must be how much of the vector we've rotated.
8579 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
8582 Rotation = CandidateRotation;
8583 else if (Rotation != CandidateRotation)
8584 // The rotations don't match, so we can't match this mask.
8587 // Compute which value this mask is pointing at.
8588 SDValue MaskV = M < NumElts ? V1 : V2;
8590 // Compute which of the two target values this index should be assigned
8591 // to. This reflects whether the high elements are remaining or the low
8592 // elements are remaining.
8593 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
8595 // Either set up this value if we've not encountered it before, or check
8596 // that it remains consistent.
8599 else if (TargetV != MaskV)
8600 // This may be a rotation, but it pulls from the inputs in some
8601 // unsupported interleaving.
8605 // Check that we successfully analyzed the mask, and normalize the results.
8606 assert(Rotation != 0 && "Failed to locate a viable rotation!");
8607 assert((Lo || Hi) && "Failed to find a rotated input vector!");
8619 /// \brief Try to lower a vector shuffle as a byte rotation.
8621 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
8622 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
8623 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
8624 /// try to generically lower a vector shuffle through such a pattern. It
8625 /// does not check for the profitability of lowering either as PALIGNR or
8626 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
8627 /// This matches shuffle vectors that look like:
8629 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
8631 /// Essentially it concatenates V1 and V2, shifts right by some number of
8632 /// elements, and takes the low elements as the result. Note that while this is
8633 /// specified as a *right shift* because x86 is little-endian, it is a *left
8634 /// rotate* of the vector lanes.
8635 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
8636 ArrayRef<int> Mask) {
8637 // Don't accept any shuffles with zero elements.
8638 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
8641 // PALIGNR works on 128-bit lanes.
8642 SmallVector<int, 16> RepeatedMask;
8643 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
8646 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
8650 // PALIGNR rotates bytes, so we need to scale the
8651 // rotation based on how many bytes are in the vector lane.
8652 int NumElts = RepeatedMask.size();
8653 int Scale = 16 / NumElts;
8654 return Rotation * Scale;
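// Illustrative sketch (not part of the lowering code): how an element rotation
// becomes the PALIGNR byte immediate. For the v8i16 mask documented above
// ([11, 12, 13, 14, 15, 0, 1, 2]) the element rotation is 3 and each element
// is 2 bytes, so PALIGNR uses an immediate of 6. The helper name is
// hypothetical.
static int exampleByteRotation() {
  int ElementRotation = 3; // as matched by matchVectorShuffleAsRotate
  int NumElts = 8;         // v8i16: eight elements per 128-bit lane
  int Scale = 16 / NumElts;
  return ElementRotation * Scale; // 6
}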
8657 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
8658 SDValue V1, SDValue V2,
8660 const X86Subtarget &Subtarget,
8661 SelectionDAG &DAG) {
8662 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
8664 SDValue Lo = V1, Hi = V2;
8665 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
8666 if (ByteRotation <= 0)
8669 // Cast the inputs to an i8 vector of the correct length to match PALIGNR or PSLLDQ/PSRLDQ.
8671 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8672 Lo = DAG.getBitcast(ByteVT, Lo);
8673 Hi = DAG.getBitcast(ByteVT, Hi);
8675 // SSSE3 targets can use the palignr instruction.
8676 if (Subtarget.hasSSSE3()) {
8677 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
8678 "512-bit PALIGNR requires BWI instructions");
8679 return DAG.getBitcast(
8680 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
8681 DAG.getConstant(ByteRotation, DL, MVT::i8)));
8684 assert(VT.is128BitVector() &&
8685 "Rotate-based lowering only supports 128-bit lowering!");
8686 assert(Mask.size() <= 16 &&
8687 "Can shuffle at most 16 bytes in a 128-bit vector!");
8688 assert(ByteVT == MVT::v16i8 &&
8689 "SSE2 rotate lowering only needed for v16i8!");
8691 // Default SSE2 implementation
8692 int LoByteShift = 16 - ByteRotation;
8693 int HiByteShift = ByteRotation;
8695 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
8696 DAG.getConstant(LoByteShift, DL, MVT::i8));
8697 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
8698 DAG.getConstant(HiByteShift, DL, MVT::i8));
8699 return DAG.getBitcast(VT,
8700 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
8703 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
8705 /// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
8706 /// rotation of the concatenation of two vectors; this routine will
8707 /// try to generically lower a vector shuffle through such a pattern.
8709 /// Essentially it concatenates V1 and V2, shifts right by some number of
8710 /// elements, and takes the low elements as the result. Note that while this is
8711 /// specified as a *right shift* because x86 is little-endian, it is a *left
8712 /// rotate* of the vector lanes.
8713 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
8714 SDValue V1, SDValue V2,
8716 const X86Subtarget &Subtarget,
8717 SelectionDAG &DAG) {
8718 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
8719 "Only 32-bit and 64-bit elements are supported!");
8721 // 128/256-bit vectors are only supported with VLX.
8722 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
8723 && "VLX required for 128/256-bit vectors");
8725 SDValue Lo = V1, Hi = V2;
8726 int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
8730 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
8731 DAG.getConstant(Rotation, DL, MVT::i8));
8734 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
8736 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
8737 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
8738 /// matches elements from one of the input vectors shuffled to the left or
8739 /// right with zeroable elements 'shifted in'. It handles both the strictly
8740 /// bit-wise element shifts and the byte shift across an entire 128-bit double quad word lane.
8743 /// PSLL : (little-endian) left bit shift.
8744 /// [ zz, 0, zz, 2 ]
8745 /// [ -1, 4, zz, -1 ]
8746 /// PSRL : (little-endian) right bit shift.
8748 /// [ -1, -1, 7, zz]
8749 /// PSLLDQ : (little-endian) left byte shift
8750 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
8751 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
8752 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
8753 /// PSRLDQ : (little-endian) right byte shift
8754 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
8755 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
8756 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
8757 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
8758 unsigned ScalarSizeInBits,
8759 ArrayRef<int> Mask, int MaskOffset,
8760 const SmallBitVector &Zeroable,
8761 const X86Subtarget &Subtarget) {
8762 int Size = Mask.size();
8763 unsigned SizeInBits = Size * ScalarSizeInBits;
8765 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
8766 for (int i = 0; i < Size; i += Scale)
8767 for (int j = 0; j < Shift; ++j)
8768 if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
8774 auto MatchShift = [&](int Shift, int Scale, bool Left) {
8775 for (int i = 0; i != Size; i += Scale) {
8776 unsigned Pos = Left ? i + Shift : i;
8777 unsigned Low = Left ? i : i + Shift;
8778 unsigned Len = Scale - Shift;
8779 if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
8783 int ShiftEltBits = ScalarSizeInBits * Scale;
8784 bool ByteShift = ShiftEltBits > 64;
8785 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
8786 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
8787 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
8789 // Normalize the scale for byte shifts to still produce an i64 element type.
8791 Scale = ByteShift ? Scale / 2 : Scale;
8793 // We need to round trip through the appropriate type for the shift.
8794 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
8795 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
8796 : MVT::getVectorVT(ShiftSVT, Size / Scale);
8797 return (int)ShiftAmt;
8800 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
8801 // keep doubling the size of the integer elements up to that. We can
8802 // then shift the elements of the integer vector by whole multiples of
8803 // their width within the elements of the larger integer vector. Test each
8804 // multiple to see if we can find a match with the moved element indices
8805 // and that the shifted in elements are all zeroable.
8806 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
8807 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
8808 for (int Shift = 1; Shift != Scale; ++Shift)
8809 for (bool Left : {true, false})
8810 if (CheckZeros(Shift, Scale, Left)) {
8811 int ShiftAmt = MatchShift(Shift, Scale, Left);
8820 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
8821 SDValue V2, ArrayRef<int> Mask,
8822 const SmallBitVector &Zeroable,
8823 const X86Subtarget &Subtarget,
8824 SelectionDAG &DAG) {
8825 int Size = Mask.size();
8826 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8832 // Try to match shuffle against V1 shift.
8833 int ShiftAmt = matchVectorShuffleAsShift(
8834 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
8836 // If V1 failed, try to match shuffle against V2 shift.
8839 matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
8840 Mask, Size, Zeroable, Subtarget);
8847 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
8848 "Illegal integer vector type");
8849 V = DAG.getBitcast(ShiftVT, V);
8850 V = DAG.getNode(Opcode, DL, ShiftVT, V,
8851 DAG.getConstant(ShiftAmt, DL, MVT::i8));
8852 return DAG.getBitcast(VT, V);
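// Illustrative sketch (not part of the lowering code): one concrete case the
// shift matcher above accepts. For a v4i32 shuffle with mask [1, 2, 3, zz]
// (zz meaning the element is zeroable), the result is the whole 128-bit value
// shifted right by one i32, i.e. VSRLDQ with a 4-byte immediate. The helper
// name is hypothetical.
static int exampleShiftImmediate() {
  int Shift = 1;             // shift by one element
  int ScalarSizeInBits = 32; // i32 elements
  return Shift * ScalarSizeInBits / 8; // byte-shift immediate: 4
}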
8855 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
8856 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
8857 SDValue V2, ArrayRef<int> Mask,
8858 const SmallBitVector &Zeroable,
8859 SelectionDAG &DAG) {
8860 int Size = Mask.size();
8861 int HalfSize = Size / 2;
8862 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
8863 assert(!Zeroable.all() && "Fully zeroable shuffle mask");
8865 // Upper half must be undefined.
8866 if (!isUndefInRange(Mask, HalfSize, HalfSize))
8869 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
8870 // Remainder of lower half result is zero and upper half is all undef.
8871 auto LowerAsEXTRQ = [&]() {
8872 // Determine the extraction length from the part of the
8873 // lower half that isn't zeroable.
8875 for (; Len > 0; --Len)
8876 if (!Zeroable[Len - 1])
8878 assert(Len > 0 && "Zeroable shuffle mask");
8880 // Attempt to match first Len sequential elements from the lower half.
8883 for (int i = 0; i != Len; ++i) {
8887 SDValue &V = (M < Size ? V1 : V2);
8890 // The extracted elements must start at a valid index and all mask
8891 // elements must be in the lower half.
8892 if (i > M || M >= HalfSize)
8895 if (Idx < 0 || (Src == V && Idx == (M - i))) {
8906 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
8907 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8908 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8909 return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src,
8910 DAG.getConstant(BitLen, DL, MVT::i8),
8911 DAG.getConstant(BitIdx, DL, MVT::i8));
8914 if (SDValue ExtrQ = LowerAsEXTRQ())
8917 // INSERTQ: Extract lowest Len elements from lower half of second source and
8918 // insert over first source, starting at Idx.
8919 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
8920 auto LowerAsInsertQ = [&]() {
8921 for (int Idx = 0; Idx != HalfSize; ++Idx) {
8924 // Attempt to match first source from mask before insertion point.
8925 if (isUndefInRange(Mask, 0, Idx)) {
8927 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
8929 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
8935 // Extend the extraction length looking to match both the insertion of
8936 // the second source and the remaining elements of the first.
8937 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
8942 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
8944 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
8950 // Match the remaining elements of the lower half.
8951 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
8953 } else if ((!Base || (Base == V1)) &&
8954 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
8956 } else if ((!Base || (Base == V2)) &&
8957 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
8964 // We may not have a base (first source) - this can safely be undefined.
8966 Base = DAG.getUNDEF(VT);
8968 int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
8969 int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
8970 return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert,
8971 DAG.getConstant(BitLen, DL, MVT::i8),
8972 DAG.getConstant(BitIdx, DL, MVT::i8));
8979 if (SDValue InsertQ = LowerAsInsertQ())
8985 /// \brief Lower a vector shuffle as a zero or any extension.
8987 /// Given a specific number of elements, element bit width, and extension
8988 /// stride, produce either a zero or any extension based on the available
8989 /// features of the subtarget. The extended elements are consecutive and
8990 /// can start from an offset element index in the input; to avoid excess
8991 /// shuffling, the offset must either be in the bottom lane or at the start
8992 /// of a higher lane. All extended elements must come from the same input vector.
8994 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
8995 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
8996 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
8997 assert(Scale > 1 && "Need a scale to extend.");
8998 int EltBits = VT.getScalarSizeInBits();
8999 int NumElements = VT.getVectorNumElements();
9000 int NumEltsPerLane = 128 / EltBits;
9001 int OffsetLane = Offset / NumEltsPerLane;
9002 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9003 "Only 8, 16, and 32 bit elements can be extended.");
9004 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9005 assert(0 <= Offset && "Extension offset must be positive.");
9006 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9007 "Extension offset must be in the first lane or start an upper lane.");
9009 // Check that an index is in same lane as the base offset.
9010 auto SafeOffset = [&](int Idx) {
9011 return OffsetLane == (Idx / NumEltsPerLane);
9014 // Shift along an input so that the offset base moves to the first element.
9015 auto ShuffleOffset = [&](SDValue V) {
9019 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9020 for (int i = 0; i * Scale < NumElements; ++i) {
9021 int SrcIdx = i + Offset;
9022 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9024 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9027 // Found a valid zext mask! Try various lowering strategies based on the
9028 // input type and available ISA extensions.
9029 if (Subtarget.hasSSE41()) {
9030 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9031 // PUNPCK will catch this in a later shuffle match.
9032 if (Offset && Scale == 2 && VT.is128BitVector())
9034 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9035 NumElements / Scale);
9036 InputV = ShuffleOffset(InputV);
9038 // For 256-bit vectors, we only need the lower (128-bit) input half.
9039 // For 512-bit vectors, we only need the lower input half or quarter.
9040 if (VT.getSizeInBits() > 128)
9041 InputV = extractSubVector(InputV, 0, DAG, DL,
9042 std::max(128, (int)VT.getSizeInBits() / Scale));
9044 InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
9045 return DAG.getBitcast(VT, InputV);
9048 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9050 // For any extends we can cheat for larger element sizes and use shuffle
9051 // instructions that can fold with a load and/or copy.
9052 if (AnyExt && EltBits == 32) {
9053 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9055 return DAG.getBitcast(
9056 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9057 DAG.getBitcast(MVT::v4i32, InputV),
9058 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9060 if (AnyExt && EltBits == 16 && Scale > 2) {
9061 int PSHUFDMask[4] = {Offset / 2, -1,
9062 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9063 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9064 DAG.getBitcast(MVT::v4i32, InputV),
9065 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9066 int PSHUFWMask[4] = {1, -1, -1, -1};
9067 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9068 return DAG.getBitcast(
9069 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9070 DAG.getBitcast(MVT::v8i16, InputV),
9071 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9074 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes to 64-bits.
9076 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9077 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9078 assert(VT.is128BitVector() && "Unexpected vector width!");
9080 int LoIdx = Offset * EltBits;
9081 SDValue Lo = DAG.getBitcast(
9082 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9083 DAG.getConstant(EltBits, DL, MVT::i8),
9084 DAG.getConstant(LoIdx, DL, MVT::i8)));
9086 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9087 !SafeOffset(Offset + 1))
9088 return DAG.getBitcast(VT, Lo);
9090 int HiIdx = (Offset + 1) * EltBits;
9091 SDValue Hi = DAG.getBitcast(
9092 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9093 DAG.getConstant(EltBits, DL, MVT::i8),
9094 DAG.getConstant(HiIdx, DL, MVT::i8)));
9095 return DAG.getBitcast(VT,
9096 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9099 // If this would require more than 2 unpack instructions to expand, use
9100 // pshufb when available. We can only use more than 2 unpack instructions
9101 // when zero extending i8 elements which also makes it easier to use pshufb.
9102 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9103 assert(NumElements == 16 && "Unexpected byte vector width!");
9104 SDValue PSHUFBMask[16];
9105 for (int i = 0; i < 16; ++i) {
9106 int Idx = Offset + (i / Scale);
9107 PSHUFBMask[i] = DAG.getConstant(
9108 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9110 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9111 return DAG.getBitcast(
9112 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9113 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9116 // If we are extending from an offset, ensure we start on a boundary that
9117 // we can unpack from.
9118 int AlignToUnpack = Offset % (NumElements / Scale);
9119 if (AlignToUnpack) {
9120 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9121 for (int i = AlignToUnpack; i < NumElements; ++i)
9122 ShMask[i - AlignToUnpack] = i;
9123 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9124 Offset -= AlignToUnpack;
9127 // Otherwise emit a sequence of unpacks.
9129 unsigned UnpackLoHi = X86ISD::UNPCKL;
9130 if (Offset >= (NumElements / 2)) {
9131 UnpackLoHi = X86ISD::UNPCKH;
9132 Offset -= (NumElements / 2);
9135 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9136 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9137 : getZeroVector(InputVT, Subtarget, DAG, DL);
9138 InputV = DAG.getBitcast(InputVT, InputV);
9139 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9143 } while (Scale > 1);
9144 return DAG.getBitcast(VT, InputV);
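// Illustrative sketch (not part of the lowering code): the unpack fallback
// above widens elements by interleaving them with zeros, doubling the element
// size each step. Zero-extending the low eight bytes of a v16i8 to v8i16 via
// PUNPCKLBW with a zero vector looks like this on plain arrays; the helper
// name is hypothetical.
static void exampleZExtByUnpack() {
  unsigned char In[16] = {1, 2,  3,  4,  5,  6,  7,  8,
                          9, 10, 11, 12, 13, 14, 15, 16};
  unsigned char Out[16];
  for (int i = 0; i != 8; ++i) {
    Out[2 * i] = In[i]; // low byte of the new 16-bit element
    Out[2 * i + 1] = 0; // zero interleaved from the zero vector
  }
  (void)Out; // reinterpreted as v8i16 this is zext of the low 8 bytes
}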
9147 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9149 /// This routine will try to do everything in its power to cleverly lower
9150 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9151 /// check for the profitability of this lowering, it tries to aggressively
9152 /// match this pattern. It will use all of the micro-architectural details it
9153 /// can to emit an efficient lowering. It handles both blends with all-zero
9154 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9155 /// masking out later).
9157 /// The reason we have dedicated lowering for zext-style shuffles is that they
9158 /// are both incredibly common and often quite performance sensitive.
9159 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9160 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9161 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9162 SelectionDAG &DAG) {
9163 int Bits = VT.getSizeInBits();
9164 int NumLanes = Bits / 128;
9165 int NumElements = VT.getVectorNumElements();
9166 int NumEltsPerLane = NumElements / NumLanes;
9167 assert(VT.getScalarSizeInBits() <= 32 &&
9168 "Exceeds 32-bit integer zero extension limit");
9169 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9171 // Define a helper function to check a particular ext-scale and lower to it if valid.
9173 auto Lower = [&](int Scale) -> SDValue {
9178 for (int i = 0; i < NumElements; ++i) {
9181 continue; // Valid anywhere but doesn't tell us anything.
9182 if (i % Scale != 0) {
9183 // Each of the extended elements need to be zeroable.
9187 // We are no longer in the anyext case.
9192 // Each of the base elements needs to be consecutive indices into the
9193 // same input vector.
9194 SDValue V = M < NumElements ? V1 : V2;
9195 M = M % NumElements;
9198 Offset = M - (i / Scale);
9199 } else if (InputV != V)
9200 return SDValue(); // Flip-flopping inputs.
9202 // Offset must start in the lowest 128-bit lane or at the start of an upper lane.
9204 // FIXME: Is it ever worth allowing a negative base offset?
9205 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9206 (Offset % NumEltsPerLane) == 0))
9209 // If we are offsetting, all referenced entries must come from the same lane.
9211 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9214 if ((M % NumElements) != (Offset + (i / Scale)))
9215 return SDValue(); // Non-consecutive strided elements.
9219 // If we fail to find an input, we have a zero-shuffle which should always
9220 // have already been handled.
9221 // FIXME: Maybe handle this here in case during blending we end up with one?
9225 // If we are offsetting, don't extend if we only match a single input, we
9226 // can always do better by using a basic PSHUF or PUNPCK.
9227 if (Offset != 0 && Matches < 2)
9230 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9231 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9234 // The widest scale possible for extending is to a 64-bit integer.
9235 assert(Bits % 64 == 0 &&
9236 "The number of bits in a vector must be divisible by 64 on x86!");
9237 int NumExtElements = Bits / 64;
9239 // Each iteration, try extending the elements half as much, but into twice as many elements.
9241 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9242 assert(NumElements % NumExtElements == 0 &&
9243 "The input vector size must be divisible by the extended size.");
9244 if (SDValue V = Lower(NumElements / NumExtElements))
9248 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9252 // Returns one of the source operands if the shuffle can be reduced to a
9253 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9254 auto CanZExtLowHalf = [&]() {
9255 for (int i = NumElements / 2; i != NumElements; ++i)
9258 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9260 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9265 if (SDValue V = CanZExtLowHalf()) {
9266 V = DAG.getBitcast(MVT::v2i64, V);
9267 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9268 return DAG.getBitcast(VT, V);
9271 // No viable ext lowering found.
9275 /// \brief Try to get a scalar value for a specific element of a vector.
9277 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9278 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9279 SelectionDAG &DAG) {
9280 MVT VT = V.getSimpleValueType();
9281 MVT EltVT = VT.getVectorElementType();
9282 V = peekThroughBitcasts(V);
9284 // If the bitcasts shift the element size, we can't extract an equivalent element from it.
9286 MVT NewVT = V.getSimpleValueType();
9287 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9290 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9291 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9292 // Ensure the scalar operand is the same size as the destination.
9293 // FIXME: Add support for scalar truncation where possible.
9294 SDValue S = V.getOperand(Idx);
9295 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9296 return DAG.getBitcast(EltVT, S);
9302 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9304 /// This is particularly important because the set of instructions varies
9305 /// significantly based on whether the operand is a load or not.
9306 static bool isShuffleFoldableLoad(SDValue V) {
9307 V = peekThroughBitcasts(V);
9308 return ISD::isNON_EXTLoad(V.getNode());
9311 /// \brief Try to lower insertion of a single element into a zero vector.
9313 /// This is a common pattern that we have especially efficient patterns to lower
9314 /// across all subtarget feature sets.
9315 static SDValue lowerVectorShuffleAsElementInsertion(
9316 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9317 const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
9318 SelectionDAG &DAG) {
9320 MVT EltVT = VT.getVectorElementType();
9323 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9325 bool IsV1Zeroable = true;
9326 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9327 if (i != V2Index && !Zeroable[i]) {
9328 IsV1Zeroable = false;
9332 // Check for a single input from a SCALAR_TO_VECTOR node.
9333 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9334 // all the smarts here sunk into that routine. However, the current
9335 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9336 // vector shuffle lowering is dead.
9337 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9339 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9340 // We need to zext the scalar if it is smaller than an i32.
9341 V2S = DAG.getBitcast(EltVT, V2S);
9342 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9343 // Using zext to expand a narrow element won't work for non-zero elements.
9348 // Zero-extend directly to i32.
9350 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9352 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9353 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9354 EltVT == MVT::i16) {
9355 // Either not inserting from the low element of the input or the input
9356 // element size is too small to use VZEXT_MOVL to clear the high bits.
9360 if (!IsV1Zeroable) {
9361 // If V1 can't be treated as a zero vector we have fewer options to lower
9362 // this. We can't support integer vectors or non-zero targets cheaply, and
9363 // the V1 elements can't be permuted in any way.
9364 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9365 if (!VT.isFloatingPoint() || V2Index != 0)
9367 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9368 V1Mask[V2Index] = -1;
9369 if (!isNoopShuffleMask(V1Mask))
9371 // This is essentially a special case blend operation, but if we have
9372 // general purpose blend operations, they are always faster. Bail and let
9373 // the rest of the lowering handle these as blends.
9374 if (Subtarget.hasSSE41())
9375 return SDValue();
9377 // Otherwise, use MOVSD or MOVSS.
9378 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9379 "Only two types of floating point element types to handle!");
9380 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9381                    ExtVT, V1, V2);
9382 }
9384 // This lowering only works for the low element with floating point vectors.
9385 if (VT.isFloatingPoint() && V2Index != 0)
9386 return SDValue();
9388 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9389 if (ExtVT != VT)
9390 V2 = DAG.getBitcast(VT, V2);
9392 if (V2Index != 0) {
9393 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9394 // the desired position. Otherwise it is more efficient to do a vector
9395 // shift left. We know that we can do a vector shift left because all
9396 // the inputs are zero.
9397 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9398 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9399 V2Shuffle[V2Index] = 0;
9400 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9401 } else {
9402 V2 = DAG.getBitcast(MVT::v16i8, V2);
9403 V2 = DAG.getNode(
9404 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9405 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9406 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9407 DAG.getDataLayout(), VT)));
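// For example, inserting into element 3 of a v8i16 vector shifts the
// zero-extended scalar up by 3 * 16 / 8 = 6 bytes, i.e. a byte shift of 6
// on the v16i8 view of the register.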
9408 V2 = DAG.getBitcast(VT, V2);
9409 }
9410 }
9412 return V2;
9413 }
9414 /// Try to lower broadcast of a single - truncated - integer element,
9415 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9417 /// This assumes we have AVX2.
9418 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9419 SDValue V0, int BroadcastIdx,
9420 const X86Subtarget &Subtarget,
9421 SelectionDAG &DAG) {
9422 assert(Subtarget.hasAVX2() &&
9423 "We can only lower integer broadcasts with AVX2!");
9425 EVT EltVT = VT.getVectorElementType();
9426 EVT V0VT = V0.getValueType();
9428 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9429 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9431 EVT V0EltVT = V0VT.getVectorElementType();
9432 if (!V0EltVT.isInteger())
9433 return SDValue();
9435 const unsigned EltSize = EltVT.getSizeInBits();
9436 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9438 // This is only a truncation if the original element type is larger.
9439 if (V0EltSize <= EltSize)
9440 return SDValue();
9442 assert(((V0EltSize % EltSize) == 0) &&
9443 "Scalar type sizes must all be powers of 2 on x86!");
9445 const unsigned V0Opc = V0.getOpcode();
9446 const unsigned Scale = V0EltSize / EltSize;
9447 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9449 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9450 V0Opc != ISD::BUILD_VECTOR)
9451 return SDValue();
9453 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9455 // If we're extracting non-least-significant bits, shift so we can truncate.
9456 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9457 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9458 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9459 if (const int OffsetIdx = BroadcastIdx % Scale)
9460 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9461 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
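// Worked example: broadcasting i8 element 5 out of a v4i32 build_vector has
// Scale = 4, so the value lives in byte 1 of scalar operand 1; that scalar is
// shifted right by 1 * 8 bits before being truncated and broadcast.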
9463 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9464 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9465 }
9467 /// \brief Try to lower broadcast of a single element.
9469 /// For convenience, this code also bundles all of the subtarget feature set
9470 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9471 /// a convenient way to factor it out.
9472 /// FIXME: This is very similar to LowerVectorBroadcast - can we merge them?
9473 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9474 SDValue V1, SDValue V2,
9475 ArrayRef<int> Mask,
9476 const X86Subtarget &Subtarget,
9477 SelectionDAG &DAG) {
9478 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9479 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9480 (Subtarget.hasAVX2() && VT.isInteger())))
9481 return SDValue();
9483 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9484 // we can only broadcast from a register with AVX2.
9485 unsigned NumElts = Mask.size();
9486 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9487 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9489 // Check that the mask is a broadcast.
9490 int BroadcastIdx = -1;
9491 for (int i = 0; i != (int)NumElts; ++i) {
9492 SmallVector<int, 8> BroadcastMask(NumElts, i);
9493 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9494 BroadcastIdx = i;
9495 break;
9496 }
9497 }
9499 if (BroadcastIdx < 0)
9500 return SDValue();
9501 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
9502 "a sorted mask where the broadcast "
9503 "comes from V1.");
9505 // Go up the chain of (vector) values to find a scalar load that we can
9506 // combine with the broadcast.
9507 SDValue V = V1;
9508 for (;;) {
9509 switch (V.getOpcode()) {
9510 case ISD::BITCAST: {
9511 SDValue VSrc = V.getOperand(0);
9512 MVT SrcVT = VSrc.getSimpleValueType();
9513 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
9514 break;
9515 V = VSrc;
9516 continue;
9517 }
9518 case ISD::CONCAT_VECTORS: {
9519 int OperandSize = Mask.size() / V.getNumOperands();
9520 V = V.getOperand(BroadcastIdx / OperandSize);
9521 BroadcastIdx %= OperandSize;
9522 continue;
9523 }
9524 case ISD::INSERT_SUBVECTOR: {
9525 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
9526 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
9527 if (!ConstantIdx)
9528 break;
9530 int BeginIdx = (int)ConstantIdx->getZExtValue();
9531 int EndIdx =
9532     BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
9533 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
9534 BroadcastIdx -= BeginIdx;
9535 V = VInner;
9536 } else {
9537 V = VOuter;
9538 }
9539 continue;
9540 }
9541 }
9542 break;
9543 }
9545 // Check if this is a broadcast of a scalar. We special case lowering
9546 // for scalars so that we can more effectively fold with loads.
9547 // First, look through bitcast: if the original value has a larger element
9548 // type than the shuffle, the broadcast element is in essence truncated.
9549 // Make that explicit to ease folding.
9550 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
9551 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
9552 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
9553 return TruncBroadcast;
9555 MVT BroadcastVT = VT;
9557 // Peek through any bitcast (only useful for loads).
9558 SDValue BC = peekThroughBitcasts(V);
9560 // Also check the simpler case, where we can directly reuse the scalar.
9561 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9562 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
9563 V = V.getOperand(BroadcastIdx);
9565 // If we can't broadcast from a register, check that the input is a load.
9566 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
9567 return SDValue();
9568 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
9569 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9570 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
9571 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
9572 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
9573 }
9575 // If we are broadcasting a load that is only used by the shuffle
9576 // then we can reduce the vector load to the broadcasted scalar load.
9577 LoadSDNode *Ld = cast<LoadSDNode>(BC);
9578 SDValue BaseAddr = Ld->getOperand(1);
9579 EVT SVT = BroadcastVT.getScalarType();
9580 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
9581 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
9582 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
9583 DAG.getMachineFunction().getMachineMemOperand(
9584 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
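// For example, broadcasting lane 3 of a v4f32 load turns the 16-byte vector
// load into a 4-byte f32 load at BaseAddr + 12 that the broadcast can fold.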
9586 // Make sure the newly-created LOAD is in the same position as Ld in
9587 // terms of dependency. We create a TokenFactor for Ld and V,
9588 // and update uses of Ld's output chain to use the TokenFactor.
9589 if (Ld->hasAnyUseOfValue(1)) {
9590 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
9591 SDValue(Ld, 1), SDValue(V.getNode(), 1));
9592 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
9593 DAG.UpdateNodeOperands(NewChain.getNode(), SDValue(Ld, 1),
9594 SDValue(V.getNode(), 1));
9595 }
9596 } else if (!BroadcastFromReg) {
9597 // We can't broadcast from a vector register.
9598 return SDValue();
9599 } else if (BroadcastIdx != 0) {
9600 // We can only broadcast from the zero-element of a vector register,
9601 // but it can be advantageous to broadcast from the zero-element of a
9602 // subvector.
9603 if (!VT.is256BitVector() && !VT.is512BitVector())
9604 return SDValue();
9606 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
9607 if (VT == MVT::v4f64 || VT == MVT::v4i64)
9608 return SDValue();
9610 // Only broadcast the zero-element of a 128-bit subvector.
9611 unsigned EltSize = VT.getScalarSizeInBits();
9612 if (((BroadcastIdx * EltSize) % 128) != 0)
9613 return SDValue();
9615 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
9616 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
9617 DAG.getIntPtrConstant(BroadcastIdx, DL));
9618 }
9620 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
9621 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
9622 DAG.getBitcast(MVT::f64, V));
9624 // Bitcast back to the same scalar type as BroadcastVT.
9625 MVT SrcVT = V.getSimpleValueType();
9626 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
9627 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
9628 "Unexpected vector element size");
9629 if (SrcVT.isVector()) {
9630 unsigned NumSrcElts = SrcVT.getVectorNumElements();
9631 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
9632 } else {
9633 SrcVT = BroadcastVT.getScalarType();
9634 }
9635 V = DAG.getBitcast(SrcVT, V);
9636 }
9638 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
9639 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
9640 V = DAG.getBitcast(MVT::f64, V);
9641 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
9642 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
9643 }
9645 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
9646 }
9648 // Check for whether we can use INSERTPS to perform the shuffle. We only use
9649 // INSERTPS when the V1 elements are already in the correct locations
9650 // because otherwise we can just always use two SHUFPS instructions which
9651 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
9652 // perform INSERTPS if a single V1 element is out of place and all V2
9653 // elements are zeroable.
9654 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
9655 unsigned &InsertPSMask,
9656 const SmallBitVector &Zeroable,
9657 ArrayRef<int> Mask,
9658 SelectionDAG &DAG) {
9659 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
9660 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
9661 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
9663 // Attempt to match INSERTPS with one element from VA or VB being
9664 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
9665 // will be updated.
9666 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
9667 ArrayRef<int> CandidateMask) {
9668 unsigned ZMask = 0;
9669 int VADstIndex = -1;
9670 int VBDstIndex = -1;
9671 bool VAUsedInPlace = false;
9673 for (int i = 0; i < 4; ++i) {
9674 // Synthesize a zero mask from the zeroable elements (includes undefs).
9675 if (Zeroable[i]) {
9676 ZMask |= 1 << i;
9677 continue;
9678 }
9680 // Flag if we use any VA inputs in place.
9681 if (i == CandidateMask[i]) {
9682 VAUsedInPlace = true;
9683 continue;
9684 }
9686 // We can only insert a single non-zeroable element.
9687 if (VADstIndex >= 0 || VBDstIndex >= 0)
9688 return false;
9690 if (CandidateMask[i] < 4) {
9691 // VA input out of place for insertion.
9692 VADstIndex = i;
9693 } else {
9694 // VB input for insertion.
9695 VBDstIndex = i;
9696 }
9697 }
9699 // Don't bother if we have no (non-zeroable) element for insertion.
9700 if (VADstIndex < 0 && VBDstIndex < 0)
9701 return false;
9703 // Determine element insertion src/dst indices. The src index is from the
9704 // start of the inserted vector, not the start of the concatenated vector.
9705 unsigned VBSrcIndex = 0;
9706 if (VADstIndex >= 0) {
9707 // If we have a VA input out of place, we use VA as the V2 element
9708 // insertion and don't use the original V2 at all.
9709 VBSrcIndex = CandidateMask[VADstIndex];
9710 VBDstIndex = VADstIndex;
9711 VB = VA;
9712 } else {
9713 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
9714 }
9716 // If no V1 inputs are used in place, then the result is created only from
9717 // the zero mask and the V2 insertion - so remove V1 dependency.
9718 if (!VAUsedInPlace)
9719 VA = DAG.getUNDEF(MVT::v4f32);
9721 // Update V1, V2 and InsertPSMask accordingly.
9722 V1 = VA;
9723 V2 = VB;
9725 // Insert the V2 element into the desired position.
9726 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
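// The INSERTPS immediate packs the source element index into bits [7:6], the
// destination lane into bits [5:4], and the zero mask into bits [3:0]. For
// example, VBSrcIndex = 2, VBDstIndex = 1 and ZMask = 0b1000 give 0x98.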
9727 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
9728 return true;
9729 };
9731 if (matchAsInsertPS(V1, V2, Mask))
9732 return true;
9734 // Commute and try again.
9735 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
9736 ShuffleVectorSDNode::commuteMask(CommutedMask);
9737 if (matchAsInsertPS(V2, V1, CommutedMask))
9738 return true;
9740 return false;
9741 }
9743 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
9744 SDValue V2, ArrayRef<int> Mask,
9745 const SmallBitVector &Zeroable,
9746 SelectionDAG &DAG) {
9747 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9748 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
9750 // Attempt to match the insertps pattern.
9751 unsigned InsertPSMask;
9752 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
9753 return SDValue();
9755 // Insert the V2 element into the desired position.
9756 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
9757 DAG.getConstant(InsertPSMask, DL, MVT::i8));
9758 }
9760 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
9761 /// UNPCK instruction.
9763 /// This specifically targets cases where we end up with alternating between
9764 /// the two inputs, and so can permute them into something that feeds a single
9765 /// UNPCK instruction. Note that this routine only targets integer vectors
9766 /// because for floating point vectors we have a generalized SHUFPS lowering
9767 /// strategy that handles everything that doesn't *exactly* match an unpack,
9768 /// making this clever lowering unnecessary.
9769 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
9770 SDValue V1, SDValue V2,
9771 ArrayRef<int> Mask,
9772 SelectionDAG &DAG) {
9773 assert(!VT.isFloatingPoint() &&
9774 "This routine only supports integer vectors.");
9775 assert(VT.is128BitVector() &&
9776 "This routine only works on 128-bit vectors.");
9777 assert(!V2.isUndef() &&
9778 "This routine should only be used when blending two inputs.");
9779 assert(Mask.size() >= 2 && "Single element masks are invalid.");
9781 int Size = Mask.size();
9783 int NumLoInputs =
9784     count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
9785 int NumHiInputs =
9786     count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
9788 bool UnpackLo = NumLoInputs >= NumHiInputs;
9790 auto TryUnpack = [&](int ScalarSize, int Scale) {
9791 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
9792 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
9794 for (int i = 0; i < Size; ++i) {
9795 if (Mask[i] < 0)
9796 continue;
9798 // Each element of the unpack contains Scale elements from this mask.
9799 int UnpackIdx = i / Scale;
9801 // We only handle the case where V1 feeds the first slots of the unpack.
9802 // We rely on canonicalization to ensure this is the case.
9803 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
9804 return SDValue();
9806 // Setup the mask for this input. The indexing is tricky as we have to
9807 // handle the unpack stride.
9808 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
9809 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
9810     Mask[i] % Size;
9811 }
9813 // If we will have to shuffle both inputs to use the unpack, check whether
9814 // we can just unpack first and shuffle the result. If so, skip this unpack.
9815 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
9816 !isNoopShuffleMask(V2Mask))
9817 return SDValue();
9819 // Shuffle the inputs into place.
9820 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9821 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9823 // Cast the inputs to the type we will use to unpack them.
9824 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
9825 V1 = DAG.getBitcast(UnpackVT, V1);
9826 V2 = DAG.getBitcast(UnpackVT, V2);
9828 // Unpack the inputs and cast the result back to the desired type.
9829 return DAG.getBitcast(
9830     VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
9831                     UnpackVT, V1, V2));
9832 };
9834 // We try each unpack from the largest to the smallest to try and find one
9835 // that fits this mask.
9836 int OrigScalarSize = VT.getScalarSizeInBits();
9837 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
9838 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
9839 return Unpack;
9841 // If none of the unpack-rooted lowerings worked (or were profitable) try an
9842 // initial unpack followed by a permute.
9843 if (NumLoInputs == 0 || NumHiInputs == 0) {
9844 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
9845 "We have to have *some* inputs!");
9846 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
9848 // FIXME: We could consider the total complexity of the permute of each
9849 // possible unpacking. Or at the least we should consider how many
9850 // half-crossings are created.
9851 // FIXME: We could consider commuting the unpacks.
9853 SmallVector<int, 32> PermMask((unsigned)Size, -1);
9854 for (int i = 0; i < Size; ++i) {
9855 if (Mask[i] < 0)
9856 continue;
9858 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
9860 PermMask[i] =
9861     2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
9862 }
9863 return DAG.getVectorShuffle(
9864 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
9865                     DL, VT, V1, V2),
9866 DAG.getUNDEF(VT), PermMask);
9867 }
9869 return SDValue();
9870 }
9872 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
9874 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
9875 /// support for floating point shuffles but not integer shuffles. These
9876 /// instructions will incur a domain crossing penalty on some chips though so
9877 /// it is better to avoid lowering through this for integer vectors where
9878 /// possible.
9879 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9880 const SmallBitVector &Zeroable,
9881 SDValue V1, SDValue V2,
9882 const X86Subtarget &Subtarget,
9883 SelectionDAG &DAG) {
9884 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9885 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
9886 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9888 if (V2.isUndef()) {
9889 // Check for being able to broadcast a single element.
9890 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9891 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
9892 return Broadcast;
9894 // Straight shuffle of a single input vector. Simulate this by using the
9895 // single input as both of the "inputs" to this instruction..
9896 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
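// The SHUFPD immediate selects a source element per result lane: bit 0 picks
// the low result element from the first operand, bit 1 picks the high result
// element from the second operand. E.g. Mask = {1, 0} gives imm 0b01.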
9898 if (Subtarget.hasAVX()) {
9899 // If we have AVX, we can use VPERMILPS which will allow folding a load
9900 // into the shuffle.
9901 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
9902 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9903 }
9905 return DAG.getNode(
9906 X86ISD::SHUFP, DL, MVT::v2f64,
9907 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9908 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
9909 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9910 }
9911 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
9912 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
9914 // If we have a single input, insert that into V1 if we can do so cheaply.
9915 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
9916 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9917 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
9918 return Insertion;
9919 // Try inverting the insertion since for v2 masks it is easy to do and we
9920 // can't reliably sort the mask one way or the other.
9921 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
9922 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
9923 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
9924 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
9925 return Insertion;
9926 }
9928 // Try to use one of the special instruction patterns to handle two common
9929 // blend patterns if a zero-blend above didn't work.
9930 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
9931 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
9932 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
9933 // We can either use a special instruction to load over the low double or
9934 // to move just the low double.
9935 return DAG.getNode(
9936     isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
9937     DL, MVT::v2f64, V2,
9938 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
9940 if (Subtarget.hasSSE41())
9941 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
9942 Zeroable, Subtarget, DAG))
9943 return Blend;
9945 // Use dedicated unpack instructions for masks that match their pattern.
9946 if (SDValue V =
9947         lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
9948 return V;
9950 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
9951 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
9952 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
9953 }
9955 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
9957 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
9958 /// the integer unit to minimize domain crossing penalties. However, for blends
9959 /// it falls back to the floating point shuffle operation with appropriate bit
9960 /// casting.
9961 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
9962 const SmallBitVector &Zeroable,
9963 SDValue V1, SDValue V2,
9964 const X86Subtarget &Subtarget,
9965 SelectionDAG &DAG) {
9966 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9967 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
9968 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
9970 if (V2.isUndef()) {
9971 // Check for being able to broadcast a single element.
9972 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
9973 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
9974 return Broadcast;
9976 // Straight shuffle of a single input vector. For everything from SSE2
9977 // onward this has a single fast instruction with no scary immediates.
9978 // We have to map the mask as it is actually a v4i32 shuffle instruction.
9979 V1 = DAG.getBitcast(MVT::v4i32, V1);
9980 int WidenedMask[4] = {
9981 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
9982 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
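// For example, a v2i64 mask of {1, 1} widens to the v4i32 PSHUFD mask
// {2, 3, 2, 3}, i.e. both result quadwords take dwords 2 and 3 of the input.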
9983 return DAG.getBitcast(
9985 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
9986 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
9987 }
9988 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
9989 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
9990 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
9991 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
9993 // If we have a blend of two same-type PACKUS operations and the blend aligns
9994 // with the low and high halves, we can just merge the PACKUS operations.
9995 // This is particularly important as it lets us merge shuffles that this
9996 // routine itself creates.
9997 auto GetPackNode = [](SDValue V) {
9998 V = peekThroughBitcasts(V);
9999 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10001 if (SDValue V1Pack = GetPackNode(V1))
10002 if (SDValue V2Pack = GetPackNode(V2)) {
10003 EVT PackVT = V1Pack.getValueType();
10004 if (PackVT == V2Pack.getValueType())
10005 return DAG.getBitcast(MVT::v2i64,
10006 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10007 Mask[0] == 0 ? V1Pack.getOperand(0)
10008 : V1Pack.getOperand(1),
10009 Mask[1] == 2 ? V2Pack.getOperand(0)
10010 : V2Pack.getOperand(1)));
10011 }
10013 // Try to use shift instructions.
10014 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10015 Zeroable, Subtarget, DAG))
10016 return Shift;
10018 // When loading a scalar and then shuffling it into a vector we can often do
10019 // the insertion cheaply.
10020 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10021 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10022 return Insertion;
10023 // Try inverting the insertion since for v2 masks it is easy to do and we
10024 // can't reliably sort the mask one way or the other.
10025 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10026 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10027 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10028 return Insertion;
10030 // We have different paths for blend lowering, but they all must use the
10031 // *exact* same predicate.
10032 bool IsBlendSupported = Subtarget.hasSSE41();
10033 if (IsBlendSupported)
10034 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10035 Zeroable, Subtarget, DAG))
10036 return Blend;
10038 // Use dedicated unpack instructions for masks that match their pattern.
10039 if (SDValue V =
10040         lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10041 return V;
10043 // Try to use byte rotation instructions.
10044 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10045 if (Subtarget.hasSSSE3())
10046 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10047 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10048 return Rotate;
10050 // If we have direct support for blends, we should lower by decomposing into
10051 // a permute. That will be faster than the domain cross.
10052 if (IsBlendSupported)
10053 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10054                                                    Mask, DAG);
10056 // We implement this with SHUFPD which is pretty lame because it will likely
10057 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10058 // However, all the alternatives are still more cycles and newer chips don't
10059 // have this problem. It would be really nice if x86 had better shuffles here.
10060 V1 = DAG.getBitcast(MVT::v2f64, V1);
10061 V2 = DAG.getBitcast(MVT::v2f64, V2);
10062 return DAG.getBitcast(MVT::v2i64,
10063 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10064 }
10066 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10068 /// This is used to disable more specialized lowerings when the shufps lowering
10069 /// will happen to be efficient.
10070 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10071 // This routine only handles 128-bit shufps.
10072 assert(Mask.size() == 4 && "Unsupported mask size!");
10073 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10074 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10075 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10076 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10078 // To lower with a single SHUFPS we need to have the low half and high half
10079 // each requiring a single input.
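// For example, {0, 1, 4, 5} can be done with one SHUFPS (low half from V1,
// high half from V2), while {0, 4, 1, 5} cannot because each half mixes
// elements from both inputs.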
10080 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10081 return false;
10082 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10083 return false;
10085 return true;
10086 }
10088 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10090 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10091 /// It makes no assumptions about whether this is the *best* lowering, it simply
10092 /// uses it.
10093 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10094 ArrayRef<int> Mask, SDValue V1,
10095 SDValue V2, SelectionDAG &DAG) {
10096 SDValue LowV = V1, HighV = V2;
10097 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10099 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10101 if (NumV2Elements == 1) {
10102 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10104 // Compute the index adjacent to V2Index and in the same half by toggling
10105 // the low bit.
10106 int V2AdjIndex = V2Index ^ 1;
10108 if (Mask[V2AdjIndex] < 0) {
10109 // Handles all the cases where we have a single V2 element and an undef.
10110 // This will only ever happen in the high lanes because we commute the
10111 // vector otherwise.
10112 if (V2Index < 2)
10113 std::swap(LowV, HighV);
10114 NewMask[V2Index] -= 4;
10115 } else {
10116 // Handle the case where the V2 element ends up adjacent to a V1 element.
10117 // To make this work, blend them together as the first step.
10118 int V1Index = V2AdjIndex;
10119 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10120 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10121 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10123 // Now proceed to reconstruct the final blend as we have the necessary
10124 // high or low half formed.
10125 if (V2Index < 2) {
10126 LowV = V2;
10127 HighV = V1;
10128 } else {
10129 HighV = V2;
10130 }
10131 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10132 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10133 }
10134 } else if (NumV2Elements == 2) {
10135 if (Mask[0] < 4 && Mask[1] < 4) {
10136 // Handle the easy case where we have V1 in the low lanes and V2 in the
10137 // high lanes.
10138 NewMask[2] -= 4;
10139 NewMask[3] -= 4;
10140 } else if (Mask[2] < 4 && Mask[3] < 4) {
10141 // We also handle the reversed case because this utility may get called
10142 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10143 // arrange things in the right direction.
10144 NewMask[0] -= 4;
10145 NewMask[1] -= 4;
10146 HighV = V1;
10147 LowV = V2;
10148 } else {
10149 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10150 // trying to place elements directly, just blend them and set up the final
10151 // shuffle to place them.
10153 // The first two blend mask elements are for V1, the second two are for
10154 // V2.
10155 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10156 Mask[2] < 4 ? Mask[2] : Mask[3],
10157 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10158 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
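// Illustrative example: for Mask = {0, 4, 1, 6} this builds
// BlendMask = {0, 1, 0, 2}, so the SHUFPS below produces
// [V1[0], V1[1], V2[0], V2[2]], which the final shuffle then reorders.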
10159 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10160 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10162 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10163 // the shuffle.
10164 LowV = HighV = V1;
10165 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10166 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10167 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10168 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10169 }
10170 }
10171 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10172 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10173 }
10175 /// \brief Lower 4-lane 32-bit floating point shuffles.
10177 /// Uses instructions exclusively from the floating point unit to minimize
10178 /// domain crossing penalties, as these are sufficient to implement all v4f32
10179 /// shuffles.
10180 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10181 const SmallBitVector &Zeroable,
10182 SDValue V1, SDValue V2,
10183 const X86Subtarget &Subtarget,
10184 SelectionDAG &DAG) {
10185 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10186 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10187 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10189 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10191 if (NumV2Elements == 0) {
10192 // Check for being able to broadcast a single element.
10193 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10194 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10195 return Broadcast;
10197 // Use even/odd duplicate instructions for masks that match their pattern.
10198 if (Subtarget.hasSSE3()) {
10199 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10200 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10201 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10202 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10203 }
10205 if (Subtarget.hasAVX()) {
10206 // If we have AVX, we can use VPERMILPS which will allow folding a load
10207 // into the shuffle.
10208 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10209 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10210 }
10212 // Otherwise, use a straight shuffle of a single input vector. We pass the
10213 // input vector to both operands to simulate this with a SHUFPS.
10214 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10215 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10216 }
10218 // There are special ways we can lower some single-element blends. However, we
10219 // have custom ways we can lower more complex single-element blends below that
10220 // we defer to if both this and BLENDPS fail to match, so restrict this to
10221 // when the V2 input is targeting element 0 of the mask -- that is the fast
10222 // case here.
10223 if (NumV2Elements == 1 && Mask[0] >= 4)
10224 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10225 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10226 return V;
10228 if (Subtarget.hasSSE41()) {
10229 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10230 Zeroable, Subtarget, DAG))
10231 return Blend;
10233 // Use INSERTPS if we can complete the shuffle efficiently.
10234 if (SDValue V =
10235         lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10236 return V;
10238 if (!isSingleSHUFPSMask(Mask))
10239 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10240 DL, MVT::v4f32, V1, V2, Mask, DAG))
10241 return BlendPerm;
10243 }
10244 // Use low/high mov instructions.
10245 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10246 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10247 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10248 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10250 // Use dedicated unpack instructions for masks that match their pattern.
10251 if (SDValue V =
10252         lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10253 return V;
10255 // Otherwise fall back to a SHUFPS lowering strategy.
10256 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10257 }
10259 /// \brief Lower 4-lane i32 vector shuffles.
10261 /// We try to handle these with integer-domain shuffles where we can, but for
10262 /// blends we use the floating point domain blend instructions.
10263 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10264 const SmallBitVector &Zeroable,
10265 SDValue V1, SDValue V2,
10266 const X86Subtarget &Subtarget,
10267 SelectionDAG &DAG) {
10268 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10269 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10270 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10272 // Whenever we can lower this as a zext, that instruction is strictly faster
10273 // than any alternative. It also allows us to fold memory operands into the
10274 // shuffle in many cases.
10275 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10276 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10277 return ZExt;
10279 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10281 if (NumV2Elements == 0) {
10282 // Check for being able to broadcast a single element.
10283 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10284 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10285 return Broadcast;
10287 // Straight shuffle of a single input vector. For everything from SSE2
10288 // onward this has a single fast instruction with no scary immediates.
10289 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10290 // but we aren't actually going to use the UNPCK instruction because doing
10291 // so prevents folding a load into this instruction or making a copy.
10292 const int UnpackLoMask[] = {0, 0, 1, 1};
10293 const int UnpackHiMask[] = {2, 2, 3, 3};
10294 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10295 Mask = UnpackLoMask;
10296 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10297 Mask = UnpackHiMask;
10299 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10300 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10301 }
10303 // Try to use shift instructions.
10304 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10305 Zeroable, Subtarget, DAG))
10306 return Shift;
10308 // There are special ways we can lower some single-element blends.
10309 if (NumV2Elements == 1)
10310 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10311 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10312 return V;
10314 // We have different paths for blend lowering, but they all must use the
10315 // *exact* same predicate.
10316 bool IsBlendSupported = Subtarget.hasSSE41();
10317 if (IsBlendSupported)
10318 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10319 Zeroable, Subtarget, DAG))
10320 return Blend;
10322 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10323                                                  Zeroable, DAG))
10324 return Masked;
10326 // Use dedicated unpack instructions for masks that match their pattern.
10327 if (SDValue V =
10328         lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10329 return V;
10331 // Try to use byte rotation instructions.
10332 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10333 if (Subtarget.hasSSSE3())
10334 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10335 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10336 return Rotate;
10338 // Assume that a single SHUFPS is faster than an alternative sequence of
10339 // multiple instructions (even if the CPU has a domain penalty).
10340 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10341 if (!isSingleSHUFPSMask(Mask)) {
10342 // If we have direct support for blends, we should lower by decomposing into
10343 // a permute. That will be faster than the domain cross.
10344 if (IsBlendSupported)
10345 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10346                                                    Mask, DAG);
10348 // Try to lower by permuting the inputs into an unpack instruction.
10349 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10350 DL, MVT::v4i32, V1, V2, Mask, DAG))
10351 return Unpack;
10352 }
10354 // We implement this with SHUFPS because it can blend from two vectors.
10355 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10356 // up the inputs, bypassing domain shift penalties that we would incur if we
10357 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10358 // relevant.
10359 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10360 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10361 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10362 return DAG.getBitcast(MVT::v4i32, ShufPS);
10363 }
10365 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10366 /// shuffle lowering, and the most complex part.
10368 /// The lowering strategy is to try to form pairs of input lanes which are
10369 /// targeted at the same half of the final vector, and then use a dword shuffle
10370 /// to place them onto the right half, and finally unpack the paired lanes into
10371 /// their final position.
10373 /// The exact breakdown of how to form these dword pairs and align them on the
10374 /// correct sides is really tricky. See the comments within the function for
10375 /// more of the details.
10377 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10378 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10379 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10380 /// vector, form the analogous 128-bit 8-element Mask.
10381 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10382 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10383 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10384 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10385 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10387 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10388 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10389 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10391 SmallVector<int, 4> LoInputs;
10392 std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
10393 [](int M) { return M >= 0; });
10394 std::sort(LoInputs.begin(), LoInputs.end());
10395 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10396 SmallVector<int, 4> HiInputs;
10397 std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
10398 [](int M) { return M >= 0; });
10399 std::sort(HiInputs.begin(), HiInputs.end());
10400 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10401 int NumLToL =
10402     std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10403 int NumHToL = LoInputs.size() - NumLToL;
10404 int NumLToH =
10405     std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10406 int NumHToH = HiInputs.size() - NumLToH;
10407 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10408 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10409 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10410 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10412 // If we are splatting two values from one half - one to each half, then
10413 // we can shuffle that half so each is splatted to a dword, then splat those
10414 // to their respective halves.
10415 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10416                       int DOffset) {
10417 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10418 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10419 V = DAG.getNode(ShufWOp, DL, VT, V,
10420 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10421 V = DAG.getBitcast(PSHUFDVT, V);
10422 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10423 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10424 return DAG.getBitcast(VT, V);
10425 };
10427 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10428 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10429 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10430 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
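// For example, Mask = {0, 0, 0, 0, 2, 2, 2, 2} takes the first branch:
// PSHUFLW splats words 0 and 2 into dwords 0 and 1, and the PSHUFD then
// copies dword 0 across the low half and dword 1 across the high half.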
10432 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10433 // such inputs we can swap two of the dwords across the half mark and end up
10434 // with <=2 inputs to each half in each half. Once there, we can fall through
10435 // to the generic code below. For example:
10437 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10438 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10440 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10441 // and an existing 2-into-2 on the other half. In this case we may have to
10442 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10443 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10444 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10445 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10446 // half than the one we target for fixing) will be fixed when we re-enter this
10447 // path. We will also combine away any sequence of PSHUFD instructions that
10448 // result into a single instruction. Here is an example of the tricky case:
10450 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10451 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10453 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10455 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10456 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10458 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10459 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10461 // The result is fine to be handled by the generic logic.
10462 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10463 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10464 int AOffset, int BOffset) {
10465 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10466 "Must call this with A having 3 or 1 inputs from the A half.");
10467 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10468 "Must call this with B having 1 or 3 inputs from the B half.");
10469 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10470 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10472 bool ThreeAInputs = AToAInputs.size() == 3;
10474 // Compute the index of dword with only one word among the three inputs in
10475 // a half by taking the sum of the half with three inputs and subtracting
10476 // the sum of the actual three inputs. The difference is the remaining
10477 // input.
10478 int ADWord, BDWord;
10479 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
10480 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
10481 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
10482 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
10483 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
10484 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
10485 int TripleNonInputIdx =
10486 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
10487 TripleDWord = TripleNonInputIdx / 2;
10489 // We use xor with one to compute the adjacent DWord to whichever one the
10490 // OneInput is in.
10491 OneInputDWord = (OneInput / 2) ^ 1;
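// For example, if OneInput is word 5 it lives in dword 2, so OneInputDWord
// becomes dword 3.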
10493 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
10494 // and BToA inputs. If there is also such a problem with the BToB and AToB
10495 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
10496 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
10497 // is essential that we don't *create* a 3<-1 as then we might oscillate.
10498 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
10499 // Compute how many inputs will be flipped by swapping these DWords. We
10500 // need to balance this to ensure we don't form a 3-1 shuffle in the
10502 // other half.
10503 int NumFlippedAToBInputs =
10504 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
10505 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
10506 int NumFlippedBToBInputs =
10507 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
10508 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
10509 if ((NumFlippedAToBInputs == 1 &&
10510 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
10511 (NumFlippedBToBInputs == 1 &&
10512 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
10513 // We choose whether to fix the A half or B half based on whether that
10514 // half has zero flipped inputs. At zero, we may not be able to fix it
10515 // with that half. We also bias towards fixing the B half because that
10516 // will more commonly be the high half, and we have to bias one way.
10517 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
10518 ArrayRef<int> Inputs) {
10519 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
10520 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
10521 // Determine whether the free index is in the flipped dword or the
10522 // unflipped dword based on where the pinned index is. We use this bit
10523 // in an xor to conditionally select the adjacent dword.
10524 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
10525 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10526 if (IsFixIdxInput == IsFixFreeIdxInput)
10527 FixFreeIdx += 1;
10528 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
10529 assert(IsFixIdxInput != IsFixFreeIdxInput &&
10530 "We need to be changing the number of flipped inputs!");
10531 int PSHUFHalfMask[] = {0, 1, 2, 3};
10532 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
10533 V = DAG.getNode(FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
10534                 VT, V,
10535 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10537 for (int &M : Mask)
10538 if (M >= 0 && M == FixIdx)
10539 M = FixFreeIdx;
10540 else if (M >= 0 && M == FixFreeIdx)
10541 M = FixIdx;
10542 };
10543 if (NumFlippedBToBInputs != 0) {
10544 int BPinnedIdx =
10545     BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
10546 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
10547 } else {
10548 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
10549 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
10550 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
10551 }
10552 }
10553 }
10555 int PSHUFDMask[] = {0, 1, 2, 3};
10556 PSHUFDMask[ADWord] = BDWord;
10557 PSHUFDMask[BDWord] = ADWord;
10558 V = DAG.getBitcast(
10559     VT,
10560 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10561 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10563 // Adjust the mask to match the new locations of A and B.
10564 for (int &M : Mask)
10565 if (M >= 0 && M/2 == ADWord)
10566 M = 2 * BDWord + M % 2;
10567 else if (M >= 0 && M/2 == BDWord)
10568 M = 2 * ADWord + M % 2;
10570 // Recurse back into this routine to re-compute state now that this isn't
10571 // a 3 and 1 problem.
10572 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
10573                                                   DAG);
10574 };
10575 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
10576 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
10577 else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
10578 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
10580 // At this point there are at most two inputs to the low and high halves from
10581 // each half. That means the inputs can always be grouped into dwords and
10582 // those dwords can then be moved to the correct half with a dword shuffle.
10583 // We use at most one low and one high word shuffle to collect these paired
10584 // inputs into dwords, and finally a dword shuffle to place them.
10585 int PSHUFLMask[4] = {-1, -1, -1, -1};
10586 int PSHUFHMask[4] = {-1, -1, -1, -1};
10587 int PSHUFDMask[4] = {-1, -1, -1, -1};
10589 // First fix the masks for all the inputs that are staying in their
10590 // original halves. This will then dictate the targets of the cross-half
10591 // shuffles.
10592 auto fixInPlaceInputs =
10593 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
10594 MutableArrayRef<int> SourceHalfMask,
10595 MutableArrayRef<int> HalfMask, int HalfOffset) {
10596 if (InPlaceInputs.empty())
10597 return;
10598 if (InPlaceInputs.size() == 1) {
10599 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10600 InPlaceInputs[0] - HalfOffset;
10601 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
10602 return;
10603 }
10604 if (IncomingInputs.empty()) {
10605 // Just fix all of the in place inputs.
10606 for (int Input : InPlaceInputs) {
10607 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
10608 PSHUFDMask[Input / 2] = Input / 2;
10609 }
10610 return;
10611 }
10613 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
10614 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
10615 InPlaceInputs[0] - HalfOffset;
10616 // Put the second input next to the first so that they are packed into
10617 // a dword. We find the adjacent index by toggling the low bit.
10618 int AdjIndex = InPlaceInputs[0] ^ 1;
10619 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
10620 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
10621 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
10622 };
10623 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
10624 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
10626 // Now gather the cross-half inputs and place them into a free dword of
10627 // their target half.
10628 // FIXME: This operation could almost certainly be simplified dramatically to
10629 // look more like the 3-1 fixing operation.
10630 auto moveInputsToRightHalf = [&PSHUFDMask](
10631 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
10632 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
10633 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
10634 int DestOffset) {
10635 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
10636 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
10638 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
10639                                            int Word) {
10640 int LowWord = Word & ~1;
10641 int HighWord = Word | 1;
10642 return isWordClobbered(SourceHalfMask, LowWord) ||
10643 isWordClobbered(SourceHalfMask, HighWord);
10644 };
10646 if (IncomingInputs.empty())
10647 return;
10649 if (ExistingInputs.empty()) {
10650 // Map any dwords with inputs from them into the right half.
10651 for (int Input : IncomingInputs) {
10652 // If the source half mask maps over the inputs, turn those into
10653 // swaps and use the swapped lane.
10654 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
10655 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
10656 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
10657 Input - SourceOffset;
10658 // We have to swap the uses in our half mask in one sweep.
10659 for (int &M : HalfMask)
10660 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
10661 M = Input;
10662 else if (M == Input)
10663 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10664 } else {
10665 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
10666 Input - SourceOffset &&
10667 "Previous placement doesn't match!");
10669 // Note that this correctly re-maps both when we do a swap and when
10670 // we observe the other side of the swap above. We rely on that to
10671 // avoid swapping the members of the input list directly.
10672 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
10673 }
10675 // Map the input's dword into the correct half.
10676 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
10677 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
10678 else
10679 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
10680        Input / 2 &&
10681        "Previous placement doesn't match!");
10682 }
10684 // And just directly shift any other-half mask elements to be same-half
10685 // as we will have mirrored the dword containing the element into the
10686 // same position within that half.
10687 for (int &M : HalfMask)
10688 if (M >= SourceOffset && M < SourceOffset + 4) {
10689 M = M - SourceOffset + DestOffset;
10690 assert(M >= 0 && "This should never wrap below zero!");
10691 }
10692 return;
10693 }
10695 // Ensure we have the input in a viable dword of its current half. This
10696 // is particularly tricky because the original position may be clobbered
10697 // by inputs being moved and *staying* in that half.
10698 if (IncomingInputs.size() == 1) {
10699 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10700 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
10701                  SourceOffset;
10702 SourceHalfMask[InputFixed - SourceOffset] =
10703 IncomingInputs[0] - SourceOffset;
10704 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
10705              InputFixed);
10706 IncomingInputs[0] = InputFixed;
10707 }
10708 } else if (IncomingInputs.size() == 2) {
10709 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
10710 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
10711 // We have two non-adjacent or clobbered inputs we need to extract from
10712 // the source half. To do this, we need to map them into some adjacent
10713 // dword slot in the source mask.
10714 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
10715 IncomingInputs[1] - SourceOffset};
10717 // If there is a free slot in the source half mask adjacent to one of
10718 // the inputs, place the other input in it. We use (Index XOR 1) to
10719 // compute an adjacent index.
10720 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
10721 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
10722 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
10723 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10724 InputsFixed[1] = InputsFixed[0] ^ 1;
10725 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
10726 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
10727 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
10728 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
10729 InputsFixed[0] = InputsFixed[1] ^ 1;
10730 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
10731 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
10732 // The two inputs are in the same DWord but it is clobbered and the
10733 // adjacent DWord isn't used at all. Move both inputs to the free
10734 // slot.
10735 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
10736 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
10737 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
10738 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
10739 } else {
10740 // The only way we hit this point is if there is no clobbering
10741 // (because there are no off-half inputs to this half) and there is no
10742 // free slot adjacent to one of the inputs. In this case, we have to
10743 // swap an input with a non-input.
10744 for (int i = 0; i < 4; ++i)
10745 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
10746 "We can't handle any clobbers here!");
10747 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
10748 "Cannot have adjacent inputs here!");
10750 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
10751 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
10753 // We also have to update the final source mask in this case because
10754 // it may need to undo the above swap.
10755 for (int &M : FinalSourceHalfMask)
10756 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
10757 M = InputsFixed[1] + SourceOffset;
10758 else if (M == InputsFixed[1] + SourceOffset)
10759 M = (InputsFixed[0] ^ 1) + SourceOffset;
10761 InputsFixed[1] = InputsFixed[0] ^ 1;
10762 }
10764 // Point everything at the fixed inputs.
10765 for (int &M : HalfMask)
10766 if (M == IncomingInputs[0])
10767 M = InputsFixed[0] + SourceOffset;
10768 else if (M == IncomingInputs[1])
10769 M = InputsFixed[1] + SourceOffset;
10771 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
10772 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
10775 llvm_unreachable("Unhandled input size!");
10778 // Now hoist the DWord down to the right half.
10779 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
10780 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
10781 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
10782 for (int &M : HalfMask)
10783 for (int Input : IncomingInputs)
10785 M = FreeDWord * 2 + Input % 2;
10787 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
10788 /*SourceOffset*/ 4, /*DestOffset*/ 0);
10789 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
10790 /*SourceOffset*/ 0, /*DestOffset*/ 4);
10792 // Now enact all the shuffles we've computed to move the inputs into their target half.
10794 if (!isNoopShuffleMask(PSHUFLMask))
10795 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10796 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
10797 if (!isNoopShuffleMask(PSHUFHMask))
10798 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10799 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
10800 if (!isNoopShuffleMask(PSHUFDMask))
10801 V = DAG.getBitcast(
10803 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
10804 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
10806 // At this point, each half should contain all its inputs, and we can then
10807 // just shuffle them into their final position.
10808 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
10809 "Failed to lift all the high half inputs to the low mask!");
10810 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
10811 "Failed to lift all the low half inputs to the high mask!");
10813 // Do a half shuffle for the low mask.
10814 if (!isNoopShuffleMask(LoMask))
10815 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
10816 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
10818 // Do a half shuffle with the high mask after shifting its values down.
10819 for (int &M : HiMask)
10822 if (!isNoopShuffleMask(HiMask))
10823 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
10824 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
10829 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
10830 /// blend if only one input is used.
10831 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
10832 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
10833 const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
10835 SDValue V1Mask[16];
10836 SDValue V2Mask[16];
10840 int Size = Mask.size();
10841 int Scale = 16 / Size;
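// Scale is how many output bytes each incoming mask element covers: 2 when the
// mask is for v8i16 and 1 when it is already a v16i8 mask.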
10842 for (int i = 0; i < 16; ++i) {
10843 if (Mask[i / Scale] < 0) {
10844 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
10846 const int ZeroMask = 0x80;
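// PSHUFB treats any selector byte with its high bit set as "write zero", so
// 0x80 forces the corresponding output byte to zero regardless of the input.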
10847 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
10849 int V2Idx = Mask[i / Scale] < Size
10851 : (Mask[i / Scale] - Size) * Scale + i % Scale;
10852 if (Zeroable[i / Scale])
10853 V1Idx = V2Idx = ZeroMask;
10854 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
10855 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
10856 V1InUse |= (ZeroMask != V1Idx);
10857 V2InUse |= (ZeroMask != V2Idx);
10862 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10863 DAG.getBitcast(MVT::v16i8, V1),
10864 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
10866 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
10867 DAG.getBitcast(MVT::v16i8, V2),
10868 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
10870 // If we need shuffled inputs from both, blend the two.
10872 if (V1InUse && V2InUse)
10873 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
10875 V = V1InUse ? V1 : V2;
10877 // Cast the result back to the correct type.
10878 return DAG.getBitcast(VT, V);
10881 /// \brief Generic lowering of 8-lane i16 shuffles.
10883 /// This handles both single-input shuffles and combined shuffle/blends with
10884 /// two inputs. The single input shuffles are immediately delegated to
10885 /// a dedicated lowering routine.
10887 /// The blends are lowered in one of three fundamental ways. If there are few
10888 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
10889 /// of the input is significantly cheaper when lowered as an interleaving of
10890 /// the two inputs, try to interleave them. Otherwise, blend the low and high
10891 /// halves of the inputs separately (making them have relatively few inputs)
10892 /// and then concatenate them.
10893 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10894 const SmallBitVector &Zeroable,
10895 SDValue V1, SDValue V2,
10896 const X86Subtarget &Subtarget,
10897 SelectionDAG &DAG) {
10898 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10899 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
10900 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
10902 // Whenever we can lower this as a zext, that instruction is strictly faster
10903 // than any alternative.
10904 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10905 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10908 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
10910 if (NumV2Inputs == 0) {
10911 // Check for being able to broadcast a single element.
10912 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10913 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10916 // Try to use shift instructions.
10917 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
10918 Zeroable, Subtarget, DAG))
10921 // Use dedicated unpack instructions for masks that match their pattern.
10923 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10926 // Try to use byte rotation instructions.
10927 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
10928 Mask, Subtarget, DAG))
10931 // Make a copy of the mask so it can be modified.
10932 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
10933 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
10934 MutableMask, Subtarget,
10938 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
10939 "All single-input shuffles should be canonicalized to be V1-input "
10942 // Try to use shift instructions.
10943 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
10944 Zeroable, Subtarget, DAG))
10947 // See if we can use SSE4A Extraction / Insertion.
10948 if (Subtarget.hasSSE4A())
10949 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
10953 // There are special ways we can lower some single-element blends.
10954 if (NumV2Inputs == 1)
10955 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10956 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
10959 // We have different paths for blend lowering, but they all must use the
10960 // *exact* same predicate.
10961 bool IsBlendSupported = Subtarget.hasSSE41();
10962 if (IsBlendSupported)
10963 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
10964 Zeroable, Subtarget, DAG))
10967 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
10971 // Use dedicated unpack instructions for masks that match their pattern.
10973 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
10976 // Try to use byte rotation instructions.
10977 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10978 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
10981 if (SDValue BitBlend =
10982 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
10985 // Try to lower by permuting the inputs into an unpack instruction.
10986 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
10990 // If we can't directly blend but can use PSHUFB, that will be better as it
10991 // can both shuffle and set up the inefficient blend.
10992 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
10993 bool V1InUse, V2InUse;
10994 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
10995 Zeroable, DAG, V1InUse, V2InUse);
10998 // We can always bit-blend if we have to so the fallback strategy is to
10999 // decompose into single-input permutes and blends.
11000 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11004 /// \brief Check whether a compaction lowering can be done by dropping even
11005 /// elements and compute how many times even elements must be dropped.
11007 /// This handles shuffles which take every (2^N)th element, i.e. elements at a
11008 /// power-of-two stride. Example shuffle masks:
11010 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11011 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11012 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11013 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11014 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11015 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11017 /// Any of these lanes can of course be undef.
11019 /// This routine only supports N <= 3.
11020 /// FIXME: Evaluate whether either AVX or AVX-512 has any opportunities here
11023 /// \returns N above (the number of times even elements must be dropped) if
11024 /// there is such a number. Otherwise returns zero.
11025 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11026 bool IsSingleInput) {
11027 // The modulus for the shuffle vector entries is based on whether this is
11028 // a single input or not.
11029 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11030 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11031 "We should only be called with masks with a power-of-2 size!");
11033 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11035 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11036 // and 2^3 simultaneously. This is because we may have ambiguity with
11037 // partially undef inputs.
11038 bool ViableForN[3] = {true, true, true};
11040 for (int i = 0, e = Mask.size(); i < e; ++i) {
11041 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11046 bool IsAnyViable = false;
11047 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11048 if (ViableForN[j]) {
11049 uint64_t N = j + 1;
11051 // The shuffle mask must be equal to (i * 2^N) % ShuffleModulus.
11052 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11053 IsAnyViable = true;
11055 ViableForN[j] = false;
11057 // Early exit if we exhaust the possible powers of two.
11062 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11066 // Return 0 as there is no viable power of two.
11070 /// \brief Generic lowering of v16i8 shuffles.
11072 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11073 /// detect any complexity-reducing interleaving. If that doesn't help, it uses
11074 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11075 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them back together.
11077 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11078 const SmallBitVector &Zeroable,
11079 SDValue V1, SDValue V2,
11080 const X86Subtarget &Subtarget,
11081 SelectionDAG &DAG) {
11082 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11083 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11084 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11086 // Try to use shift instructions.
11087 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11088 Zeroable, Subtarget, DAG))
11091 // Try to use byte rotation instructions.
11092 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11093 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11096 // Try to use a zext lowering.
11097 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11098 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11101 // See if we can use SSE4A Extraction / Insertion.
11102 if (Subtarget.hasSSE4A())
11103 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11107 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11109 // For single-input shuffles, there are some nicer lowering tricks we can use.
11110 if (NumV2Elements == 0) {
11111 // Check for being able to broadcast a single element.
11112 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11113 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11116 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11117 // Notably, this handles splat and partial-splat shuffles more efficiently.
11118 // However, it only makes sense if the pre-duplication shuffle simplifies
11119 // things significantly. Currently, this means we need to be able to
11120 // express the pre-duplication shuffle as an i16 shuffle.
11122 // FIXME: We should check for other patterns which can be widened into an
11123 // i16 shuffle as well.
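// For example, the byte mask <0,0, 3,3, 2,2, 1,1, ...> passes this check: each
// byte pair shares a single source byte, so it can be expressed as an i16
// shuffle followed by duplicating each byte of the result.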
11124 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11125 for (int i = 0; i < 16; i += 2)
11126 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11131 auto tryToWidenViaDuplication = [&]() -> SDValue {
11132 if (!canWidenViaDuplication(Mask))
11134 SmallVector<int, 4> LoInputs;
11135 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
11136 [](int M) { return M >= 0 && M < 8; });
11137 std::sort(LoInputs.begin(), LoInputs.end());
11138 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11140 SmallVector<int, 4> HiInputs;
11141 std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
11142 [](int M) { return M >= 8; });
11143 std::sort(HiInputs.begin(), HiInputs.end());
11144 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11147 bool TargetLo = LoInputs.size() >= HiInputs.size();
11148 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11149 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11151 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11152 SmallDenseMap<int, int, 8> LaneMap;
11153 for (int I : InPlaceInputs) {
11154 PreDupI16Shuffle[I/2] = I/2;
11157 int j = TargetLo ? 0 : 4, je = j + 4;
11158 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11159 // Check if j is already a shuffle of this input. This happens when
11160 // there are two adjacent bytes after we move the low one.
11161 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11162 // If we haven't yet mapped the input, search for a slot into which
11164 while (j < je && PreDupI16Shuffle[j] >= 0)
11168 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11171 // Map this input with the i16 shuffle.
11172 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11175 // Update the lane map based on the mapping we ended up with.
11176 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11178 V1 = DAG.getBitcast(
11180 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11181 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11183 // Unpack the bytes to form the i16s that will be shuffled into place.
11184 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11185 MVT::v16i8, V1, V1);
11187 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11188 for (int i = 0; i < 16; ++i)
11189 if (Mask[i] >= 0) {
11190 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11191 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11192 if (PostDupI16Shuffle[i / 2] < 0)
11193 PostDupI16Shuffle[i / 2] = MappedMask;
11195 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11196 "Conflicting entrties in the original shuffle!");
11198 return DAG.getBitcast(
11200 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11201 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11203 if (SDValue V = tryToWidenViaDuplication())
11207 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11211 // Use dedicated unpack instructions for masks that match their pattern.
11213 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11216 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11217 // with PSHUFB. It is important to do this before we attempt to generate any
11218 // blends but after all of the single-input lowerings. If the single input
11219 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11220 // want to preserve that and we can DAG combine any longer sequences into
11221 // a PSHUFB in the end. But once we start blending from multiple inputs,
11222 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11223 // and there are *very* few patterns that would actually be faster than the
11224 // PSHUFB approach because of its ability to zero lanes.
11226 // FIXME: The only exceptions to the above are blends which are exact
11227 // interleavings with direct instructions supporting them. We currently don't
11228 // handle those well here.
11229 if (Subtarget.hasSSSE3()) {
11230 bool V1InUse = false;
11231 bool V2InUse = false;
11233 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11234 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11236 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11237 // do so. This avoids using them to handle blends-with-zero which is
11238 // important as a single pshufb is significantly faster for that.
11239 if (V1InUse && V2InUse) {
11240 if (Subtarget.hasSSE41())
11241 if (SDValue Blend = lowerVectorShuffleAsBlend(
11242 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11245 // We can use an unpack to do the blending rather than an OR in some
11246 // cases. Even though the OR may be (very slightly) more efficient, we
11247 // prefer this lowering because there are common cases where part of
11248 // the complexity of the shuffles goes away when we do the final blend as an unpack.
11250 // FIXME: It might be worth trying to detect if the unpack-feeding
11251 // shuffles will both be pshufb, in which case we shouldn't bother with the unpack.
11253 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11254 DL, MVT::v16i8, V1, V2, Mask, DAG))
11261 // There are special ways we can lower some single-element blends.
11262 if (NumV2Elements == 1)
11263 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11264 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11267 if (SDValue BitBlend =
11268 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11271 // Check whether a compaction lowering can be done. This handles shuffles
11272 // which take every Nth element for some even N. See the helper function for
11275 // We special case these as they can be particularly efficiently handled with
11276 // the PACKUSWB instruction on x86 and they show up in common patterns of
11277 // rearranging bytes to truncate wide elements.
11278 bool IsSingleInput = V2.isUndef();
11279 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11280 // NumEvenDrops is the log2 of the stride between kept elements. Another way of
11281 // thinking about it is that we need to drop the even elements this many
11282 // times to get the original input.
11284 // First we need to zero all the dropped bytes.
11285 assert(NumEvenDrops <= 3 &&
11286 "No support for dropping even elements more than 3 times.");
11287 // We use the mask type to pick which bytes are preserved based on how many
11288 // elements are dropped.
11289 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11290 SDValue ByteClearMask = DAG.getBitcast(
11291 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
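// For NumEvenDrops == 1 this is the v8i16 splat of 0x00FF bitcast to bytes,
// i.e. the pattern <0xFF,0x00,0xFF,0x00,...>, which keeps the even bytes and
// clears the odd bytes before the PACKUS below.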
11292 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11293 if (!IsSingleInput)
11294 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11296 // Now pack things back together.
11297 V1 = DAG.getBitcast(MVT::v8i16, V1);
11298 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11299 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11300 for (int i = 1; i < NumEvenDrops; ++i) {
11301 Result = DAG.getBitcast(MVT::v8i16, Result);
11302 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11308 // Handle multi-input cases by blending single-input shuffles.
11309 if (NumV2Elements > 0)
11310 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11313 // The fallback path for single-input shuffles widens this into two v8i16
11314 // vectors with unpacks, shuffles those, and then pulls them back together with a pack.
11318 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11319 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11320 for (int i = 0; i < 16; ++i)
11322 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11324 SDValue VLoHalf, VHiHalf;
11325 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11326 // them out and avoid using UNPCK{L,H} to extract the elements of V as i16s.
11328 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11329 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11330 // Use a mask to drop the high bytes.
11331 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11332 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11333 DAG.getConstant(0x00FF, DL, MVT::v8i16));
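// After this AND each i16 lane of VLoHalf holds its even byte zero-extended,
// which is what lets the masks below be squashed to index VLoHalf directly.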
11335 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11336 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11338 // Squash the masks to point directly into VLoHalf.
11339 for (int &M : LoBlendMask)
11342 for (int &M : HiBlendMask)
11346 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11347 // VHiHalf so that we can blend them as i16s.
11348 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11350 VLoHalf = DAG.getBitcast(
11351 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11352 VHiHalf = DAG.getBitcast(
11353 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11356 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11357 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11359 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11362 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11364 /// This routine breaks down the specific type of 128-bit shuffle and
11365 /// dispatches to the lowering routines accordingly.
11366 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11367 MVT VT, SDValue V1, SDValue V2,
11368 const SmallBitVector &Zeroable,
11369 const X86Subtarget &Subtarget,
11370 SelectionDAG &DAG) {
11371 switch (VT.SimpleTy) {
11373 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11375 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11377 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11379 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11381 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11383 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11386 llvm_unreachable("Unimplemented!");
11390 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11392 /// This routine just extracts two subvectors, shuffles them independently, and
11393 /// then concatenates them back together. This should work effectively with all
11394 /// AVX vector shuffle types.
11395 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11396 SDValue V2, ArrayRef<int> Mask,
11397 SelectionDAG &DAG) {
11398 assert(VT.getSizeInBits() >= 256 &&
11399 "Only for 256-bit or wider vector shuffles!");
11400 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11401 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11403 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11404 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11406 int NumElements = VT.getVectorNumElements();
11407 int SplitNumElements = NumElements / 2;
11408 MVT ScalarVT = VT.getVectorElementType();
11409 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11411 // Rather than splitting build-vectors, just build two narrower build
11412 // vectors. This helps shuffling with splats and zeros.
11413 auto SplitVector = [&](SDValue V) {
11414 V = peekThroughBitcasts(V);
11416 MVT OrigVT = V.getSimpleValueType();
11417 int OrigNumElements = OrigVT.getVectorNumElements();
11418 int OrigSplitNumElements = OrigNumElements / 2;
11419 MVT OrigScalarVT = OrigVT.getVectorElementType();
11420 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11424 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11426 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11427 DAG.getIntPtrConstant(0, DL));
11428 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11429 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11432 SmallVector<SDValue, 16> LoOps, HiOps;
11433 for (int i = 0; i < OrigSplitNumElements; ++i) {
11434 LoOps.push_back(BV->getOperand(i));
11435 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11437 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11438 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11440 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11441 DAG.getBitcast(SplitVT, HiV));
11444 SDValue LoV1, HiV1, LoV2, HiV2;
11445 std::tie(LoV1, HiV1) = SplitVector(V1);
11446 std::tie(LoV2, HiV2) = SplitVector(V2);
11448 // Now create two 4-way blends of these half-width vectors.
11449 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11450 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11451 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11452 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11453 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11454 for (int i = 0; i < SplitNumElements; ++i) {
11455 int M = HalfMask[i];
11456 if (M >= NumElements) {
11457 if (M >= NumElements + SplitNumElements)
11461 V2BlendMask[i] = M - NumElements;
11462 BlendMask[i] = SplitNumElements + i;
11463 } else if (M >= 0) {
11464 if (M >= SplitNumElements)
11468 V1BlendMask[i] = M;
11473 // Because the lowering happens after all combining takes place, we need to
11474 // manually combine these blend masks as much as possible so that we create
11475 // a minimal number of high-level vector shuffle nodes.
11477 // First try just blending the halves of V1 or V2.
11478 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
11479 return DAG.getUNDEF(SplitVT);
11480 if (!UseLoV2 && !UseHiV2)
11481 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11482 if (!UseLoV1 && !UseHiV1)
11483 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11485 SDValue V1Blend, V2Blend;
11486 if (UseLoV1 && UseHiV1) {
11488 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
11490 // We only use half of V1 so map the usage down into the final blend mask.
11491 V1Blend = UseLoV1 ? LoV1 : HiV1;
11492 for (int i = 0; i < SplitNumElements; ++i)
11493 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
11494 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
11496 if (UseLoV2 && UseHiV2) {
11498 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
11500 // We only use half of V2 so map the usage down into the final blend mask.
11501 V2Blend = UseLoV2 ? LoV2 : HiV2;
11502 for (int i = 0; i < SplitNumElements; ++i)
11503 if (BlendMask[i] >= SplitNumElements)
11504 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
11506 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
11508 SDValue Lo = HalfBlend(LoMask);
11509 SDValue Hi = HalfBlend(HiMask);
11510 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
11513 /// \brief Either split a vector in halves or decompose the shuffles and the
11516 /// This is provided as a good fallback for many lowerings of non-single-input
11517 /// shuffles with more than one 128-bit lane. In those cases, we want to select
11518 /// between splitting the shuffle into 128-bit components and stitching those
11519 /// back together vs. extracting the single-input shuffles and blending those results.
11521 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
11522 SDValue V1, SDValue V2,
11523 ArrayRef<int> Mask,
11524 SelectionDAG &DAG) {
11525 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
11526 "shuffles as it could then recurse on itself.");
11527 int Size = Mask.size();
11529 // If this can be modeled as a broadcast of two elements followed by a blend,
11530 // prefer that lowering. This is especially important because broadcasts can
11531 // often fold with memory operands.
11532 auto DoBothBroadcast = [&] {
11533 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
11536 if (V2BroadcastIdx < 0)
11537 V2BroadcastIdx = M - Size;
11538 else if (M - Size != V2BroadcastIdx)
11540 } else if (M >= 0) {
11541 if (V1BroadcastIdx < 0)
11542 V1BroadcastIdx = M;
11543 else if (M != V1BroadcastIdx)
11548 if (DoBothBroadcast())
11549 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
11552 // If the inputs all stem from a single 128-bit lane of each input, then we
11553 // split them rather than blending because the split will decompose to
11554 // unusually few instructions.
11555 int LaneCount = VT.getSizeInBits() / 128;
11556 int LaneSize = Size / LaneCount;
11557 SmallBitVector LaneInputs[2];
11558 LaneInputs[0].resize(LaneCount, false);
11559 LaneInputs[1].resize(LaneCount, false);
11560 for (int i = 0; i < Size; ++i)
11562 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
11563 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
11564 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11566 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
11567 // that the decomposed single-input shuffles don't end up here.
11568 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
11571 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
11572 /// a permutation and blend of those lanes.
11574 /// This essentially blends the out-of-lane inputs to each lane into the lane
11575 /// from a permuted copy of the vector. This lowering strategy results in four
11576 /// instructions in the worst case for a single-input cross lane shuffle which
11577 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
11578 /// of. Special cases for each particular shuffle pattern should be handled
11579 /// prior to trying this lowering.
11580 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
11581 SDValue V1, SDValue V2,
11582 ArrayRef<int> Mask,
11583 SelectionDAG &DAG) {
11584 // FIXME: This should probably be generalized for 512-bit vectors as well.
11585 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
11586 int Size = Mask.size();
11587 int LaneSize = Size / 2;
11589 // If there are only inputs from one 128-bit lane, splitting will in fact be
11590 // less expensive. The flags track whether the given lane contains an element
11591 // that crosses to another lane.
11592 bool LaneCrossing[2] = {false, false};
11593 for (int i = 0; i < Size; ++i)
11594 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
11595 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
11596 if (!LaneCrossing[0] || !LaneCrossing[1])
11597 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
11599 assert(V2.isUndef() &&
11600 "This last part of this routine only works on single input shuffles");
11602 SmallVector<int, 32> FlippedBlendMask(Size);
11603 for (int i = 0; i < Size; ++i)
11604 FlippedBlendMask[i] =
11605 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
11607 : Mask[i] % LaneSize +
11608 (i / LaneSize) * LaneSize + Size);
11610 // Flip the vector, and blend the results which should now be in-lane. The
11611 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
11612 // 5 for the high source. The value 3 selects the high half of source 2 and
11613 // the value 2 selects the low half of source 2. We only use source 2 to
11614 // allow folding it into a memory operand.
11615 unsigned PERMMask = 3 | 2 << 4;
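// PERMMask evaluates to 0x23: the result's low 128 bits take selector 3 (the
// high half of source 2, which is V1 here) and its high 128 bits take selector
// 2 (the low half of source 2), so Flipped is V1 with its halves swapped.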
11616 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
11617 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
11618 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
11621 /// \brief Handle lowering 2-lane 128-bit shuffles.
11622 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11623 SDValue V2, ArrayRef<int> Mask,
11624 const SmallBitVector &Zeroable,
11625 const X86Subtarget &Subtarget,
11626 SelectionDAG &DAG) {
11627 SmallVector<int, 4> WidenedMask;
11628 if (!canWidenShuffleElements(Mask, WidenedMask))
11631 // TODO: If minimizing size and one of the inputs is a zero vector and the
11632 // zero vector has only one use, we could use a VPERM2X128 to save the
11633 // instruction bytes needed to explicitly generate the zero vector.
11635 // Blends are faster and handle all the non-lane-crossing cases.
11636 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
11637 Zeroable, Subtarget, DAG))
11640 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
11641 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
11643 // If either input operand is a zero vector, use VPERM2X128 because its mask
11644 // allows us to replace the zero input with an implicit zero.
11645 if (!IsV1Zero && !IsV2Zero) {
11646 // Check for patterns which can be matched with a single insert of a 128-bit subvector.
11648 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
11649 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
11650 // With AVX2 we should use VPERMQ/VPERMPD to allow memory folding.
11651 if (Subtarget.hasAVX2() && V2.isUndef())
11654 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
11655 VT.getVectorNumElements() / 2);
11656 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
11657 DAG.getIntPtrConstant(0, DL));
11658 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
11659 OnlyUsesV1 ? V1 : V2,
11660 DAG.getIntPtrConstant(0, DL));
11661 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
11665 // Otherwise form a 128-bit permutation. After accounting for undefs,
11666 // convert the 64-bit shuffle mask selection values into 128-bit
11667 // selection bits by dividing the indexes by 2 and shifting into positions
11668 // defined by a vperm2*128 instruction's immediate control byte.
11670 // The immediate permute control byte looks like this:
11671 // [1:0] - select 128 bits from sources for low half of destination
11673 // [3] - zero low half of destination
11674 // [5:4] - select 128 bits from sources for high half of destination
11676 // [7] - zero high half of destination
11678 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
11679 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
11681 unsigned PermMask = MaskLO | (MaskHI << 4);
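// For example, WidenedMask == {1, 2} (high 128 bits of V1 followed by low 128
// bits of V2) gives PermMask == 0x21 for the VPERM2X128 node formed below.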
11683 // If either input is a zero vector, replace it with an undef input.
11684 // Shuffle mask values < 4 are selecting elements of V1.
11685 // Shuffle mask values >= 4 are selecting elements of V2.
11686 // Adjust each half of the permute mask by clearing the half that was
11687 // selecting the zero vector and setting the zero mask bit.
11689 V1 = DAG.getUNDEF(VT);
11691 PermMask = (PermMask & 0xf0) | 0x08;
11693 PermMask = (PermMask & 0x0f) | 0x80;
11696 V2 = DAG.getUNDEF(VT);
11698 PermMask = (PermMask & 0xf0) | 0x08;
11700 PermMask = (PermMask & 0x0f) | 0x80;
11703 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
11704 DAG.getConstant(PermMask, DL, MVT::i8));
11707 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
11708 /// shuffling each lane.
11710 /// This will only succeed when the result of fixing the 128-bit lanes results
11711 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
11712 /// each 128-bit lane. This handles many cases where we can quickly blend away
11713 /// the lane crosses early and then use simpler shuffles within each lane.
11715 /// FIXME: It might be worthwhile at some point to support this without
11716 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
11717 /// in x86 only floating point has interesting non-repeating shuffles, and even
11718 /// those are still *marginally* more expensive.
11719 static SDValue lowerVectorShuffleByMerging128BitLanes(
11720 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11721 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11722 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
11724 int Size = Mask.size();
11725 int LaneSize = 128 / VT.getScalarSizeInBits();
11726 int NumLanes = Size / LaneSize;
11727 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
11729 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
11730 // check whether the in-128-bit lane shuffles share a repeating pattern.
11731 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
11732 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
11733 for (int i = 0; i < Size; ++i) {
11737 int j = i / LaneSize;
11739 if (Lanes[j] < 0) {
11740 // First entry we've seen for this lane.
11741 Lanes[j] = Mask[i] / LaneSize;
11742 } else if (Lanes[j] != Mask[i] / LaneSize) {
11743 // This doesn't match the lane selected previously!
11747 // Check that within each lane we have a consistent shuffle mask.
11748 int k = i % LaneSize;
11749 if (InLaneMask[k] < 0) {
11750 InLaneMask[k] = Mask[i] % LaneSize;
11751 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
11752 // This doesn't fit a repeating in-lane mask.
11757 // First shuffle the lanes into place.
11758 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
11759 VT.getSizeInBits() / 64);
11760 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
11761 for (int i = 0; i < NumLanes; ++i)
11762 if (Lanes[i] >= 0) {
11763 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
11764 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
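// For instance, a v8f32 shuffle whose two destination lanes draw from source
// lanes {1, 0} produces the v4i64 lane mask <2, 3, 0, 1>, a 128-bit swap.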
11767 V1 = DAG.getBitcast(LaneVT, V1);
11768 V2 = DAG.getBitcast(LaneVT, V2);
11769 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
11771 // Cast it back to the type we actually want.
11772 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
11774 // Now do a simple shuffle that isn't lane crossing.
11775 SmallVector<int, 8> NewMask((unsigned)Size, -1);
11776 for (int i = 0; i < Size; ++i)
11778 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
11779 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
11780 "Must not introduce lane crosses at this point!");
11782 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
11785 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
11786 /// This allows for fast cases such as subvector extraction/insertion
11787 /// or shuffling smaller vector types which can lower more efficiently.
11788 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
11789 SDValue V1, SDValue V2,
11790 ArrayRef<int> Mask,
11791 const X86Subtarget &Subtarget,
11792 SelectionDAG &DAG) {
11793 assert(VT.is256BitVector() && "Expected 256-bit vector");
11795 unsigned NumElts = VT.getVectorNumElements();
11796 unsigned HalfNumElts = NumElts / 2;
11797 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
11799 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
11800 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
11801 if (!UndefLower && !UndefUpper)
11804 // Upper half is undef and lower half is whole upper subvector.
11805 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
11807 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
11808 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11809 DAG.getIntPtrConstant(HalfNumElts, DL));
11810 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11811 DAG.getIntPtrConstant(0, DL));
11814 // Lower half is undef and upper half is whole lower subvector.
11815 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
11817 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
11818 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
11819 DAG.getIntPtrConstant(0, DL));
11820 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
11821 DAG.getIntPtrConstant(HalfNumElts, DL));
11824 // If the shuffle only uses two of the four halves of the input operands,
11825 // then extract them and perform the 'half' shuffle at half width.
11826 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
11827 int HalfIdx1 = -1, HalfIdx2 = -1;
11828 SmallVector<int, 8> HalfMask(HalfNumElts);
11829 unsigned Offset = UndefLower ? HalfNumElts : 0;
11830 for (unsigned i = 0; i != HalfNumElts; ++i) {
11831 int M = Mask[i + Offset];
11837 // Determine which of the 4 half vectors this element is from.
11838 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
11839 int HalfIdx = M / HalfNumElts;
11841 // Determine the element index into its half vector source.
11842 int HalfElt = M % HalfNumElts;
11844 // We can shuffle with up to 2 half vectors, set the new 'half'
11845 // shuffle mask accordingly.
11846 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
11847 HalfMask[i] = HalfElt;
11848 HalfIdx1 = HalfIdx;
11851 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
11852 HalfMask[i] = HalfElt + HalfNumElts;
11853 HalfIdx2 = HalfIdx;
11857 // Too many half vectors referenced.
11860 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
11862 // Only shuffle the halves of the inputs when useful.
11863 int NumLowerHalves =
11864 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
11865 int NumUpperHalves =
11866 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
11868 // uuuuXXXX - don't extract uppers just to insert again.
11869 if (UndefLower && NumUpperHalves != 0)
11872 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
11873 if (UndefUpper && NumUpperHalves == 2)
11876 // AVX2 - XXXXuuuu - always extract lowers.
11877 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
11878 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
11879 if (VT == MVT::v4f64 || VT == MVT::v4i64)
11881 // AVX2 supports variable 32-bit element cross-lane shuffles.
11882 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
11883 // XXXXuuuu - don't extract lowers and uppers.
11884 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
11889 auto GetHalfVector = [&](int HalfIdx) {
11891 return DAG.getUNDEF(HalfVT);
11892 SDValue V = (HalfIdx < 2 ? V1 : V2);
11893 HalfIdx = (HalfIdx % 2) * HalfNumElts;
11894 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
11895 DAG.getIntPtrConstant(HalfIdx, DL));
11898 SDValue Half1 = GetHalfVector(HalfIdx1);
11899 SDValue Half2 = GetHalfVector(HalfIdx2);
11900 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
11901 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
11902 DAG.getIntPtrConstant(Offset, DL));
11905 /// \brief Test whether the specified input (0 or 1) is in-place blended by the given mask.
11908 /// This returns true if the elements from a particular input are already in the
11909 /// slot required by the given mask and require no permutation.
11910 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
11911 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
11912 int Size = Mask.size();
11913 for (int i = 0; i < Size; ++i)
11914 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
11920 /// Handle case where shuffle sources are coming from the same 128-bit lane and
11921 /// every lane can be represented as the same repeating mask - allowing us to
11922 /// shuffle the sources with the repeating shuffle and then permute the result
11923 /// to the destination lanes.
11924 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
11925 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11926 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
11927 int NumElts = VT.getVectorNumElements();
11928 int NumLanes = VT.getSizeInBits() / 128;
11929 int NumLaneElts = NumElts / NumLanes;
11931 // On AVX2 we may be able to just shuffle the lowest elements and then
11932 // broadcast the result.
11933 if (Subtarget.hasAVX2()) {
11934 for (unsigned BroadcastSize : {16, 32, 64}) {
11935 if (BroadcastSize <= VT.getScalarSizeInBits())
11937 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
11939 // Attempt to match a repeating pattern every NumBroadcastElts,
11940 // accounting for UNDEFs but only referencing the lowest 128-bit
11941 // lane of the inputs.
11942 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
11943 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11944 for (int j = 0; j != NumBroadcastElts; ++j) {
11945 int M = Mask[i + j];
11948 int &R = RepeatMask[j];
11949 if (0 != ((M % NumElts) / NumLaneElts))
11951 if (0 <= R && R != M)
11958 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
11959 if (!FindRepeatingBroadcastMask(RepeatMask))
11962 // Shuffle the (lowest) repeated elements in place for broadcast.
11963 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
11965 // Shuffle the actual broadcast.
11966 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
11967 for (int i = 0; i != NumElts; i += NumBroadcastElts)
11968 for (int j = 0; j != NumBroadcastElts; ++j)
11969 BroadcastMask[i + j] = j;
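// For a v8f32 shuffle with NumBroadcastElts == 2 this builds the mask
// <0,1,0,1,0,1,0,1>, broadcasting the already-shuffled lowest two elements.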
11970 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
11975 // Bail if the shuffle mask doesn't cross 128-bit lanes.
11976 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
11979 // Bail if we already have a repeated lane shuffle mask.
11980 SmallVector<int, 8> RepeatedShuffleMask;
11981 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
11984 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
11985 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
11986 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
11987 int NumSubLanes = NumLanes * SubLaneScale;
11988 int NumSubLaneElts = NumLaneElts / SubLaneScale;
11990 // Check that all the sources are coming from the same lane and see if we can
11991 // form a repeating shuffle mask (local to each sub-lane). At the same time,
11992 // determine the source sub-lane for each destination sub-lane.
11993 int TopSrcSubLane = -1;
11994 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
11995 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
11996 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
11997 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
11999 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12000 // Extract the sub-lane mask, check that it all comes from the same lane
12001 // and normalize the mask entries to come from the first lane.
12003 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12004 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12005 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12008 int Lane = (M % NumElts) / NumLaneElts;
12009 if ((0 <= SrcLane) && (SrcLane != Lane))
12012 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12013 SubLaneMask[Elt] = LocalM;
12016 // Whole sub-lane is UNDEF.
12020 // Attempt to match against the candidate repeated sub-lane masks.
12021 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12022 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12023 for (int i = 0; i != NumSubLaneElts; ++i) {
12024 if (M1[i] < 0 || M2[i] < 0)
12026 if (M1[i] != M2[i])
12032 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12033 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12036 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12037 for (int i = 0; i != NumSubLaneElts; ++i) {
12038 int M = SubLaneMask[i];
12041 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12042 "Unexpected mask element");
12043 RepeatedSubLaneMask[i] = M;
12046 // Track the topmost source sub-lane - by setting the remaining to UNDEF
12047 // we can greatly simplify shuffle matching.
12048 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12049 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12050 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12054 // Bail if we failed to find a matching repeated sub-lane mask.
12055 if (Dst2SrcSubLanes[DstSubLane] < 0)
12058 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12059 "Unexpected source lane");
12061 // Create a repeating shuffle mask for the entire vector.
12062 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12063 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12064 int Lane = SubLane / SubLaneScale;
12065 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12066 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12067 int M = RepeatedSubLaneMask[Elt];
12070 int Idx = (SubLane * NumSubLaneElts) + Elt;
12071 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12074 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12076 // Shuffle each source sub-lane to its destination.
12077 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12078 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12079 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12080 if (SrcSubLane < 0)
12082 for (int j = 0; j != NumSubLaneElts; ++j)
12083 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12086 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12090 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12091 unsigned &ShuffleImm,
12092 ArrayRef<int> Mask) {
12093 int NumElts = VT.getVectorNumElements();
12094 assert(VT.getScalarType() == MVT::f64 &&
12095 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12096 "Unexpected data type for VSHUFPD");
12098 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12099 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7..
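// As a worked example, the v4f64 mask <0, 5, 2, 7> matches the non-commuted
// pattern and yields ShuffleImm 0b1010 (bit i is simply Mask[i] % 2).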
12101 bool ShufpdMask = true;
12102 bool CommutableMask = true;
12103 for (int i = 0; i < NumElts; ++i) {
12104 if (Mask[i] == SM_SentinelUndef)
12108 int Val = (i & 6) + NumElts * (i & 1);
12109 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12110 if (Mask[i] < Val || Mask[i] > Val + 1)
12111 ShufpdMask = false;
12112 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12113 CommutableMask = false;
12114 ShuffleImm |= (Mask[i] % 2) << i;
12119 if (CommutableMask) {
12127 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12128 ArrayRef<int> Mask, SDValue V1,
12129 SDValue V2, SelectionDAG &DAG) {
12130 unsigned Immediate = 0;
12131 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12134 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12135 DAG.getConstant(Immediate, DL, MVT::i8));
12138 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12139 ArrayRef<int> Mask, SDValue V1,
12140 SDValue V2, SelectionDAG &DAG) {
12141 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12142 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12144 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12146 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12148 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12151 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12153 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12154 /// isn't available.
12155 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12156 const SmallBitVector &Zeroable,
12157 SDValue V1, SDValue V2,
12158 const X86Subtarget &Subtarget,
12159 SelectionDAG &DAG) {
12160 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12161 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12162 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12164 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12165 Zeroable, Subtarget, DAG))
12168 if (V2.isUndef()) {
12169 // Check for being able to broadcast a single element.
12170 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12171 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12174 // Use low duplicate instructions for masks that match their pattern.
12175 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12176 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12178 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12179 // Non-half-crossing single input shuffles can be lowered with an
12180 // interleaved permutation.
12181 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12182 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
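// For example, Mask <1, 0, 3, 2> yields VPERMILPMask 0b0101, i.e. swap the
// two doubles within each 128-bit lane.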
12183 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12184 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12187 // With AVX2 we have direct support for this permutation.
12188 if (Subtarget.hasAVX2())
12189 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12190 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12192 // Try to create an in-lane repeating shuffle mask and then shuffle the
12193 // results into the target lanes.
12194 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12195 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12198 // Otherwise, fall back.
12199 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12203 // Use dedicated unpack instructions for masks that match their pattern.
12205 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12208 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12209 Zeroable, Subtarget, DAG))
12212 // Check if the blend happens to exactly fit that of SHUFPD.
12214 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12217 // Try to create an in-lane repeating shuffle mask and then shuffle the
12218 // results into the target lanes.
12219 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12220 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12223 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12224 // shuffle. However, if we have AVX2 and either input is already in place,
12225 // we will be able to shuffle the other input across lanes in a single
12226 // instruction, so skip this pattern.
12227 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12228 isShuffleMaskInputInPlace(1, Mask))))
12229 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12230 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12232 // If we have VLX support, we can use VEXPAND.
12233 if (Subtarget.hasVLX())
12234 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12235 V1, V2, DAG, Subtarget))
12238 // If we have AVX2 then we always want to lower with a blend because with v4f64
12239 // we can fully permute the elements.
12240 if (Subtarget.hasAVX2())
12241 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12244 // Otherwise fall back on generic lowering.
12245 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12248 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12250 /// This routine is only called when we have AVX2 and thus a reasonable
12251 /// instruction set for v4i64 shuffling.
12252 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12253 const SmallBitVector &Zeroable,
12254 SDValue V1, SDValue V2,
12255 const X86Subtarget &Subtarget,
12256 SelectionDAG &DAG) {
12257 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12258 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12259 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12260 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12262 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12263 Zeroable, Subtarget, DAG))
12266 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12267 Zeroable, Subtarget, DAG))
12270 // Check for being able to broadcast a single element.
12271 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12272 Mask, Subtarget, DAG))
12275 if (V2.isUndef()) {
12276 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12277 // can use lower latency instructions that will operate on both lanes.
12278 SmallVector<int, 2> RepeatedMask;
12279 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12280 SmallVector<int, 4> PSHUFDMask;
12281 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
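// For example, a repeated 128-bit mask of <1, 0> scales to the v4i32 PSHUFD
// mask <2, 3, 0, 1>, swapping the two i64 halves within each 128-bit lane.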
12282 return DAG.getBitcast(
12284 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12285 DAG.getBitcast(MVT::v8i32, V1),
12286 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12289 // AVX2 provides a direct instruction for permuting a single input across
12291 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12292 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12295 // Try to use shift instructions.
12296 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12297 Zeroable, Subtarget, DAG))
12300 // If we have VLX support, we can use VALIGN or VEXPAND.
12301 if (Subtarget.hasVLX()) {
12302 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12303 Mask, Subtarget, DAG))
12306 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12307 V1, V2, DAG, Subtarget))
12311 // Try to use PALIGNR.
12312 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12313 Mask, Subtarget, DAG))
12316 // Use dedicated unpack instructions for masks that match their pattern.
12318 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12321 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12322 // shuffle. However, if we have AVX2 and either input is already in place,
12323 // we will be able to shuffle the other input even across lanes in a single
12324 // instruction, so skip this pattern.
12325 if (!isShuffleMaskInputInPlace(0, Mask) &&
12326 !isShuffleMaskInputInPlace(1, Mask))
12327 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12328 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12331 // Otherwise fall back on generic blend lowering.
12332 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12336 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12338 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12339 /// isn't available.
12340 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12341 const SmallBitVector &Zeroable,
12342 SDValue V1, SDValue V2,
12343 const X86Subtarget &Subtarget,
12344 SelectionDAG &DAG) {
12345 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12346 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12347 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12349 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12350 Zeroable, Subtarget, DAG))
12353 // Check for being able to broadcast a single element.
12354 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12355 Mask, Subtarget, DAG))
12358 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12359 // options to efficiently lower the shuffle.
12360 SmallVector<int, 4> RepeatedMask;
12361 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12362 assert(RepeatedMask.size() == 4 &&
12363 "Repeated masks must be half the mask width!");
12365 // Use even/odd duplicate instructions for masks that match their pattern.
12366 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12367 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12368 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12369 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12372 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12373 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12375 // Use dedicated unpack instructions for masks that match their pattern.
12377 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12380 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12381 // have already handled any direct blends.
12382 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12385 // Try to create an in-lane repeating shuffle mask and then shuffle the
12386 // results into the target lanes.
12387 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12388 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12391 // If we have a single input shuffle with different shuffle patterns in the
12392 // two 128-bit lanes use the variable mask to VPERMILPS.
12393 if (V2.isUndef()) {
12394 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12395 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12396 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12398 if (Subtarget.hasAVX2())
12399 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12401 // Otherwise, fall back.
12402 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12406 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12408 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12409 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12411 // If we have VLX support, we can use VEXPAND.
12412 if (Subtarget.hasVLX())
12413 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12414 V1, V2, DAG, Subtarget))
12417 // If we have AVX2 then we always want to lower with a blend because at v8 we
12418 // can fully permute the elements.
12419 if (Subtarget.hasAVX2())
12420 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12423 // Otherwise fall back on generic lowering.
12424 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12427 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12429 /// This routine is only called when we have AVX2 and thus a reasonable
12430 /// instruction set for v8i32 shuffling.
12431 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12432 const SmallBitVector &Zeroable,
12433 SDValue V1, SDValue V2,
12434 const X86Subtarget &Subtarget,
12435 SelectionDAG &DAG) {
12436 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12437 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12438 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12439 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12441 // Whenever we can lower this as a zext, that instruction is strictly faster
12442 // than any alternative. It also allows us to fold memory operands into the
12443 // shuffle in many cases.
12444 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12445 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12448 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12449 Zeroable, Subtarget, DAG))
12452 // Check for being able to broadcast a single element.
12453 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12454 Mask, Subtarget, DAG))
12457 // If the shuffle mask is repeated in each 128-bit lane we can use more
12458 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
12460 SmallVector<int, 4> RepeatedMask;
12461 bool Is128BitLaneRepeatedShuffle =
12462 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12463 if (Is128BitLaneRepeatedShuffle) {
12464 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12466 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12467 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12469 // Use dedicated unpack instructions for masks that match their pattern.
12471 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
12475 // Try to use shift instructions.
12476 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
12477 Zeroable, Subtarget, DAG))
12480 // If we have VLX support, we can use VALIGN or EXPAND.
12481 if (Subtarget.hasVLX()) {
12482 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
12483 Mask, Subtarget, DAG))
12486 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
12487 V1, V2, DAG, Subtarget))
12491 // Try to use byte rotation instructions.
12492 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12493 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12496 // Try to create an in-lane repeating shuffle mask and then shuffle the
12497 // results into the target lanes.
12498 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12499 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12502 // If the shuffle patterns aren't repeated but this is a single-input shuffle, directly
12503 // generate a cross-lane VPERMD instruction.
12504 if (V2.isUndef()) {
12505 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12506 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
12509 // Assume that a single SHUFPS is faster than an alternative sequence of
12510 // multiple instructions (even if the CPU has a domain penalty).
12511 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
12512 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
12513 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
12514 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
12515 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
12516 CastV1, CastV2, DAG);
12517 return DAG.getBitcast(MVT::v8i32, ShufPS);
12520 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12522 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12523 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
12526 // Otherwise fall back on generic blend lowering.
12527 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
12531 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
12533 /// This routine is only called when we have AVX2 and thus a reasonable
12534 /// instruction set for v16i16 shuffling.
12535 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12536 const SmallBitVector &Zeroable,
12537 SDValue V1, SDValue V2,
12538 const X86Subtarget &Subtarget,
12539 SelectionDAG &DAG) {
12540 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12541 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
12542 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12543 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
12545 // Whenever we can lower this as a zext, that instruction is strictly faster
12546 // than any alternative. It also allows us to fold memory operands into the
12547 // shuffle in many cases.
12548 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12549 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
12552 // Check for being able to broadcast a single element.
12553 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
12554 Mask, Subtarget, DAG))
12557 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
12558 Zeroable, Subtarget, DAG))
12561 // Use dedicated unpack instructions for masks that match their pattern.
12563 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
12566 // Try to use shift instructions.
12567 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
12568 Zeroable, Subtarget, DAG))
12571 // Try to use byte rotation instructions.
12572 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12573 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12576 // Try to create an in-lane repeating shuffle mask and then shuffle the
12577 // results into the target lanes.
12578 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12579 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12582 if (V2.isUndef()) {
12583 // There are no generalized cross-lane shuffle operations available on i16 element types.
12585 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
12586 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
12589 SmallVector<int, 8> RepeatedMask;
12590 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
12591 // As this is a single-input shuffle, the repeated mask should be
12592 // a strictly valid v8i16 mask that we can pass through to the v8i16
12593 // lowering to handle even the v16 case.
12594 return lowerV8I16GeneralSingleInputVectorShuffle(
12595 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
12599 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12600 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
12603 // AVX512BWVL can lower to VPERMW.
12604 if (Subtarget.hasBWI() && Subtarget.hasVLX())
12605 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
12607 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12609 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12610 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
12613 // Otherwise fall back on generic lowering.
12614 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
12617 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
12619 /// This routine is only called when we have AVX2 and thus a reasonable
12620 /// instruction set for v32i8 shuffling.
12621 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12622 const SmallBitVector &Zeroable,
12623 SDValue V1, SDValue V2,
12624 const X86Subtarget &Subtarget,
12625 SelectionDAG &DAG) {
12626 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12627 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
12628 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
12629 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
12631 // Whenever we can lower this as a zext, that instruction is strictly faster
12632 // than any alternative. It also allows us to fold memory operands into the
12633 // shuffle in many cases.
12634 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12635 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
12638 // Check for being able to broadcast a single element.
12639 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
12640 Mask, Subtarget, DAG))
12643 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
12644 Zeroable, Subtarget, DAG))
12647 // Use dedicated unpack instructions for masks that match their pattern.
12649 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
12652 // Try to use shift instructions.
12653 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
12654 Zeroable, Subtarget, DAG))
12657 // Try to use byte rotation instructions.
12658 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
12659 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12662 // Try to create an in-lane repeating shuffle mask and then shuffle the
12663 // results into the target lanes.
12664 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12665 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12668 // There are no generalized cross-lane shuffle operations available on i8 element types.
12670 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
12671 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
12674 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
12675 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
12678 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12680 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12681 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
12684 // Otherwise fall back on generic lowering.
12685 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
12688 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
12690 /// This routine either breaks down the specific type of a 256-bit x86 vector
12691 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
12692 /// together based on the available instructions.
12693 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12694 MVT VT, SDValue V1, SDValue V2,
12695 const SmallBitVector &Zeroable,
12696 const X86Subtarget &Subtarget,
12697 SelectionDAG &DAG) {
12698 // If we have a single input to the zero element, insert that into V1 if we
12699 // can do so cheaply.
12700 int NumElts = VT.getVectorNumElements();
12701 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
12703 if (NumV2Elements == 1 && Mask[0] >= NumElts)
12704 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
12705 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
12708 // Handle special cases where the lower or upper half is UNDEF.
12710 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
12713 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
12714 // can check for those subtargets here and avoid much of the subtarget
12715 // querying in the per-vector-type lowering routines. With AVX1 we have
12716 // essentially *zero* ability to manipulate a 256-bit vector with integer
12717 // types. Since we'll use floating point types there eventually, just
12718 // immediately cast everything to a float and operate entirely in that domain.
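// For example, a v8i32 shuffle on AVX1 is performed as a v8f32 shuffle
// (bitcast in and out), while sub-32-bit element types, which have no
// floating point equivalent, are handled by the bit operations or the
// 128-bit split below.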
12719 if (VT.isInteger() && !Subtarget.hasAVX2()) {
12720 int ElementBits = VT.getScalarSizeInBits();
12721 if (ElementBits < 32) {
12722 // No floating point type available; if we can't use the bit operations
12723 // for masking/blending then decompose into 128-bit vectors.
12725 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
12727 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
12729 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12732 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
12733 VT.getVectorNumElements());
12734 V1 = DAG.getBitcast(FpVT, V1);
12735 V2 = DAG.getBitcast(FpVT, V2);
12736 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
12739 switch (VT.SimpleTy) {
12741 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12743 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12745 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12747 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12749 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12751 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
12754 llvm_unreachable("Not a valid 256-bit x86 vector type!");
12758 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
12759 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
12760 ArrayRef<int> Mask, SDValue V1,
12761 SDValue V2, SelectionDAG &DAG) {
12762 assert(VT.getScalarSizeInBits() == 64 &&
12763 "Unexpected element type size for 128bit shuffle.");
12765 // Handling a 256-bit vector would require VLX, and most probably
12766 // lowerV2X128VectorShuffle() is a better solution for that case.
12767 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
12769 SmallVector<int, 4> WidenedMask;
12770 if (!canWidenShuffleElements(Mask, WidenedMask))
12773 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
12775 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
12776 {0, 1, 2, 3, 0, 1, 2, 3});
12777 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
12778 {0, 1, 2, 3, 8, 9, 10, 11})) {
12779 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
12780 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12781 DAG.getIntPtrConstant(0, DL));
12782 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12783 OnlyUsesV1 ? V1 : V2,
12784 DAG.getIntPtrConstant(0, DL));
12785 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12788 assert(WidenedMask.size() == 4);
12790 // See if this is an insertion of the lower 128 bits of V2 into V1.
12791 bool IsInsert = true;
12793 for (int i = 0; i < 4; ++i) {
12794 assert(WidenedMask[i] >= -1);
12795 if (WidenedMask[i] < 0)
12798 // Make sure all V1 subvectors are in place.
12799 if (WidenedMask[i] < 4) {
12800 if (WidenedMask[i] != i) {
12805 // Make sure we only have a single V2 index and it's the lowest 128 bits.
12806 if (V2Index >= 0 || WidenedMask[i] != 4) {
12813 if (IsInsert && V2Index >= 0) {
12814 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
12815 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
12816 DAG.getIntPtrConstant(0, DL));
12817 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
12820 // Try to lower to vshuf64x2/vshuf32x4.
12821 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
12822 unsigned PermMask = 0;
12823 // Ensure elements came from the same Op.
12824 for (int i = 0; i < 4; ++i) {
12825 assert(WidenedMask[i] >= -1);
12826 if (WidenedMask[i] < 0)
12829 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
12830 unsigned OpIndex = i / 2;
12831 if (Ops[OpIndex].isUndef())
12833 else if (Ops[OpIndex] != Op)
12836 // Convert the 128-bit shuffle mask selection values into 128-bit selection
12837 // bits defined by a vshuf64x2 instruction's immediate control byte.
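// For example, swapping the two 256-bit halves of a 512-bit vector has the
// widened mask <2, 3, 0, 1>, which encodes to
// (2 << 0) | (3 << 2) | (0 << 4) | (1 << 6) = 0x4E.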
12838 PermMask |= (WidenedMask[i] % 4) << (i * 2);
12841 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
12842 DAG.getConstant(PermMask, DL, MVT::i8));
12845 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
12846 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12847 const SmallBitVector &Zeroable,
12848 SDValue V1, SDValue V2,
12849 const X86Subtarget &Subtarget,
12850 SelectionDAG &DAG) {
12851 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12852 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
12853 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12855 if (V2.isUndef()) {
12856 // Use low duplicate instructions for masks that match their pattern.
12857 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
12858 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
12860 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
12861 // Non-half-crossing single input shuffles can be lowered with an
12862 // interleaved permutation.
12863 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12864 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
12865 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
12866 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
12867 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
12868 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12871 SmallVector<int, 4> RepeatedMask;
12872 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
12873 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
12874 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12877 if (SDValue Shuf128 =
12878 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
12881 if (SDValue Unpck =
12882 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
12885 // Check if the blend happens to exactly fit that of SHUFPD.
12887 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
12890 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
12891 V2, DAG, Subtarget))
12894 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
12897 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
12898 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12899 const SmallBitVector &Zeroable,
12900 SDValue V1, SDValue V2,
12901 const X86Subtarget &Subtarget,
12902 SelectionDAG &DAG) {
12903 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12904 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
12905 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
12907 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12908 // options to efficiently lower the shuffle.
12909 SmallVector<int, 4> RepeatedMask;
12910 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
12911 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12913 // Use even/odd duplicate instructions for masks that match their pattern.
12914 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12915 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
12916 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12917 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
12920 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
12921 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12923 // Use dedicated unpack instructions for masks that match their pattern.
12924 if (SDValue Unpck =
12925 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
12928 // Otherwise, fall back to a SHUFPS sequence.
12929 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
12931 // If we have AVX512F support, we can use VEXPAND.
12932 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
12933 V1, V2, DAG, Subtarget))
12936 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
12939 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
12940 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12941 const SmallBitVector &Zeroable,
12942 SDValue V1, SDValue V2,
12943 const X86Subtarget &Subtarget,
12944 SelectionDAG &DAG) {
12945 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12946 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
12947 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12949 if (SDValue Shuf128 =
12950 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
12953 if (V2.isUndef()) {
12954 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12955 // can use lower latency instructions that will operate on all four 128-bit lanes.
12957 SmallVector<int, 2> Repeated128Mask;
12958 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
12959 SmallVector<int, 4> PSHUFDMask;
12960 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
12961 return DAG.getBitcast(
12963 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
12964 DAG.getBitcast(MVT::v16i32, V1),
12965 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12968 SmallVector<int, 4> Repeated256Mask;
12969 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
12970 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
12971 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
12974 // Try to use shift instructions.
12975 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
12976 Zeroable, Subtarget, DAG))
12979 // Try to use VALIGN.
12980 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
12981 Mask, Subtarget, DAG))
12984 // Try to use PALIGNR.
12985 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
12986 Mask, Subtarget, DAG))
12989 if (SDValue Unpck =
12990 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
12992 // If we have AVX512F support, we can use VEXPAND.
12993 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
12994 V2, DAG, Subtarget))
12997 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13000 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13001 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13002 const SmallBitVector &Zeroable,
13003 SDValue V1, SDValue V2,
13004 const X86Subtarget &Subtarget,
13005 SelectionDAG &DAG) {
13006 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13007 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13008 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13010 // Whenever we can lower this as a zext, that instruction is strictly faster
13011 // than any alternative. It also allows us to fold memory operands into the
13012 // shuffle in many cases.
13013 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13014 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13017 // If the shuffle mask is repeated in each 128-bit lane we can use more
13018 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
13020 SmallVector<int, 4> RepeatedMask;
13021 bool Is128BitLaneRepeatedShuffle =
13022 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13023 if (Is128BitLaneRepeatedShuffle) {
13024 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13026 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13027 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13029 // Use dedicated unpack instructions for masks that match their pattern.
13031 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13035 // Try to use shift instructions.
13036 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13037 Zeroable, Subtarget, DAG))
13040 // Try to use VALIGN.
13041 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13042 Mask, Subtarget, DAG))
13045 // Try to use byte rotation instructions.
13046 if (Subtarget.hasBWI())
13047 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13048 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13051 // Assume that a single SHUFPS is faster than using a permv shuffle.
13052 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13053 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13054 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13055 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13056 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13057 CastV1, CastV2, DAG);
13058 return DAG.getBitcast(MVT::v16i32, ShufPS);
13060 // If we have AVX512F support, we can use VEXPAND.
13061 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13062 V1, V2, DAG, Subtarget))
13065 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13068 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13069 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13070 const SmallBitVector &Zeroable,
13071 SDValue V1, SDValue V2,
13072 const X86Subtarget &Subtarget,
13073 SelectionDAG &DAG) {
13074 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13075 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13076 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13077 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13079 // Whenever we can lower this as a zext, that instruction is strictly faster
13080 // than any alternative. It also allows us to fold memory operands into the
13081 // shuffle in many cases.
13082 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13083 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13086 // Use dedicated unpack instructions for masks that match their pattern.
13088 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13091 // Try to use shift instructions.
13092 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13093 Zeroable, Subtarget, DAG))
13096 // Try to use byte rotation instructions.
13097 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13098 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13101 if (V2.isUndef()) {
13102 SmallVector<int, 8> RepeatedMask;
13103 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13104 // As this is a single-input shuffle, the repeated mask should be
13105 // a strictly valid v8i16 mask that we can pass through to the v8i16
13106 // lowering to handle even the v32 case.
13107 return lowerV8I16GeneralSingleInputVectorShuffle(
13108 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13112 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13115 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13116 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13117 const SmallBitVector &Zeroable,
13118 SDValue V1, SDValue V2,
13119 const X86Subtarget &Subtarget,
13120 SelectionDAG &DAG) {
13121 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13122 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13123 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13124 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13126 // Whenever we can lower this as a zext, that instruction is strictly faster
13127 // than any alternative. It also allows us to fold memory operands into the
13128 // shuffle in many cases.
13129 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13130 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13133 // Use dedicated unpack instructions for masks that match their pattern.
13135 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13138 // Try to use shift instructions.
13139 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13140 Zeroable, Subtarget, DAG))
13143 // Try to use byte rotation instructions.
13144 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13145 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13148 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13149 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13152 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13153 if (Subtarget.hasVBMI())
13154 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13156 // Try to create an in-lane repeating shuffle mask and then shuffle the
13157 // results into the target lanes.
13158 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13159 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13162 // FIXME: Implement direct support for this type!
13163 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13166 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13168 /// This routine either breaks down the specific type of a 512-bit x86 vector
13169 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13170 /// together based on the available instructions.
13171 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13172 MVT VT, SDValue V1, SDValue V2,
13173 const SmallBitVector &Zeroable,
13174 const X86Subtarget &Subtarget,
13175 SelectionDAG &DAG) {
13176 assert(Subtarget.hasAVX512() &&
13177 "Cannot lower 512-bit vectors w/ basic ISA!");
13179 // If we have a single input to the zero element, insert that into V1 if we
13180 // can do so cheaply.
13181 int NumElts = Mask.size();
13182 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13184 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13185 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13186 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13189 // Check for being able to broadcast a single element.
13190 if (SDValue Broadcast =
13191 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13194 // Dispatch to each element type for lowering. If we don't have support for
13195 // specific element type shuffles at 512 bits, immediately split them and
13196 // lower them. Each lowering routine of a given type is allowed to assume that
13197 // the requisite ISA extensions for that element type are available.
13198 switch (VT.SimpleTy) {
13200 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13202 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13204 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13206 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13208 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13210 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13213 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13217 // Lower vXi1 vector shuffles.
13218 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13219 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13220 // vector, shuffle, and then truncate it back.
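// For example, a v8i1 shuffle is sign-extended to v8i64 (a 512-bit type,
// where KNL has the widest shuffle support), shuffled there, and then
// truncated back down to v8i1.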
13221 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13222 MVT VT, SDValue V1, SDValue V2,
13223 const X86Subtarget &Subtarget,
13224 SelectionDAG &DAG) {
13225 assert(Subtarget.hasAVX512() &&
13226 "Cannot lower 512-bit vectors w/o basic ISA!");
13228 switch (VT.SimpleTy) {
13230 llvm_unreachable("Expected a vector of i1 elements");
13232 ExtVT = MVT::v2i64;
13235 ExtVT = MVT::v4i32;
13238 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13241 ExtVT = MVT::v16i32;
13244 ExtVT = MVT::v32i16;
13247 ExtVT = MVT::v64i8;
13251 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13252 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13253 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13254 V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13256 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13259 V2 = DAG.getUNDEF(ExtVT);
13260 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13261 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13262 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13263 V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
13265 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13267 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13268 // i1 was sign-extended, so we can use X86ISD::CVT2MASK.
13269 int NumElems = VT.getVectorNumElements();
13270 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13271 (Subtarget.hasDQI() && (NumElems < 32)))
13272 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13274 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13277 /// Helper function that returns true if the shuffle mask should be
13278 /// commuted to improve canonicalization.
13279 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13280 int NumElements = Mask.size();
13282 int NumV1Elements = 0, NumV2Elements = 0;
13286 else if (M < NumElements)
13291 // Commute the shuffle as needed such that more elements come from V1 than
13292 // V2. This allows us to match the shuffle pattern strictly on how many
13293 // elements come from V1 without handling the symmetric cases.
13294 if (NumV2Elements > NumV1Elements)
13297 assert(NumV1Elements > 0 && "No V1 indices");
13299 if (NumV2Elements == 0)
13302 // When the number of V1 and V2 elements are the same, try to minimize the
13303 // number of uses of V2 in the low half of the vector. When that is tied,
13304 // ensure that the sum of indices for V1 is equal to or lower than the sum
13305 // of indices for V2. When those are equal, try to ensure that the number of odd
13306 // indices for V1 is lower than the number of odd indices for V2.
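// For example, the v4 mask <4, 1, 6, 3> takes two elements from each input
// and one from each in the low half, but the V2 elements sit at positions
// 0 and 2 (sum 2) while the V1 elements sit at positions 1 and 3 (sum 4),
// so the shuffle is commuted.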
13307 if (NumV1Elements == NumV2Elements) {
13308 int LowV1Elements = 0, LowV2Elements = 0;
13309 for (int M : Mask.slice(0, NumElements / 2))
13310 if (M >= NumElements)
13314 if (LowV2Elements > LowV1Elements)
13316 if (LowV2Elements == LowV1Elements) {
13317 int SumV1Indices = 0, SumV2Indices = 0;
13318 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13319 if (Mask[i] >= NumElements)
13321 else if (Mask[i] >= 0)
13323 if (SumV2Indices < SumV1Indices)
13325 if (SumV2Indices == SumV1Indices) {
13326 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13327 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13328 if (Mask[i] >= NumElements)
13329 NumV2OddIndices += i % 2;
13330 else if (Mask[i] >= 0)
13331 NumV1OddIndices += i % 2;
13332 if (NumV2OddIndices < NumV1OddIndices)
13341 /// \brief Top-level lowering for x86 vector shuffles.
13343 /// This handles decomposition, canonicalization, and lowering of all x86
13344 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13345 /// above in helper routines. The canonicalization attempts to widen shuffles
13346 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13347 /// s.t. only one of the two inputs needs to be tested, etc.
13348 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13349 SelectionDAG &DAG) {
13350 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13351 ArrayRef<int> Mask = SVOp->getMask();
13352 SDValue V1 = Op.getOperand(0);
13353 SDValue V2 = Op.getOperand(1);
13354 MVT VT = Op.getSimpleValueType();
13355 int NumElements = VT.getVectorNumElements();
13357 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13359 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13360 "Can't lower MMX shuffles");
13362 bool V1IsUndef = V1.isUndef();
13363 bool V2IsUndef = V2.isUndef();
13364 if (V1IsUndef && V2IsUndef)
13365 return DAG.getUNDEF(VT);
13367 // When we create a shuffle node we put the UNDEF node in the second operand,
13368 // but in some cases the first operand may be transformed to UNDEF.
13369 // In this case we should just commute the node.
13371 return DAG.getCommutedVectorShuffle(*SVOp);
13373 // Check for non-undef masks pointing at an undef vector and make the masks
13374 // undef as well. This makes it easier to match the shuffle based solely on the mask.
13378 if (M >= NumElements) {
13379 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13380 for (int &M : NewMask)
13381 if (M >= NumElements)
13383 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13386 // Check for illegal shuffle mask element index values.
13387 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13388 assert(llvm::all_of(Mask,
13389 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13390 "Out of bounds shuffle index");
13392 // We actually see shuffles that are entirely re-arrangements of a set of
13393 // zero inputs. This mostly happens while decomposing complex shuffles into
13394 // simple ones. Directly lower these as a buildvector of zeros.
13395 SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13396 if (Zeroable.all())
13397 return getZeroVector(VT, Subtarget, DAG, DL);
13399 // Try to collapse shuffles into using a vector type with fewer elements but
13400 // wider element types. We cap this to not form integers or floating point
13401 // elements wider than 64 bits, but it might be interesting to form i128
13402 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
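// For example, a v4i32 mask of <0, 1, 6, 7> widens to the v2i64 mask
// <0, 3>, halving the number of elements the lowering has to reason about.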
13403 SmallVector<int, 16> WidenedMask;
13404 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13405 canWidenShuffleElements(Mask, WidenedMask)) {
13406 MVT NewEltVT = VT.isFloatingPoint()
13407 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13408 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13409 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13410 // Make sure that the new vector type is legal. For example, v2f64 isn't
13412 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13413 V1 = DAG.getBitcast(NewVT, V1);
13414 V2 = DAG.getBitcast(NewVT, V2);
13415 return DAG.getBitcast(
13416 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13420 // Commute the shuffle if it will improve canonicalization.
13421 if (canonicalizeShuffleMaskWithCommute(Mask))
13422 return DAG.getCommutedVectorShuffle(*SVOp);
13424 // For each vector width, delegate to a specialized lowering routine.
13425 if (VT.is128BitVector())
13426 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13429 if (VT.is256BitVector())
13430 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13433 if (VT.is512BitVector())
13434 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13438 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13440 llvm_unreachable("Unimplemented!");
13443 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13444 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
13445 const X86Subtarget &Subtarget,
13446 SelectionDAG &DAG) {
13447 SDValue Cond = Op.getOperand(0);
13448 SDValue LHS = Op.getOperand(1);
13449 SDValue RHS = Op.getOperand(2);
13451 MVT VT = Op.getSimpleValueType();
13453 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
13455 auto *CondBV = cast<BuildVectorSDNode>(Cond);
13457 // Only non-legal VSELECTs reach this lowering; convert those into generic
13458 // shuffles and re-use the shuffle lowering path for blends.
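// For example, a v4i32 VSELECT with the constant condition <-1, 0, -1, 0>
// becomes the shuffle mask <0, 5, 2, 7>: true lanes take element i of LHS
// and false lanes take element i of RHS (offset by Size).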
13459 SmallVector<int, 32> Mask;
13460 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
13461 SDValue CondElt = CondBV->getOperand(i);
13463 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
13466 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
13469 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
13470 // A vselect where all conditions and data are constants can be optimized into
13471 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
13472 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
13473 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
13474 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
13477 // Try to lower this to a blend-style vector shuffle. This can handle all
13478 // constant condition cases.
13479 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
13482 // Variable blends are only legal from SSE4.1 onward.
13483 if (!Subtarget.hasSSE41())
13486 // Only some types will be legal on some subtargets. If we can emit a legal
13487 // VSELECT-matching blend, return Op; but if we need to expand, return a null value.
13489 switch (Op.getSimpleValueType().SimpleTy) {
13491 // Most of the vector types have blends past SSE4.1.
13495 // The byte blends for AVX vectors were introduced only in AVX2.
13496 if (Subtarget.hasAVX2())
13503 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
13504 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13507 // FIXME: We should custom lower this by fixing the condition and using i8
13513 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
13514 MVT VT = Op.getSimpleValueType();
13517 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
13520 if (VT.getSizeInBits() == 8) {
13521 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
13522 Op.getOperand(0), Op.getOperand(1));
13523 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13524 DAG.getValueType(VT));
13525 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13528 if (VT == MVT::f32) {
13529 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
13530 // the result back to an FR32 register. It's only worth matching if the
13531 // result has a single use which is a store or a bitcast to i32. And in
13532 // the case of a store, it's not worth it if the index is a constant 0,
13533 // because a MOVSSmr can be used instead, which is smaller and faster.
13534 if (!Op.hasOneUse())
13536 SDNode *User = *Op.getNode()->use_begin();
13537 if ((User->getOpcode() != ISD::STORE ||
13538 isNullConstant(Op.getOperand(1))) &&
13539 (User->getOpcode() != ISD::BITCAST ||
13540 User->getValueType(0) != MVT::i32))
13542 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13543 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
13545 return DAG.getBitcast(MVT::f32, Extract);
13548 if (VT == MVT::i32 || VT == MVT::i64) {
13549 // ExtractPS/pextrq work with a constant index.
13550 if (isa<ConstantSDNode>(Op.getOperand(1)))
13557 /// Extract one bit from mask vector, like v16i1 or v8i1.
13558 /// AVX-512 feature.
13560 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
13561 SDValue Vec = Op.getOperand(0);
13563 MVT VecVT = Vec.getSimpleValueType();
13564 SDValue Idx = Op.getOperand(1);
13565 MVT EltVT = Op.getSimpleValueType();
13567 assert((EltVT == MVT::i1) && "Unexpected operands in ExtractBitFromMaskVector");
13568 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
13569 "Unexpected vector type in ExtractBitFromMaskVector");
13571 // A variable index can't be handled in mask registers,
13572 // so extend the vector to VR512.
13573 if (!isa<ConstantSDNode>(Idx)) {
13574 MVT ExtVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13575 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
13576 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
13577 ExtVT.getVectorElementType(), Ext, Idx);
13578 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
13581 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13582 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
13583 (VecVT.getVectorNumElements() < 8)) {
13584 // Use the kshiftlw/kshiftrw instructions.
13585 VecVT = MVT::v16i1;
13586 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
13587 DAG.getUNDEF(VecVT),
13589 DAG.getIntPtrConstant(0, dl));
13591 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
13592 if (MaxShift - IdxVal)
13593 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13594 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
13595 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13596 DAG.getConstant(MaxShift, dl, MVT::i8));
13597 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i1, Vec,
13598 DAG.getIntPtrConstant(0, dl));
13602 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
13603 SelectionDAG &DAG) const {
13605 SDValue Vec = Op.getOperand(0);
13606 MVT VecVT = Vec.getSimpleValueType();
13607 SDValue Idx = Op.getOperand(1);
13609 if (Op.getSimpleValueType() == MVT::i1)
13610 return ExtractBitFromMaskVector(Op, DAG);
13612 if (!isa<ConstantSDNode>(Idx)) {
13613 if (VecVT.is512BitVector() ||
13614 (VecVT.is256BitVector() && Subtarget.hasInt256() &&
13615 VecVT.getScalarSizeInBits() == 32)) {
13618 MVT::getIntegerVT(VecVT.getScalarSizeInBits());
13619 MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
13620 MaskEltVT.getSizeInBits());
13622 Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
13623 auto PtrVT = getPointerTy(DAG.getDataLayout());
13624 SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
13625 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
13626 DAG.getConstant(0, dl, PtrVT));
13627 SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
13628 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
13629 DAG.getConstant(0, dl, PtrVT));
13634 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13636 // If this is a 256-bit vector result, first extract the 128-bit vector and
13637 // then extract the element from the 128-bit vector.
13638 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
13639 // Get the 128-bit vector.
13640 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
13641 MVT EltVT = VecVT.getVectorElementType();
13643 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
13644 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
13646 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
13647 // this can be done with a mask.
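// For example, extracting element 5 of a v8i32 pulls out the upper 128-bit
// chunk (4 elements per chunk) and rewrites the index as 5 & 3 == 1.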
13648 IdxVal &= ElemsPerChunk - 1;
13649 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
13650 DAG.getConstant(IdxVal, dl, MVT::i32));
13653 assert(VecVT.is128BitVector() && "Unexpected vector length");
13655 MVT VT = Op.getSimpleValueType();
13657 if (VT.getSizeInBits() == 16) {
13658 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
13659 // we're going to zero extend the register or fold the store (SSE41 only).
13660 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
13661 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
13662 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
13663 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
13664 DAG.getBitcast(MVT::v4i32, Vec), Idx));
13666 // Transform it so it matches pextrw, which produces a 32-bit result.
13667 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
13668 Op.getOperand(0), Op.getOperand(1));
13669 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
13670 DAG.getValueType(VT));
13671 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
13674 if (Subtarget.hasSSE41())
13675 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
13678 // TODO: handle v16i8.
13680 if (VT.getSizeInBits() == 32) {
13684 // SHUFPS the element to the lowest double word, then movss.
13685 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
13686 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13687 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13688 DAG.getIntPtrConstant(0, dl));
13691 if (VT.getSizeInBits() == 64) {
13692 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
13693 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
13694 // to match extract_elt for f64.
13698 // UNPCKHPD the element to the lowest double word, then movsd.
13699 // Note if the lower 64 bits of the result of the UNPCKHPD are then stored
13700 // to an f64mem, the whole operation is folded into a single MOVHPDmr.
13701 int Mask[2] = { 1, -1 };
13702 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
13703 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
13704 DAG.getIntPtrConstant(0, dl));
13710 /// Insert one bit to mask vector, like v16i1 or v8i1.
13711 /// AVX-512 feature.
13713 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
13715 SDValue Vec = Op.getOperand(0);
13716 SDValue Elt = Op.getOperand(1);
13717 SDValue Idx = Op.getOperand(2);
13718 MVT VecVT = Vec.getSimpleValueType();
13720 if (!isa<ConstantSDNode>(Idx)) {
13721 // Non-constant index. Extend source and destination,
13722 // insert element and then truncate the result.
13723 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
13724 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
13725 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
13726 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
13727 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
13728 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
13731 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13732 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
13733 unsigned NumElems = VecVT.getVectorNumElements();
13735 if (Vec.isUndef()) {
13737 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13738 DAG.getConstant(IdxVal, dl, MVT::i8));
13742 // Insertion of one bit into the first or last position
13743 // can be done with two SHIFTs + OR.
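// For example, with IdxVal == 0 the shift-right/shift-left pair below
// clears bit 0 of Vec while preserving the remaining bits, and the OR
// merges in EltInVec, which already holds the new bit at position 0.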
13744 if (IdxVal == 0) {
13745 // EltInVec already at correct index and other bits are 0.
13746 // Clean the first bit in source vector.
13747 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13748 DAG.getConstant(1, dl, MVT::i8));
13749 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13750 DAG.getConstant(1, dl, MVT::i8));
13752 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13754 if (IdxVal == NumElems - 1) {
13755 // Move the bit to the last position inside the vector.
13756 EltInVec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, EltInVec,
13757 DAG.getConstant(IdxVal, dl, MVT::i8));
13758 // Clean the last bit in the source vector.
13759 Vec = DAG.getNode(X86ISD::VSHLI, dl, VecVT, Vec,
13760 DAG.getConstant(1, dl, MVT::i8));
13761 Vec = DAG.getNode(X86ISD::VSRLI, dl, VecVT, Vec,
13762 DAG.getConstant(1, dl, MVT::i8));
13764 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
13767 // Use shuffle to insert element.
13768 SmallVector<int, 64> MaskVec(NumElems);
13769 for (unsigned i = 0; i != NumElems; ++i)
13770 MaskVec[i] = (i == IdxVal) ? NumElems : i;
13772 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
13775 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
13776 SelectionDAG &DAG) const {
13777 MVT VT = Op.getSimpleValueType();
13778 MVT EltVT = VT.getVectorElementType();
13779 unsigned NumElts = VT.getVectorNumElements();
13781 if (EltVT == MVT::i1)
13782 return InsertBitToMaskVector(Op, DAG);
13785 SDValue N0 = Op.getOperand(0);
13786 SDValue N1 = Op.getOperand(1);
13787 SDValue N2 = Op.getOperand(2);
13788 if (!isa<ConstantSDNode>(N2))
13790 auto *N2C = cast<ConstantSDNode>(N2);
13791 unsigned IdxVal = N2C->getZExtValue();
13793 // If we are clearing out an element, we do this more efficiently with a
13794 // blend shuffle than a costly integer insertion.
13795 // TODO: would other rematerializable values (e.g. allbits) benefit as well?
13796 // TODO: pre-SSE41 targets will tend to use bit masking - this could still
13797 // be beneficial if we are inserting several zeros and can combine the masks.
13798 if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
13799 SmallVector<int, 8> ClearMask;
13800 for (unsigned i = 0; i != NumElts; ++i)
13801 ClearMask.push_back(i == IdxVal ? i + NumElts : i);
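// e.g. for v4i32 with IdxVal == 2 the mask is <0,1,6,3>; index 6 pulls the
// zero lane from ZeroVector.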
13802 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
13803 return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
13806 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
13807 // into that, and then insert the subvector back into the result.
13808 if (VT.is256BitVector() || VT.is512BitVector()) {
13809 // With a 256-bit vector, we can insert into the zero element efficiently
13810 // using a blend if we have AVX or AVX2 and the right data type.
13811 if (VT.is256BitVector() && IdxVal == 0) {
13812 // TODO: It is worthwhile to cast integer to floating point and back
13813 // and incur a domain crossing penalty if that's what we'll end up
13814 // doing anyway after extracting to a 128-bit vector.
13815 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
13816 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
13817 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
13818 N2 = DAG.getIntPtrConstant(1, dl);
13819 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
13823 // Get the desired 128-bit vector chunk.
13824 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
13826 // Insert the element into the desired chunk.
13827 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
13828 assert(isPowerOf2_32(NumEltsIn128));
13829 // Since NumEltsIn128 is a power of 2 we can use a mask instead of modulo.
13830 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
13832 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
13833 DAG.getConstant(IdxIn128, dl, MVT::i32));
13835 // Insert the changed part back into the bigger vector
13836 return insert128BitVector(N0, V, IdxVal, DAG, dl);
13838 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
13840 if (Subtarget.hasSSE41()) {
13841 if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
13843 if (VT == MVT::v8i16) {
13844 Opc = X86ISD::PINSRW;
13846 assert(VT == MVT::v16i8);
13847 Opc = X86ISD::PINSRB;
13850 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second argument.
13852 if (N1.getValueType() != MVT::i32)
13853 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13854 if (N2.getValueType() != MVT::i32)
13855 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13856 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
13859 if (EltVT == MVT::f32) {
13860 // Bits [7:6] of the constant are the source select. This will always be
13861 // zero here. The DAG Combiner may combine an extract_elt index into
13862 // these bits. For example (insert (extract, 3), 2) could be matched by
13863 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
13864 // Bits [5:4] of the constant are the destination select. This is the
13865 // value of the incoming immediate.
13866 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
13867 // combine either bitwise AND or insert of float 0.0 to set these bits.
13869 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
13870 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
13871 // If this is an insertion of 32-bits into the low 32-bits of
13872 // a vector, we prefer to generate a blend with immediate rather
13873 // than an insertps. Blends are simpler operations in hardware and so
13874 // will always have equal or better performance than insertps.
13875 // But if optimizing for size and there's a load folding opportunity,
13876 // generate insertps, because blendps does not have a 32-bit memory operand.
13878 N2 = DAG.getIntPtrConstant(1, dl);
13879 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13880 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
13882 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
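// e.g. IdxVal == 2 gives the immediate 0x20: source lane 0, destination
// lane 2, empty zero mask.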
13883 // Create this as a scalar to vector.
13884 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
13885 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
13888 if (EltVT == MVT::i32 || EltVT == MVT::i64) {
13889 // PINSR* works with constant index.
13894 if (EltVT == MVT::i8)
13897 if (EltVT.getSizeInBits() == 16) {
13898 // Transform it so it matches pinsrw, which expects a 16-bit value in a GR32
13899 // as its second argument.
13900 if (N1.getValueType() != MVT::i32)
13901 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
13902 if (N2.getValueType() != MVT::i32)
13903 N2 = DAG.getIntPtrConstant(IdxVal, dl);
13904 return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
13909 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) {
13911 MVT OpVT = Op.getSimpleValueType();
13913 // If this is a 256-bit vector result, first insert into a 128-bit
13914 // vector and then insert into the 256-bit vector.
13915 if (!OpVT.is128BitVector()) {
13916 // Insert into a 128-bit vector.
13917 unsigned SizeFactor = OpVT.getSizeInBits()/128;
13918 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
13919 OpVT.getVectorNumElements() / SizeFactor);
13921 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
13923 // Insert the 128-bit vector.
13924 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
13927 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
13928 assert(OpVT.is128BitVector() && "Expected an SSE type!");
13929 return DAG.getBitcast(
13930 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
13933 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
13934 // a simple subregister reference or explicit instructions to grab
13935 // upper bits of a vector.
13936 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13937 SelectionDAG &DAG) {
13938 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
13941 SDValue In = Op.getOperand(0);
13942 SDValue Idx = Op.getOperand(1);
13943 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13944 MVT ResVT = Op.getSimpleValueType();
13946 assert((In.getSimpleValueType().is256BitVector() ||
13947 In.getSimpleValueType().is512BitVector()) &&
13948 "Can only extract from 256-bit or 512-bit vectors");
13950 if (ResVT.is128BitVector())
13951 return extract128BitVector(In, IdxVal, DAG, dl);
13952 if (ResVT.is256BitVector())
13953 return extract256BitVector(In, IdxVal, DAG, dl);
13955 llvm_unreachable("Unimplemented!");
13958 static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
13959 for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
13960 if (llvm::all_of(ValidUsers,
13961 [&I](SDValue V) { return V.getNode() != *I; }))
13966 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
13967 // simple superregister reference or explicit instructions to insert
13968 // the upper bits of a vector.
13969 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
13970 SelectionDAG &DAG) {
13971 assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
13974 SDValue Vec = Op.getOperand(0);
13975 SDValue SubVec = Op.getOperand(1);
13976 SDValue Idx = Op.getOperand(2);
13978 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
13979 MVT OpVT = Op.getSimpleValueType();
13980 MVT SubVecVT = SubVec.getSimpleValueType();
13982 if (OpVT.getVectorElementType() == MVT::i1)
13983 return insert1BitVector(Op, DAG, Subtarget);
13985 assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
13986 "Can only insert into 256-bit or 512-bit vectors");
13988 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte load:
13990 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13991 // (load16 addr + 16), Elts/2)
13994 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
13995 // (load32 addr + 32), Elts/2)
13997 // or a 16-byte or 32-byte broadcast:
13998 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
13999 // (load16 addr), Elts/2)
14000 // --> X86SubVBroadcast(load16 addr)
14002 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
14003 // (load32 addr), Elts/2)
14004 // --> X86SubVBroadcast(load32 addr)
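// The consecutive-load case is validated below with allowsMemoryAccess and
// EltsFromConsecutiveLoads; the broadcast case requires that the repeated
// load has no users outside this insert chain.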
14005 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
14006 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
14007 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
14008 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
14009 if (Idx2 && Idx2->getZExtValue() == 0) {
14010 SDValue SubVec2 = Vec.getOperand(1);
14011 // If needed, look through bitcasts to get to the load.
14012 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
14014 unsigned Alignment = FirstLd->getAlignment();
14015 unsigned AS = FirstLd->getAddressSpace();
14016 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
14017 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
14018 OpVT, AS, Alignment, &Fast) && Fast) {
14019 SDValue Ops[] = {SubVec2, SubVec};
14020 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
14024 // If lower/upper loads are the same and the only users of the load, then
14025 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
14026 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
14027 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
14028 areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
14029 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
14032 // If this is a subv_broadcast inserted into both halves, use a larger subv_broadcast.
14034 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
14035 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
14036 SubVec.getOperand(0));
14041 if (SubVecVT.is128BitVector())
14042 return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
14044 if (SubVecVT.is256BitVector())
14045 return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
14047 llvm_unreachable("Unimplemented!");
14050 // Returns the appropriate wrapper opcode for a global reference.
14051 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14052 // References to absolute symbols are never PC-relative.
14053 if (GV && GV->isAbsoluteSymbolRef())
14054 return X86ISD::Wrapper;
14056 CodeModel::Model M = getTargetMachine().getCodeModel();
14057 if (Subtarget.isPICStyleRIPRel() &&
14058 (M == CodeModel::Small || M == CodeModel::Kernel))
14059 return X86ISD::WrapperRIP;
14061 return X86ISD::Wrapper;
14064 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14065 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14066 // one of the above-mentioned nodes. It has to be wrapped because otherwise
14067 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14068 // be used to form an addressing mode. These wrapped nodes will be selected into their corresponding machine instructions.
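// e.g. LowerConstantPool below emits (X86ISD::Wrapper (TargetConstantPool C)),
// plus an ADD of the PIC base register when the reference is PIC-relative.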
14071 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14072 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14074 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14075 // global base reg.
14076 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14078 auto PtrVT = getPointerTy(DAG.getDataLayout());
14079 SDValue Result = DAG.getTargetConstantPool(
14080 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14082 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14083 // With PIC, the address is actually $g + Offset.
14086 DAG.getNode(ISD::ADD, DL, PtrVT,
14087 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14093 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14094 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14096 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14097 // global base reg.
14098 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14100 auto PtrVT = getPointerTy(DAG.getDataLayout());
14101 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14103 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14105 // With PIC, the address is actually $g + Offset.
14108 DAG.getNode(ISD::ADD, DL, PtrVT,
14109 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14115 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14116 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14118 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14119 // global base reg.
14120 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14121 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14123 auto PtrVT = getPointerTy(DAG.getDataLayout());
14124 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14127 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14129 // With PIC, the address is actually $g + Offset.
14130 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14132 DAG.getNode(ISD::ADD, DL, PtrVT,
14133 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14136 // For symbols that require a load from a stub to get the address, emit the load.
14138 if (isGlobalStubReference(OpFlag))
14139 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14140 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14146 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14147 // Create the TargetBlockAddressAddress node.
14148 unsigned char OpFlags =
14149 Subtarget.classifyBlockAddressReference();
14150 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14151 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14153 auto PtrVT = getPointerTy(DAG.getDataLayout());
14154 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14155 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14157 // With PIC, the address is actually $g + Offset.
14158 if (isGlobalRelativeToPICBase(OpFlags)) {
14159 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14160 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14166 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14167 const SDLoc &dl, int64_t Offset,
14168 SelectionDAG &DAG) const {
14169 // Create the TargetGlobalAddress node, folding in the constant
14170 // offset if it is legal.
14171 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14172 CodeModel::Model M = DAG.getTarget().getCodeModel();
14173 auto PtrVT = getPointerTy(DAG.getDataLayout());
14175 if (OpFlags == X86II::MO_NO_FLAG &&
14176 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14177 // A direct static reference to a global.
14178 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14181 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14184 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14186 // With PIC, the address is actually $g + Offset.
14187 if (isGlobalRelativeToPICBase(OpFlags)) {
14188 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14189 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14192 // For globals that require a load from a stub to get the address, emit the load.
14194 if (isGlobalStubReference(OpFlags))
14195 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14196 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14198 // If there was a non-zero offset that we didn't fold, create an explicit
14199 // addition for it.
14201 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14202 DAG.getConstant(Offset, dl, PtrVT));
14208 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14209 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14210 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14211 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
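// Helper used by the TLS lowerings below: emits the TLSADDR/TLSBASEADDR
// pseudo (which is codegen'ed as a call) and copies the resulting address
// out of ReturnReg (EAX or RAX).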
14215 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14216 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14217 unsigned char OperandFlags, bool LocalDynamic = false) {
14218 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14219 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14221 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14222 GA->getValueType(0),
14226 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14230 SDValue Ops[] = { Chain, TGA, *InFlag };
14231 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14233 SDValue Ops[] = { Chain, TGA };
14234 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14237 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14238 MFI.setAdjustsStack(true);
14239 MFI.setHasCalls(true);
14241 SDValue Flag = Chain.getValue(1);
14242 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14245 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14247 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14250 SDLoc dl(GA); // ? function entry point might be better
14251 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14252 DAG.getNode(X86ISD::GlobalBaseReg,
14253 SDLoc(), PtrVT), InFlag);
14254 InFlag = Chain.getValue(1);
14256 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14259 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14261 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14263 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14264 X86::RAX, X86II::MO_TLSGD);
14267 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14273 // Get the start address of the TLS block for this module.
14274 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14275 .getInfo<X86MachineFunctionInfo>();
14276 MFI->incNumLocalDynamicTLSAccesses();
14280 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14281 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14284 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14285 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14286 InFlag = Chain.getValue(1);
14287 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14288 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14291 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations of Base.
14295 unsigned char OperandFlags = X86II::MO_DTPOFF;
14296 unsigned WrapperKind = X86ISD::Wrapper;
14297 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14298 GA->getValueType(0),
14299 GA->getOffset(), OperandFlags);
14300 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14302 // Add x@dtpoff with the base.
14303 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14306 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14307 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14308 const EVT PtrVT, TLSModel::Model model,
14309 bool is64Bit, bool isPIC) {
14312 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14313 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14314 is64Bit ? 257 : 256));
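// Address space 256 is the GS segment and 257 is the FS segment on x86, so
// this null pointer denotes %gs:0 or %fs:0 respectively.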
14316 SDValue ThreadPointer =
14317 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14318 MachinePointerInfo(Ptr));
14320 unsigned char OperandFlags = 0;
14321 // Most TLS accesses are not RIP relative, even on x86-64. One exception is the 64-bit initial-exec model, which is lowered below with a RIP-relative @GOTTPOFF reference.
14323 unsigned WrapperKind = X86ISD::Wrapper;
14324 if (model == TLSModel::LocalExec) {
14325 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14326 } else if (model == TLSModel::InitialExec) {
14328 OperandFlags = X86II::MO_GOTTPOFF;
14329 WrapperKind = X86ISD::WrapperRIP;
14331 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14334 llvm_unreachable("Unexpected model");
14337 // emit "addl x@ntpoff,%eax" (local exec)
14338 // or "addl x@indntpoff,%eax" (initial exec)
14339 // or "addl x@gotntpoff(%ebx),%eax" (initial exec, 32-bit pic)
14341 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14342 GA->getOffset(), OperandFlags);
14343 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14345 if (model == TLSModel::InitialExec) {
14346 if (isPIC && !is64Bit) {
14347 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14348 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14352 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14353 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14356 // The address of the thread-local variable is the sum of the thread
14357 // pointer and the offset of the variable.
14358 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14362 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14364 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14366 if (DAG.getTarget().Options.EmulatedTLS)
14367 return LowerToTLSEmulatedModel(GA, DAG);
14369 const GlobalValue *GV = GA->getGlobal();
14370 auto PtrVT = getPointerTy(DAG.getDataLayout());
14371 bool PositionIndependent = isPositionIndependent();
14373 if (Subtarget.isTargetELF()) {
14374 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14376 case TLSModel::GeneralDynamic:
14377 if (Subtarget.is64Bit())
14378 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14379 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14380 case TLSModel::LocalDynamic:
14381 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14382 Subtarget.is64Bit());
14383 case TLSModel::InitialExec:
14384 case TLSModel::LocalExec:
14385 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14386 PositionIndependent);
14388 llvm_unreachable("Unknown TLS model.");
14391 if (Subtarget.isTargetDarwin()) {
14392 // Darwin only has one model of TLS. Lower to that.
14393 unsigned char OpFlag = 0;
14394 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14395 X86ISD::WrapperRIP : X86ISD::Wrapper;
14397 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14398 // global base reg.
14399 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14401 OpFlag = X86II::MO_TLVP_PIC_BASE;
14403 OpFlag = X86II::MO_TLVP;
14405 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14406 GA->getValueType(0),
14407 GA->getOffset(), OpFlag);
14408 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14410 // With PIC32, the address is actually $g + Offset.
14412 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14413 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14416 // Lowering the machine ISD node will make sure everything ends up in the right register.
14418 SDValue Chain = DAG.getEntryNode();
14419 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14420 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
14421 SDValue Args[] = { Chain, Offset };
14422 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14423 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14424 DAG.getIntPtrConstant(0, DL, true),
14425 Chain.getValue(1), DL);
14427 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14428 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14429 MFI.setAdjustsStack(true);
14431 // And our return value (the TLS address) is in the standard call return value register (EAX/RAX).
14433 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14434 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14437 if (Subtarget.isTargetKnownWindowsMSVC() ||
14438 Subtarget.isTargetWindowsItanium() ||
14439 Subtarget.isTargetWindowsGNU()) {
14440 // Just use the implicit TLS architecture
14441 // Need to generate something similar to:
14442 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14444 // mov ecx, dword [rel _tls_index]; Load index (from C runtime)
14445 // mov rcx, qword [rdx+rcx*8]
14446 // mov eax, .tls$:tlsvar
14447 // [rax+rcx] contains the address
14448 // Windows 64bit: gs:0x58
14449 // Windows 32bit: fs:__tls_array
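// The code below follows that sequence: load the TLS array pointer, scale
// _tls_index by the pointer size to find this module's TLS block, then add
// the variable's @SECREL offset.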
14452 SDValue Chain = DAG.getEntryNode();
14454 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
14455 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
14456 // use its literal value of 0x2C.
14457 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
14458 ? Type::getInt8PtrTy(*DAG.getContext(),
14460 : Type::getInt32PtrTy(*DAG.getContext(),
14463 SDValue TlsArray = Subtarget.is64Bit()
14464 ? DAG.getIntPtrConstant(0x58, dl)
14465 : (Subtarget.isTargetWindowsGNU()
14466 ? DAG.getIntPtrConstant(0x2C, dl)
14467 : DAG.getExternalSymbol("_tls_array", PtrVT));
14469 SDValue ThreadPointer =
14470 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
14473 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
14474 res = ThreadPointer;
14476 // Load the _tls_index variable
14477 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
14478 if (Subtarget.is64Bit())
14479 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
14480 MachinePointerInfo(), MVT::i32);
14482 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
14484 auto &DL = DAG.getDataLayout();
14486 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
14487 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
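// IDX is now the byte offset of this module's slot in the TLS array.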
14489 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
14492 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
14494 // Get the offset of the start of the .tls section.
14495 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14496 GA->getValueType(0),
14497 GA->getOffset(), X86II::MO_SECREL);
14498 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
14500 // The address of the thread-local variable is the sum of the thread
14501 // pointer and the offset of the variable.
14502 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
14505 llvm_unreachable("TLS not implemented for this target.");
14508 /// Lower SRA_PARTS and friends, which return two i32 values
14509 /// and take a 2 x i32 value to shift plus a shift amount.
14510 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
14511 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
14512 MVT VT = Op.getSimpleValueType();
14513 unsigned VTBits = VT.getSizeInBits();
14515 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
14516 SDValue ShOpLo = Op.getOperand(0);
14517 SDValue ShOpHi = Op.getOperand(1);
14518 SDValue ShAmt = Op.getOperand(2);
14519 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
14520 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
14522 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14523 DAG.getConstant(VTBits - 1, dl, MVT::i8));
14524 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
14525 DAG.getConstant(VTBits - 1, dl, MVT::i8))
14526 : DAG.getConstant(0, dl, VT);
14528 SDValue Tmp2, Tmp3;
14529 if (Op.getOpcode() == ISD::SHL_PARTS) {
14530 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
14531 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
14533 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
14534 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
14537 // If the shift amount is larger than or equal to the width of a part we can't
14538 // rely on the results of shld/shrd. Insert a test and select the appropriate
14539 // values for large shift amounts.
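// e.g. an i32 SHL_PARTS by 40 sets (ShAmt & 32) != 0, so the CMOVs below
// produce Hi = Lo << (40 & 31) and Lo = 0.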
14540 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
14541 DAG.getConstant(VTBits, dl, MVT::i8));
14542 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
14543 AndNode, DAG.getConstant(0, dl, MVT::i8));
14546 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
14547 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
14548 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
14550 if (Op.getOpcode() == ISD::SHL_PARTS) {
14551 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14552 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14554 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
14555 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
14558 SDValue Ops[2] = { Lo, Hi };
14559 return DAG.getMergeValues(Ops, dl);
14562 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
14563 SelectionDAG &DAG) const {
14564 SDValue Src = Op.getOperand(0);
14565 MVT SrcVT = Src.getSimpleValueType();
14566 MVT VT = Op.getSimpleValueType();
14569 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14570 if (SrcVT.isVector()) {
14571 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
14572 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
14573 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
14574 DAG.getUNDEF(SrcVT)));
14576 if (SrcVT.getVectorElementType() == MVT::i1) {
14577 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
14578 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14579 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
14580 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14581 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14582 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
14587 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
14588 "Unknown SINT_TO_FP to lower!");
14590 // These are really Legal; return the operand so the caller accepts it as Legal.
14592 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
14594 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14595 Subtarget.is64Bit()) {
14599 SDValue ValueToStore = Op.getOperand(0);
14600 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
14601 !Subtarget.is64Bit())
14602 // Bitcasting to f64 here allows us to do a single 64-bit store from
14603 // an SSE register, avoiding the store forwarding penalty that would come
14604 // with two 32-bit stores.
14605 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
14607 unsigned Size = SrcVT.getSizeInBits()/8;
14608 MachineFunction &MF = DAG.getMachineFunction();
14609 auto PtrVT = getPointerTy(MF.getDataLayout());
14610 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
14611 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14612 SDValue Chain = DAG.getStore(
14613 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
14614 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14615 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
14618 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
14620 SelectionDAG &DAG) const {
14624 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
14626 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
14628 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
14630 unsigned ByteSize = SrcVT.getSizeInBits()/8;
14632 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
14633 MachineMemOperand *MMO;
14635 int SSFI = FI->getIndex();
14636 MMO = DAG.getMachineFunction().getMachineMemOperand(
14637 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14638 MachineMemOperand::MOLoad, ByteSize, ByteSize);
14640 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
14641 StackSlot = StackSlot.getOperand(1);
14643 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
14644 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
14646 Tys, Ops, SrcVT, MMO);
14649 Chain = Result.getValue(1);
14650 SDValue InFlag = Result.getValue(2);
14652 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
14653 // shouldn't be necessary except that RFP cannot be live across
14654 // multiple blocks. When stackifier is fixed, they can be uncoupled.
14655 MachineFunction &MF = DAG.getMachineFunction();
14656 unsigned SSFISize = Op.getValueSizeInBits()/8;
14657 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
14658 auto PtrVT = getPointerTy(MF.getDataLayout());
14659 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
14660 Tys = DAG.getVTList(MVT::Other);
14662 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
14664 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
14665 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
14666 MachineMemOperand::MOStore, SSFISize, SSFISize);
14668 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
14669 Ops, Op.getValueType(), MMO);
14670 Result = DAG.getLoad(
14671 Op.getValueType(), DL, Chain, StackSlot,
14672 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
14678 /// 64-bit unsigned integer to double expansion.
14679 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
14680 SelectionDAG &DAG) const {
14681 // This algorithm is not obvious. Here is what we're trying to output:
14684 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
14685 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
14687 haddpd %xmm0, %xmm0
14689 pshufd $0x4e, %xmm0, %xmm1
14695 LLVMContext *Context = DAG.getContext();
14697 // Build some magic constants.
14698 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
14699 Constant *C0 = ConstantDataVector::get(*Context, CV0);
14700 auto PtrVT = getPointerTy(DAG.getDataLayout());
14701 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
14703 SmallVector<Constant*,2> CV1;
14705 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14706 APInt(64, 0x4330000000000000ULL))));
14708 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
14709 APInt(64, 0x4530000000000000ULL))));
14710 Constant *C1 = ConstantVector::get(CV1);
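// Unpacking the value's two 32-bit halves against CV0 builds the doubles
// { 2^52 + lo32, 2^84 + hi32 * 2^32 }; subtracting C1 = { 2^52, 2^84 } leaves
// the exact halves, and their sum is the original 64-bit value.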
14711 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
14713 // Load the 64-bit value into an XMM register.
14714 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
14717 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
14718 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14719 /* Alignment = */ 16);
14721 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
14724 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
14725 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
14726 /* Alignment = */ 16);
14727 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
14728 // TODO: Are there any fast-math-flags to propagate here?
14729 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
14732 if (Subtarget.hasSSE3()) {
14733 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
14734 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
14736 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
14737 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
14738 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
14739 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
14742 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
14743 DAG.getIntPtrConstant(0, dl));
14746 /// 32-bit unsigned integer to float expansion.
14747 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
14748 SelectionDAG &DAG) const {
14750 // FP constant to bias correct the final result.
14751 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
14754 // Load the 32-bit value into an XMM register.
14755 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
14758 // Zero out the upper parts of the register.
14759 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
14761 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14762 DAG.getBitcast(MVT::v2f64, Load),
14763 DAG.getIntPtrConstant(0, dl));
14765 // Or the load with the bias.
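// OR'ing the zero-extended 32-bit value into the mantissa of the bias (2^52)
// yields the double 2^52 + x; the FSUB below then recovers x exactly.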
14766 SDValue Or = DAG.getNode(
14767 ISD::OR, dl, MVT::v2i64,
14768 DAG.getBitcast(MVT::v2i64,
14769 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
14770 DAG.getBitcast(MVT::v2i64,
14771 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
14773 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
14774 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
14776 // Subtract the bias.
14777 // TODO: Are there any fast-math-flags to propagate here?
14778 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
14780 // Handle final rounding.
14781 MVT DestVT = Op.getSimpleValueType();
14783 if (DestVT.bitsLT(MVT::f64))
14784 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
14785 DAG.getIntPtrConstant(0, dl));
14786 if (DestVT.bitsGT(MVT::f64))
14787 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
14789 // Handle final rounding.
14793 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
14794 const X86Subtarget &Subtarget, SDLoc &DL) {
14795 if (Op.getSimpleValueType() != MVT::v2f64)
14798 SDValue N0 = Op.getOperand(0);
14799 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
14801 // Legalize to v4i32 type.
14802 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
14803 DAG.getUNDEF(MVT::v2i32));
14805 if (Subtarget.hasAVX512())
14806 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
14808 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
14809 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
14810 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
14811 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
14813 // Two to the power of half-word-size.
14814 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
14816 // Clear upper part of LO, lower HI.
14817 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
14818 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
14820 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
14821 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
14822 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
14824 // Add the two halves.
14825 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
14828 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
14829 const X86Subtarget &Subtarget) {
14830 // The algorithm is the following:
14831 // #ifdef __SSE4_1__
14832 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14833 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14834 // (uint4) 0x53000000, 0xaa);
14836 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14837 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14839 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14840 // return (float4) lo + fhi;
14842 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
14843 // reassociate the two FADDs, and if we do that, the algorithm fails
14844 // spectacularly (PR24512).
14845 // FIXME: If we ever have some kind of Machine FMF, this should be marked
14846 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
14847 // there's also the MachineCombiner reassociations happening on Machine IR.
14848 if (DAG.getTarget().Options.UnsafeFPMath)
14852 SDValue V = Op->getOperand(0);
14853 MVT VecIntVT = V.getSimpleValueType();
14854 bool Is128 = VecIntVT == MVT::v4i32;
14855 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
14856 // If we are converting to something other than the supported type, e.g. to v4f64, bail out early.
14858 if (VecFloatVT != Op->getSimpleValueType(0))
14861 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
14862 "Unsupported custom type");
14864 // In the #ifdef/#else code, we have in common:
14865 // - The vector of constants:
14871 // Create the splat vector for 0x4b000000.
14872 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
14873 // Create the splat vector for 0x53000000.
14874 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
14876 // Create the right shift.
14877 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
14878 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
14881 if (Subtarget.hasSSE41()) {
14882 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
14883 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
14884 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
14885 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
14886 // Low will be bitcasted right away, so do not bother bitcasting back to its original type.
14888 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
14889 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14890 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
14891 // (uint4) 0x53000000, 0xaa);
14892 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
14893 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
14894 // High will be bitcasted right away, so do not bother bitcasting back to
14895 // its original type.
14896 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
14897 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
14899 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
14900 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
14901 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
14902 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
14904 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
14905 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
14908 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
14909 SDValue VecCstFAdd = DAG.getConstantFP(
14910 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
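// 0xD3000080 is -(2^39 + 2^23) as a float. 'High' holds 2^39 + (v >> 16) * 2^16
// and 'Low' holds 2^23 + (v & 0xffff), so (High + VecCstFAdd) + Low == v.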
14912 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
14913 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
14914 // TODO: Are there any fast-math-flags to propagate here?
14916 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
14917 // return (float4) lo + fhi;
14918 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
14919 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
14922 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
14923 SelectionDAG &DAG) const {
14924 SDValue N0 = Op.getOperand(0);
14925 MVT SrcVT = N0.getSimpleValueType();
14928 if (SrcVT.getVectorElementType() == MVT::i1) {
14929 if (SrcVT == MVT::v2i1)
14930 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14931 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
14932 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14933 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14934 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
14937 switch (SrcVT.SimpleTy) {
14939 llvm_unreachable("Custom UINT_TO_FP is not supported!");
14944 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
14945 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
14946 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
14949 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
14952 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
14955 assert(Subtarget.hasAVX512());
14956 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
14957 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
14961 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
14962 SelectionDAG &DAG) const {
14963 SDValue N0 = Op.getOperand(0);
14965 auto PtrVT = getPointerTy(DAG.getDataLayout());
14967 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
14968 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
14969 // the optimization here.
14970 if (DAG.SignBitIsZero(N0))
14971 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
14973 if (Op.getSimpleValueType().isVector())
14974 return lowerUINT_TO_FP_vec(Op, DAG);
14976 MVT SrcVT = N0.getSimpleValueType();
14977 MVT DstVT = Op.getSimpleValueType();
14979 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
14980 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
14981 // Conversions from unsigned i32 to f32/f64 are legal,
14982 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
14986 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
14987 return LowerUINT_TO_FP_i64(Op, DAG);
14988 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
14989 return LowerUINT_TO_FP_i32(Op, DAG);
14990 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
14993 // Make a 64-bit buffer, and use it to build an FILD.
14994 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
14995 if (SrcVT == MVT::i32) {
14996 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
14997 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
14998 StackSlot, MachinePointerInfo());
14999 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15000 OffsetSlot, MachinePointerInfo());
15001 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15005 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15006 SDValue ValueToStore = Op.getOperand(0);
15007 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15008 // Bitcasting to f64 here allows us to do a single 64-bit store from
15009 // an SSE register, avoiding the store forwarding penalty that would come
15010 // with two 32-bit stores.
15011 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15012 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15013 MachinePointerInfo());
15014 // For i64 source, we need to add the appropriate power of 2 if the input
15015 // was negative. This is the same as the optimization in
15016 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15017 // we must be careful to do the computation in x87 extended precision, not
15018 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15019 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15020 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15021 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15022 MachineMemOperand::MOLoad, 8, 8);
15024 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15025 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15026 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15029 APInt FF(32, 0x5F800000ULL);
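// 0x5F800000 is 2^64 as an IEEE single. FILD interprets the 64-bit buffer as
// signed, so when the sign bit is set it produces x - 2^64; the fudge selected
// below adds 2^64 back to recover the unsigned value.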
15031 // Check whether the sign bit is set.
15032 SDValue SignSet = DAG.getSetCC(
15033 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15034 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15036 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15037 SDValue FudgePtr = DAG.getConstantPool(
15038 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15040 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15041 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15042 SDValue Four = DAG.getIntPtrConstant(4, dl);
15043 SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
15045 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15047 // Load the value out, extending it from f32 to f80.
15048 // FIXME: Avoid the extend by constructing the right constant pool?
15049 SDValue Fudge = DAG.getExtLoad(
15050 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15051 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15052 /* Alignment = */ 4);
15053 // Extend everything to 80 bits to force it to be done on x87.
15054 // TODO: Are there any fast-math-flags to propagate here?
15055 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15056 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15057 DAG.getIntPtrConstant(0, dl));
15060 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15061 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15062 // just return an <SDValue(), SDValue()> pair.
15063 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15064 // to i16, i32 or i64, and we lower it to a legal sequence.
15065 // If lowered to the final integer result we return a <result, SDValue()> pair.
15066 // Otherwise we lower it to a sequence ending with a FIST, return a
15067 // <FIST, StackSlot> pair, and the caller is responsible for loading
15068 // the final integer result from StackSlot.
15069 std::pair<SDValue,SDValue>
15070 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15071 bool IsSigned, bool IsReplace) const {
15074 EVT DstTy = Op.getValueType();
15075 EVT TheVT = Op.getOperand(0).getValueType();
15076 auto PtrVT = getPointerTy(DAG.getDataLayout());
15078 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15079 // f16 must be promoted before using the lowering in this routine.
15080 // fp128 does not use this lowering.
15081 return std::make_pair(SDValue(), SDValue());
15084 // If using FIST to compute an unsigned i64, we'll need some fixup
15085 // to handle values above the maximum signed i64. A FIST is always
15086 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15087 bool UnsignedFixup = !IsSigned &&
15088 DstTy == MVT::i64 &&
15089 (!Subtarget.is64Bit() ||
15090 !isScalarFPTypeInSSEReg(TheVT));
15092 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15093 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15094 // The low 32 bits of the fist result will have the correct uint32 result.
15095 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15099 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15100 DstTy.getSimpleVT() >= MVT::i16 &&
15101 "Unknown FP_TO_INT to lower!");
15103 // These are really Legal.
15104 if (DstTy == MVT::i32 &&
15105 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15106 return std::make_pair(SDValue(), SDValue());
15107 if (Subtarget.is64Bit() &&
15108 DstTy == MVT::i64 &&
15109 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15110 return std::make_pair(SDValue(), SDValue());
15112 // We lower FP->int64 into FISTP64 followed by a load from a temporary stack slot.
15114 MachineFunction &MF = DAG.getMachineFunction();
15115 unsigned MemSize = DstTy.getSizeInBits()/8;
15116 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15117 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15120 switch (DstTy.getSimpleVT().SimpleTy) {
15121 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15122 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15123 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15124 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15127 SDValue Chain = DAG.getEntryNode();
15128 SDValue Value = Op.getOperand(0);
15129 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15131 if (UnsignedFixup) {
15133 // Conversion to unsigned i64 is implemented with a select,
15134 // depending on whether the source value fits in the range
15135 // of a signed i64. Let Thresh be the FP equivalent of
15136 // 0x8000000000000000ULL.
15138 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15139 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15140 // Fist-to-mem64 FistSrc
15141 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15142 // to XOR'ing the high 32 bits with Adjust.
15144 // Being a power of 2, Thresh is exactly representable in all FP formats.
15145 // For X87 we'd like to use the smallest FP type for this constant, but
15146 // for DAG type consistency we have to match the FP operand type.
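// e.g. for Value == 2^63 + 5: Adjust == 0x80000000, FistSrc == 5, the FIST
// stores 5, and XOR'ing the high half with Adjust restores 2^63 + 5.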
15148 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
15149 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15150 bool LosesInfo = false;
15151 if (TheVT == MVT::f64)
15152 // The rounding mode is irrelevant as the conversion should be exact.
15153 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15155 else if (TheVT == MVT::f80)
15156 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15157 APFloat::rmNearestTiesToEven, &LosesInfo);
15159 assert(Status == APFloat::opOK && !LosesInfo &&
15160 "FP conversion should have been exact");
15162 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15164 SDValue Cmp = DAG.getSetCC(DL,
15165 getSetCCResultType(DAG.getDataLayout(),
15166 *DAG.getContext(), TheVT),
15167 Value, ThreshVal, ISD::SETLT);
15168 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15169 DAG.getConstant(0, DL, MVT::i32),
15170 DAG.getConstant(0x80000000, DL, MVT::i32));
15171 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15172 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15173 *DAG.getContext(), TheVT),
15174 Value, ThreshVal, ISD::SETLT);
15175 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15178 // FIXME This causes a redundant load/store if the SSE-class value is already
15179 // in memory, such as if it is on the callstack.
15180 if (isScalarFPTypeInSSEReg(TheVT)) {
15181 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15182 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15183 MachinePointerInfo::getFixedStack(MF, SSFI));
15184 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15186 Chain, StackSlot, DAG.getValueType(TheVT)
15189 MachineMemOperand *MMO =
15190 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15191 MachineMemOperand::MOLoad, MemSize, MemSize);
15192 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15193 Chain = Value.getValue(1);
15194 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15195 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15198 MachineMemOperand *MMO =
15199 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15200 MachineMemOperand::MOStore, MemSize, MemSize);
15202 if (UnsignedFixup) {
15204 // Insert the FIST, load its result as two i32's,
15205 // and XOR the high i32 with Adjust.
15207 SDValue FistOps[] = { Chain, Value, StackSlot };
15208 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15209 FistOps, DstTy, MMO);
15212 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15213 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15216 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15217 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15219 if (Subtarget.is64Bit()) {
15220 // Join High32 and Low32 into a 64-bit result.
15221 // (High32 << 32) | Low32
15222 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15223 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15224 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15225 DAG.getConstant(32, DL, MVT::i8));
15226 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15227 return std::make_pair(Result, SDValue());
15230 SDValue ResultOps[] = { Low32, High32 };
15232 SDValue pair = IsReplace
15233 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15234 : DAG.getMergeValues(ResultOps, DL);
15235 return std::make_pair(pair, SDValue());
15237 // Build the FP_TO_INT*_IN_MEM
15238 SDValue Ops[] = { Chain, Value, StackSlot };
15239 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15241 return std::make_pair(FIST, StackSlot);
15245 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15246 const X86Subtarget &Subtarget) {
15247 MVT VT = Op->getSimpleValueType(0);
15248 SDValue In = Op->getOperand(0);
15249 MVT InVT = In.getSimpleValueType();
15252 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15253 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15255 // Optimize vectors in AVX mode:
15258 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15259 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15260 // Concat upper and lower parts.
15263 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15264 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15265 // Concat upper and lower parts.
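// e.g. a v8i16 -> v8i32 zero_extend unpacks In against a zero vector (low and
// high halves), bitcasts each v8i16 half to v4i32, and concatenates.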
15268 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15269 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15270 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15273 if (Subtarget.hasInt256())
15274 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15276 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15277 SDValue Undef = DAG.getUNDEF(InVT);
15278 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15279 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15280 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15282 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15283 VT.getVectorNumElements()/2);
15285 OpLo = DAG.getBitcast(HVT, OpLo);
15286 OpHi = DAG.getBitcast(HVT, OpHi);
15288 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15291 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15292 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15293 MVT VT = Op->getSimpleValueType(0);
15294 SDValue In = Op->getOperand(0);
15295 MVT InVT = In.getSimpleValueType();
15297 unsigned NumElts = VT.getVectorNumElements();
15299 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15300 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15301 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15303 if (InVT.getVectorElementType() != MVT::i1)
15306 // Extend the vector type if the result is a 128- or 256-bit vector and VLX is not supported.
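// For example, with AVX512F but no VLX, (zero_extend v8i1 %k to v8i16) is
// widened here to v8i64: the VSELECT below produces 0/1 in each i64 lane and
// the final VTRUNC narrows that back down to v8i16.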
15308 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15309 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15312 SDValue One = DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15314 SDValue Zero = DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15316 SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
15318 return SelectedVal;
15319 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15322 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15323 SelectionDAG &DAG) {
15324 if (Subtarget.hasFp256())
15325 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15331 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15332 SelectionDAG &DAG) {
15334 MVT VT = Op.getSimpleValueType();
15335 SDValue In = Op.getOperand(0);
15336 MVT SVT = In.getSimpleValueType();
15338 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15339 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15341 if (Subtarget.hasFp256())
15342 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15345 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15346 VT.getVectorNumElements() != SVT.getVectorNumElements());
15350 /// Helper to recursively truncate vector elements in half with PACKSS.
15351 /// It makes use of the fact that vector comparison results will be all-zeros
15352 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15353 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15354 /// within each 128-bit lane.
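/// For illustration: a v16i16 all-ones/all-zeros compare result is truncated
/// to v16i8 with a single PACKSS of its two 128-bit halves; with AVX2, a
/// 512-bit source is packed in stages (512 -> 256 -> 128 bits), with a v4i64
/// shuffle between stages to undo the per-128-bit-lane interleaving of PACKSS.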
15355 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15358 const X86Subtarget &Subtarget) {
15359 // Requires SSE2 but AVX512 has fast truncate.
15360 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15363 EVT SrcVT = In.getValueType();
15365 // No truncation required; we might get here due to recursive calls.
15366 if (SrcVT == DstVT)
15369 // We only support vector truncation to 128 bits or greater from a
15370 // 256-bit or greater source.
15371 if ((DstVT.getSizeInBits() % 128) != 0)
15373 if ((SrcVT.getSizeInBits() % 256) != 0)
15376 unsigned NumElems = SrcVT.getVectorNumElements();
15377 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15378 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15381 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15383 // Extract lower/upper subvectors.
15384 unsigned NumSubElts = NumElems / 2;
15385 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15386 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15387 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15389 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15390 if (SrcVT.is256BitVector()) {
15391 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15392 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15393 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15394 return DAG.getBitcast(DstVT, Res);
15397 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15398 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15399 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15400 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15401 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15402 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15404 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15405 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15406 Res = DAG.getBitcast(MVT::v4i64, Res);
15407 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15409 if (DstVT.is256BitVector())
15410 return DAG.getBitcast(DstVT, Res);
15412 // If 512bit -> 128bit truncate another stage.
15413 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15414 Res = DAG.getBitcast(PackedVT, Res);
15415 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15418 // Recursively pack lower/upper subvectors, concat result and pack again.
15419 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15420 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15421 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
15422 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
15424 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15425 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
15426 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15429 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
15430 const X86Subtarget &Subtarget) {
15433 MVT VT = Op.getSimpleValueType();
15434 SDValue In = Op.getOperand(0);
15435 MVT InVT = In.getSimpleValueType();
15437 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
15439 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
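// Only the low bit of each element matters for the i1 result, so, roughly:
//   %s = shl <N x iK> %in, K-1        ; move the LSB into the sign bit
//   %m = vpmovb2m/vpmovw2m %s         ; with BWI, or TESTM %s, %s otherwise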
15440 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
15441 if (InVT.getScalarSizeInBits() <= 16) {
15442 if (Subtarget.hasBWI()) {
15443 // Legal; this will lower to VPMOVB2M/VPMOVW2M.
15444 // Shifting packed bytes is not supported natively, so bitcast to words.
15445 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
15446 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
15447 DAG.getBitcast(ExtVT, In),
15448 DAG.getConstant(ShiftInx, DL, ExtVT));
15449 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
15450 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
15452 // Use TESTD/Q; extend the vector to packed dwords/qwords.
15453 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
15454 "Unexpected vector type.");
15455 unsigned NumElts = InVT.getVectorNumElements();
15456 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15457 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
15459 ShiftInx = InVT.getScalarSizeInBits() - 1;
15462 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
15463 DAG.getConstant(ShiftInx, DL, InVT));
15464 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
15467 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
15469 MVT VT = Op.getSimpleValueType();
15470 SDValue In = Op.getOperand(0);
15471 MVT InVT = In.getSimpleValueType();
15473 if (VT == MVT::i1) {
15474 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
15475 "Invalid scalar TRUNCATE operation");
15476 if (InVT.getSizeInBits() >= 32)
15478 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
15479 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
15481 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
15482 "Invalid TRUNCATE operation");
15484 if (VT.getVectorElementType() == MVT::i1)
15485 return LowerTruncateVecI1(Op, DAG, Subtarget);
15487 // vpmovqb/w/d, vpmovdb/w, vpmovwb
15488 if (Subtarget.hasAVX512()) {
15490 // Word-to-byte truncation is only legal with BWI.
15490 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
15491 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
15492 DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
15493 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
15496 // Truncate with PACKSS if we are truncating a vector comparison result.
15497 // TODO: We should be able to support other operations as long as we
15498 // are saturating+packing zero/all bits only.
15499 auto IsPackableComparison = [](SDValue V) {
15500 unsigned Opcode = V.getOpcode();
15501 return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
15502 Opcode == X86ISD::CMPP);
15505 if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
15506 all_of(In->ops(), IsPackableComparison))) {
15507 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
15511 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
15512 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
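// Roughly: bitcast the v4i64 to v8i32, gather the four low dwords with a
// VPERMD-style shuffle of elements {0,2,4,6}, then take the low 128-bit
// subvector.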
15513 if (Subtarget.hasInt256()) {
15514 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
15515 In = DAG.getBitcast(MVT::v8i32, In);
15516 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
15518 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
15519 DAG.getIntPtrConstant(0, DL));
15522 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15523 DAG.getIntPtrConstant(0, DL));
15524 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15525 DAG.getIntPtrConstant(2, DL));
15526 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15527 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15528 static const int ShufMask[] = {0, 2, 4, 6};
15529 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
15532 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
15533 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
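// Roughly: within each 128-bit lane, PSHUFB keeps bytes {0,1,4,5,8,9,12,13}
// (the low half of every dword) and zeroes the rest (0x80 selectors); a
// v4i64 shuffle then gathers the two lanes' low quadwords before the low
// 128 bits are extracted.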
15534 if (Subtarget.hasInt256()) {
15535 In = DAG.getBitcast(MVT::v32i8, In);
15537 SmallVector<SDValue,32> pshufbMask;
15538 for (unsigned i = 0; i < 2; ++i) {
15539 pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
15540 pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
15541 pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
15542 pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
15543 pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
15544 pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
15545 pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
15546 pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
15547 for (unsigned j = 0; j < 8; ++j)
15548 pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
15550 SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
15551 In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
15552 In = DAG.getBitcast(MVT::v4i64, In);
15554 static const int ShufMask[] = {0, 2, -1, -1};
15555 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, DAG.getUNDEF(MVT::v4i64),
15557 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
15558 DAG.getIntPtrConstant(0, DL));
15559 return DAG.getBitcast(VT, In);
15562 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15563 DAG.getIntPtrConstant(0, DL));
15565 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
15566 DAG.getIntPtrConstant(4, DL));
15568 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
15569 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
15571 // The PSHUFB mask:
15572 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
15573 -1, -1, -1, -1, -1, -1, -1, -1};
15575 SDValue Undef = DAG.getUNDEF(MVT::v16i8);
15576 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
15577 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
15579 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
15580 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
15582 // The MOVLHPS Mask:
15583 static const int ShufMask2[] = {0, 1, 4, 5};
15584 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
15585 return DAG.getBitcast(MVT::v8i16, res);
15588 // Handle truncation of V256 to V128 using shuffles.
15589 if (!VT.is128BitVector() || !InVT.is256BitVector())
15592 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
15594 unsigned NumElems = VT.getVectorNumElements();
15595 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
15597 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
15598 // Prepare truncation shuffle mask
15599 for (unsigned i = 0; i != NumElems; ++i)
15600 MaskVec[i] = i * 2;
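// The mask keeps every other element; e.g. for v16i16 -> v16i8 the input is
// viewed as v32i8, elements {0,2,...,30} are selected, and the low 128 bits
// are extracted.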
15601 SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
15602 DAG.getUNDEF(NVT), MaskVec);
15603 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
15604 DAG.getIntPtrConstant(0, DL));
15607 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
15608 const X86Subtarget &Subtarget,
15609 SelectionDAG &DAG) const {
15610 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
15612 MVT VT = Op.getSimpleValueType();
15614 if (VT.isVector()) {
15615 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
15616 SDValue Src = Op.getOperand(0);
15618 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
15619 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
15621 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
15622 DAG.getUNDEF(MVT::v2f32)));
15628 assert(!VT.isVector());
15630 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
15631 IsSigned, /*IsReplace=*/ false);
15632 SDValue FIST = Vals.first, StackSlot = Vals.second;
15633 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
15634 if (!FIST.getNode())
15637 if (StackSlot.getNode())
15638 // Load the result.
15639 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
15641 // The node is the result.
15645 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
15647 MVT VT = Op.getSimpleValueType();
15648 SDValue In = Op.getOperand(0);
15649 MVT SVT = In.getSimpleValueType();
15651 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
15653 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
15654 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
15655 In, DAG.getUNDEF(SVT)));
15658 /// The only differences between FABS and FNEG are the mask and the logic op.
15659 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
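/// Roughly, for an IEEE float of width N:
///   FABS(x)       -> FAND x, (mask with only the sign bit clear)
///   FNEG(x)       -> FXOR x, (mask with only the sign bit set)
///   FNEG(FABS(x)) -> FOR  x, (mask with only the sign bit set)   ; FNABS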
15660 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
15661 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
15662 "Wrong opcode for lowering FABS or FNEG.");
15664 bool IsFABS = (Op.getOpcode() == ISD::FABS);
15666 // If this is a FABS and it has an FNEG user, bail out to fold the combination
15667 // into an FNABS. We'll lower the FABS after that if it is still in use.
15669 for (SDNode *User : Op->uses())
15670 if (User->getOpcode() == ISD::FNEG)
15674 MVT VT = Op.getSimpleValueType();
15676 bool IsF128 = (VT == MVT::f128);
15678 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
15679 // decide if we should generate a 16-byte constant mask when we only need 4 or
15680 // 8 bytes for the scalar case.
15685 if (VT.isVector()) {
15687 EltVT = VT.getVectorElementType();
15688 } else if (IsF128) {
15689 // SSE instructions are used for optimized f128 logical operations.
15690 LogicVT = MVT::f128;
15693 // There are no scalar bitwise logical SSE/AVX instructions, so we
15694 // generate a 16-byte vector constant and logic op even for the scalar case.
15695 // Using a 16-byte mask allows folding the load of the mask with
15696 // the logic op, so it can save (~4 bytes) on code size.
15697 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15701 unsigned EltBits = EltVT.getSizeInBits();
15702 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
15704 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignBit(EltBits);
15705 const fltSemantics &Sem =
15706 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
15707 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15708 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
15710 SDValue Op0 = Op.getOperand(0);
15711 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
15713 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
15714 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
15716 if (VT.isVector() || IsF128)
15717 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15719 // For the scalar case extend to a 128-bit vector, perform the logic op,
15720 // and extract the scalar result back out.
15721 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
15722 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
15723 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
15724 DAG.getIntPtrConstant(0, dl));
15727 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
15728 SDValue Mag = Op.getOperand(0);
15729 SDValue Sign = Op.getOperand(1);
15732 // If the sign operand is smaller, extend it first.
15733 MVT VT = Op.getSimpleValueType();
15734 if (Sign.getSimpleValueType().bitsLT(VT))
15735 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
15737 // And if it is bigger, shrink it first.
15738 if (Sign.getSimpleValueType().bitsGT(VT))
15739 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
15741 // At this point the operands and the result should have the same
15742 // type, and that won't be f80 since that is not custom lowered.
15743 bool IsF128 = (VT == MVT::f128);
15744 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
15745 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
15746 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
15747 "Unexpected type in LowerFCOPYSIGN");
15749 MVT EltVT = VT.getScalarType();
15750 const fltSemantics &Sem =
15751 EltVT == MVT::f64 ? APFloat::IEEEdouble()
15752 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
15754 // Perform all scalar logic operations as 16-byte vectors because there are no
15755 // scalar FP logic instructions in SSE.
15756 // TODO: This isn't necessary. If we used scalar types, we might avoid some
15757 // unnecessary splats, but we might miss load folding opportunities. Should
15758 // this decision be based on OptimizeForSize?
15759 bool IsFakeVector = !VT.isVector() && !IsF128;
15762 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
15764 // The mask constants are automatically splatted for vector types.
15765 unsigned EltSizeInBits = VT.getScalarSizeInBits();
15766 SDValue SignMask = DAG.getConstantFP(
15767 APFloat(Sem, APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
15768 SDValue MagMask = DAG.getConstantFP(
15769 APFloat(Sem, ~APInt::getSignBit(EltSizeInBits)), dl, LogicVT);
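// Illustratively, for f64: SignMask is -0.0 (only the sign bit set) and
// MagMask is its complement (every bit except the sign); the result computed
// below is (Mag & MagMask) | (Sign & SignMask).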
15771 // First, clear all bits but the sign bit from the second operand (sign).
15773 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
15774 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
15776 // Next, clear the sign bit from the first operand (magnitude).
15777 // TODO: If we had general constant folding for FP logic ops, this check
15778 // wouldn't be necessary.
15780 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
15781 APFloat APF = Op0CN->getValueAPF();
15783 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
15785 // If the magnitude operand wasn't a constant, we need to AND out the sign.
15787 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
15788 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
15791 // OR the magnitude value with the sign bit.
15792 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
15793 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
15794 DAG.getIntPtrConstant(0, dl));
15797 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
15798 SDValue N0 = Op.getOperand(0);
15800 MVT VT = Op.getSimpleValueType();
15802 MVT OpVT = N0.getSimpleValueType();
15803 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
15804 "Unexpected type for FGETSIGN");
15806 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
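// MOVMSK gathers the sign bit of every lane into a GPR; masking with 1 keeps
// just the sign of lane 0, which holds the scalar input.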
15807 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
15808 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
15809 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
15810 Res = DAG.getZExtOrTrunc(Res, dl, VT);
15811 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
15815 // Check whether an OR'd tree is PTEST-able.
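// For example, (or (extractelt v2i64 %x, 0), (extractelt v2i64 %x, 1)) == 0
// can be evaluated as a single (ptest %x, %x), branching on ZF.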
15816 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
15817 SelectionDAG &DAG) {
15818 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
15820 if (!Subtarget.hasSSE41())
15823 if (!Op->hasOneUse())
15826 SDNode *N = Op.getNode();
15829 SmallVector<SDValue, 8> Opnds;
15830 DenseMap<SDValue, unsigned> VecInMap;
15831 SmallVector<SDValue, 8> VecIns;
15832 EVT VT = MVT::Other;
15834 // Recognize a special case where a vector is cast into a wide integer to test all zeros.
15836 Opnds.push_back(N->getOperand(0));
15837 Opnds.push_back(N->getOperand(1));
15839 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
15840 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
15841 // BFS traverse all OR'd operands.
15842 if (I->getOpcode() == ISD::OR) {
15843 Opnds.push_back(I->getOperand(0));
15844 Opnds.push_back(I->getOperand(1));
15845 // Re-evaluate the number of nodes to be traversed.
15846 e += 2; // 2 more nodes (LHS and RHS) are pushed.
15850 // Quit if this is not an EXTRACT_VECTOR_ELT.
15851 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
15854 // Quit if the index is not a constant.
15855 SDValue Idx = I->getOperand(1);
15856 if (!isa<ConstantSDNode>(Idx))
15859 SDValue ExtractedFromVec = I->getOperand(0);
15860 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
15861 if (M == VecInMap.end()) {
15862 VT = ExtractedFromVec.getValueType();
15863 // Quit if not 128/256-bit vector.
15864 if (!VT.is128BitVector() && !VT.is256BitVector())
15866 // Quit if not the same type.
15867 if (VecInMap.begin() != VecInMap.end() &&
15868 VT != VecInMap.begin()->first.getValueType())
15870 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
15871 VecIns.push_back(ExtractedFromVec);
15873 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
15876 assert((VT.is128BitVector() || VT.is256BitVector()) &&
15877 "Not extracted from 128-/256-bit vector.");
15879 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
15881 for (DenseMap<SDValue, unsigned>::const_iterator
15882 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
15883 // Quit if not all elements are used.
15884 if (I->second != FullMask)
15888 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
15890 // Cast all vectors into TestVT for PTEST.
15891 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
15892 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
15894 // If more than one full vector is evaluated, OR them together before the PTEST.
15895 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
15896 // Each iteration will OR 2 nodes and append the result until there is only
15897 // 1 node left, i.e. the final OR'd value of all vectors.
15898 SDValue LHS = VecIns[Slot];
15899 SDValue RHS = VecIns[Slot + 1];
15900 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
15903 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
15904 VecIns.back(), VecIns.back());
15907 /// \brief Return true if \c Op has a use that doesn't just read flags.
15908 static bool hasNonFlagsUse(SDValue Op) {
15909 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
15911 SDNode *User = *UI;
15912 unsigned UOpNo = UI.getOperandNo();
15913 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
15914 // Look past the truncate.
15915 UOpNo = User->use_begin().getOperandNo();
15916 User = *User->use_begin();
15919 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
15920 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
15926 // Emit KTEST instruction for bit vectors on AVX-512
15927 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
15928 const X86Subtarget &Subtarget) {
15929 if (Op.getOpcode() == ISD::BITCAST) {
15930 auto hasKTEST = [&](MVT VT) {
15931 unsigned SizeInBits = VT.getSizeInBits();
15932 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
15933 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
15935 SDValue Op0 = Op.getOperand(0);
15936 MVT Op0VT = Op0.getValueType().getSimpleVT();
15937 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
15939 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
15944 /// Emit nodes that will be selected as "test Op0,Op0", or something
15946 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
15947 SelectionDAG &DAG) const {
15948 if (Op.getValueType() == MVT::i1) {
15949 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
15950 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
15951 DAG.getConstant(0, dl, MVT::i8));
15953 // CF and OF aren't always set the way we want. Determine which
15954 // of these we need.
15955 bool NeedCF = false;
15956 bool NeedOF = false;
15959 case X86::COND_A: case X86::COND_AE:
15960 case X86::COND_B: case X86::COND_BE:
15963 case X86::COND_G: case X86::COND_GE:
15964 case X86::COND_L: case X86::COND_LE:
15965 case X86::COND_O: case X86::COND_NO: {
15966 // Check if we really need to set the Overflow flag. If NoSignedWrap is
15967 // present, it is not actually needed.
15969 switch (Op->getOpcode()) {
15974 const auto *BinNode = cast<BinaryWithFlagsSDNode>(Op.getNode());
15975 if (BinNode->Flags.hasNoSignedWrap())
15985 // See if we can use the EFLAGS value from the operand instead of
15986 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
15987 // we prove that the arithmetic won't overflow, we can't use OF or CF.
15988 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
15989 // Emit KTEST for bit vectors
15990 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
15992 // Emit a CMP with 0, which is the TEST pattern.
15993 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
15994 DAG.getConstant(0, dl, Op.getValueType()));
15996 unsigned Opcode = 0;
15997 unsigned NumOperands = 0;
15999 // Truncate operations may prevent the merge of the SETCC instruction
16000 // and the arithmetic instruction before it. Attempt to truncate the operands
16001 // of the arithmetic instruction and use a reduced bit-width instruction.
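// For example, (i8 (trunc (i32 and %x, %y))) compared against zero can be
// narrowed to an 8-bit AND whose EFLAGS feed the SETCC directly.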
16002 bool NeedTruncation = false;
16003 SDValue ArithOp = Op;
16004 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16005 SDValue Arith = Op->getOperand(0);
16006 // Both the trunc and the arithmetic op need to have one user each.
16007 if (Arith->hasOneUse())
16008 switch (Arith.getOpcode()) {
16015 NeedTruncation = true;
16021 // Sometimes flags can be set either with an AND or with an SRL/SHL
16022 // instruction. The SRL/SHL variant should be preferred for masks longer than this number of bits.
16024 const int ShiftToAndMaxMaskWidth = 32;
16025 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16027 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16028 // which may be the result of a CAST. We use the variable 'Op', which is the
16029 // non-casted variable when we check for possible users.
16030 switch (ArithOp.getOpcode()) {
16032 // Due to an isel shortcoming, be conservative if this add is likely to be
16033 // selected as part of a load-modify-store instruction. When the root node
16034 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16035 // uses of other nodes in the match, such as the ADD in this case. This
16036 // leads to the ADD being left around and reselected, with the result being
16037 // two adds in the output. Alas, even if none of our users are stores, that
16038 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16039 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16040 // climbing the DAG back to the root, and it doesn't seem to be worth the hassle.
16042 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16043 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16044 if (UI->getOpcode() != ISD::CopyToReg &&
16045 UI->getOpcode() != ISD::SETCC &&
16046 UI->getOpcode() != ISD::STORE)
16049 if (ConstantSDNode *C =
16050 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16051 // An add of one will be selected as an INC.
16052 if (C->isOne() && !Subtarget.slowIncDec()) {
16053 Opcode = X86ISD::INC;
16058 // An add of negative one (subtract of one) will be selected as a DEC.
16059 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16060 Opcode = X86ISD::DEC;
16066 // Otherwise use a regular EFLAGS-setting add.
16067 Opcode = X86ISD::ADD;
16072 // If we have a constant logical shift that's only used in a comparison
16073 // against zero turn it into an equivalent AND. This allows turning it into
16074 // a TEST instruction later.
16075 if (ZeroCheck && Op->hasOneUse() &&
16076 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16077 EVT VT = Op.getValueType();
16078 unsigned BitWidth = VT.getSizeInBits();
16079 unsigned ShAmt = Op->getConstantOperandVal(1);
16080 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16082 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16083 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16084 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16085 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16087 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16088 DAG.getConstant(Mask, dl, VT));
16093 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16094 // because a TEST instruction will be better. However, AND should be
16095 // preferred if the instruction can be combined into ANDN.
16096 if (!hasNonFlagsUse(Op)) {
16097 SDValue Op0 = ArithOp->getOperand(0);
16098 SDValue Op1 = ArithOp->getOperand(1);
16099 EVT VT = ArithOp.getValueType();
16100 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16101 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16102 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16104 // If we cannot select an ANDN instruction, check if we can replace
16105 // AND+IMM64 with a shift before giving up. This is possible for masks
16106 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
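// For example, on i64, (x & 0xFFFFFFFF00000000) == 0 can become
// (x >> 32) == 0 and (x & 0x00000000FFFFFFFF) == 0 can become (x << 32) == 0,
// avoiding a 64-bit immediate.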
16107 if (!isProperAndn) {
16111 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16112 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16116 const APInt &Mask = CN->getAPIntValue();
16117 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16118 break; // Prefer TEST instruction.
16120 unsigned BitWidth = Mask.getBitWidth();
16121 unsigned LeadingOnes = Mask.countLeadingOnes();
16122 unsigned TrailingZeros = Mask.countTrailingZeros();
16124 if (LeadingOnes + TrailingZeros == BitWidth) {
16125 assert(TrailingZeros < VT.getSizeInBits() &&
16126 "Shift amount should be less than the type width");
16127 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16128 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16129 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16133 unsigned LeadingZeros = Mask.countLeadingZeros();
16134 unsigned TrailingOnes = Mask.countTrailingOnes();
16136 if (LeadingZeros + TrailingOnes == BitWidth) {
16137 assert(LeadingZeros < VT.getSizeInBits() &&
16138 "Shift amount should be less than the type width");
16139 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16140 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16141 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16152 // Due to the ISEL shortcoming noted above, be conservative if this op is
16153 // likely to be selected as part of a load-modify-store instruction.
16154 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16155 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16156 if (UI->getOpcode() == ISD::STORE)
16159 // Otherwise use a regular EFLAGS-setting instruction.
16160 switch (ArithOp.getOpcode()) {
16161 default: llvm_unreachable("unexpected operator!");
16162 case ISD::SUB: Opcode = X86ISD::SUB; break;
16163 case ISD::XOR: Opcode = X86ISD::XOR; break;
16164 case ISD::AND: Opcode = X86ISD::AND; break;
16166 if (!NeedTruncation && ZeroCheck) {
16167 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16170 Opcode = X86ISD::OR;
16184 return SDValue(Op.getNode(), 1);
16190 // If we found that truncation is beneficial, perform the truncation and rebuild the arithmetic at the narrower width.
16192 if (NeedTruncation) {
16193 EVT VT = Op.getValueType();
16194 SDValue WideVal = Op->getOperand(0);
16195 EVT WideVT = WideVal.getValueType();
16196 unsigned ConvertedOp = 0;
16197 // Use a target machine opcode to prevent further DAGCombine
16198 // optimizations that may separate the arithmetic operations
16199 // from the setcc node.
16200 switch (WideVal.getOpcode()) {
16202 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16203 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16204 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16205 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16206 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16210 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16211 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16212 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16213 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16214 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16220 // Emit KTEST for bit vectors
16221 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16224 // Emit a CMP with 0, which is the TEST pattern.
16225 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16226 DAG.getConstant(0, dl, Op.getValueType()));
16228 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16229 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16231 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16232 DAG.ReplaceAllUsesWith(Op, New);
16233 return SDValue(New.getNode(), 1);
16236 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16238 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16239 const SDLoc &dl, SelectionDAG &DAG) const {
16240 if (isNullConstant(Op1))
16241 return EmitTest(Op0, X86CC, dl, DAG);
16243 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16244 "Unexpected comparison operation for MVT::i1 operands");
16246 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16247 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16248 // Only promote the compare up to i32 if it is a 16-bit operation
16249 // with an immediate; 16-bit immediates are to be avoided.
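// (16-bit immediate operands need an operand-size prefix; the resulting
// length-changing prefix tends to be slow to decode on many Intel CPUs.)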
16250 if ((Op0.getValueType() == MVT::i16 &&
16251 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16252 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16253 !Subtarget.isAtom()) {
16254 unsigned ExtendOp =
16255 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16256 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16257 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16259 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16260 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16261 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16263 return SDValue(Sub.getNode(), 1);
16265 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16268 /// Convert a comparison if required by the subtarget.
16269 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16270 SelectionDAG &DAG) const {
16271 // If the subtarget does not support the FUCOMI instruction, floating-point
16272 // comparisons have to be converted.
16273 if (Subtarget.hasCMov() ||
16274 Cmp.getOpcode() != X86ISD::CMP ||
16275 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16276 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16279 // The instruction selector will select an FUCOM instruction instead of
16280 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16281 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16282 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16284 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16285 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16286 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16287 DAG.getConstant(8, dl, MVT::i8));
16288 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16290 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16291 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16292 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16295 /// Check if replacement of SQRT with RSQRT should be disabled.
16296 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16297 EVT VT = Op.getValueType();
16299 // We never want to use both SQRT and RSQRT instructions for the same input.
16300 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16304 return Subtarget.hasFastVectorFSQRT();
16305 return Subtarget.hasFastScalarFSQRT();
16308 /// The minimum architected relative accuracy is 2^-12. We need one
16309 /// Newton-Raphson step to have a good float result (24 bits of precision).
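/// One Newton-Raphson step for rsqrt is, roughly:
///   est' = est * (1.5 - 0.5 * x * est * est)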
16310 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16311 SelectionDAG &DAG, int Enabled,
16312 int &RefinementSteps,
16313 bool &UseOneConstNR,
16314 bool Reciprocal) const {
16315 EVT VT = Op.getValueType();
16317 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16318 // TODO: Add support for AVX512 (v16f32).
16319 // It is likely not profitable to do this for f64 because a double-precision
16320 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16321 // instructions: convert to single, rsqrtss, convert back to double, refine
16322 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16323 // along with FMA, this could be a throughput win.
16324 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16325 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16326 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16327 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16328 RefinementSteps = 1;
16330 UseOneConstNR = false;
16331 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16336 /// The minimum architected relative accuracy is 2^-12. We need one
16337 /// Newton-Raphson step to have a good float result (24 bits of precision).
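/// One Newton-Raphson step for the reciprocal is, roughly:
///   est' = est + est * (1.0 - x * est)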
16338 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16340 int &RefinementSteps) const {
16341 EVT VT = Op.getValueType();
16343 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16344 // TODO: Add support for AVX512 (v16f32).
16345 // It is likely not profitable to do this for f64 because a double-precision
16346 // reciprocal estimate with refinement on x86 prior to FMA requires
16347 // 15 instructions: convert to single, rcpss, convert back to double, refine
16348 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16349 // along with FMA, this could be a throughput win.
16351 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16352 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16353 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16354 // Enable estimate codegen with 1 refinement step for vector division.
16355 // Scalar division estimates are disabled because they break too much
16356 // real-world code. These defaults are intended to match GCC behavior.
16357 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16360 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16361 RefinementSteps = 1;
16363 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16368 /// If we have at least two divisions that use the same divisor, convert to
16369 /// multiplication by a reciprocal. This may need to be adjusted for a given
16370 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16371 /// This is because we still need one division to calculate the reciprocal and
16372 /// then we need two multiplies by that reciprocal as replacements for the
16373 /// original divisions.
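/// For example, when fast-math allows it, (a / b) + (c / b) can become
/// r = 1.0 / b; a * r + c * r -- one divide and two multiplies instead of
/// two divides.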
16374 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16378 /// Helper for creating a X86ISD::SETCC node.
16379 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16380 SelectionDAG &DAG) {
16381 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16382 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16385 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16386 /// according to equal/not-equal condition code \p CC.
16387 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16388 const SDLoc &dl, SelectionDAG &DAG) {
16389 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16390 // instruction. Since the shift amount is in-range-or-undefined, we know
16391 // that doing a bittest on the i32 value is ok. We extend to i32 because
16392 // the encoding for the i16 version is larger than the i32 version.
16393 // Also promote i16 to i32 for performance / code size reasons.
16394 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16395 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16397 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16398 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16399 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16400 // known to be zero.
16401 if (Src.getValueType() == MVT::i64 &&
16402 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16403 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16405 // If the operand types disagree, extend the shift amount to match. Since
16406 // BT ignores high bits (like shifts) we can use anyextend.
16407 if (Src.getValueType() != BitNo.getValueType())
16408 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
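// BT places the selected bit in CF, so an equal-to-zero test maps to
// COND_AE (CF == 0) and a not-equal test maps to COND_B (CF == 1).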
16410 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
16411 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16412 return getSETCC(Cond, BT, dl , DAG);
16415 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16416 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16417 const SDLoc &dl, SelectionDAG &DAG) {
16418 SDValue Op0 = And.getOperand(0);
16419 SDValue Op1 = And.getOperand(1);
16420 if (Op0.getOpcode() == ISD::TRUNCATE)
16421 Op0 = Op0.getOperand(0);
16422 if (Op1.getOpcode() == ISD::TRUNCATE)
16423 Op1 = Op1.getOperand(0);
16426 if (Op1.getOpcode() == ISD::SHL)
16427 std::swap(Op0, Op1);
16428 if (Op0.getOpcode() == ISD::SHL) {
16429 if (isOneConstant(Op0.getOperand(0))) {
16430 // If we looked past a truncate, check that it's only truncating away sign bits.
16432 unsigned BitWidth = Op0.getValueSizeInBits();
16433 unsigned AndBitWidth = And.getValueSizeInBits();
16434 if (BitWidth > AndBitWidth) {
16435 APInt Zeros, Ones;
16436 DAG.computeKnownBits(Op0, Zeros, Ones);
16437 if (Zeros.countLeadingOnes() < BitWidth - AndBitWidth)
16441 RHS = Op0.getOperand(1);
16443 } else if (Op1.getOpcode() == ISD::Constant) {
16444 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16445 uint64_t AndRHSVal = AndRHS->getZExtValue();
16446 SDValue AndLHS = Op0;
16448 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16449 LHS = AndLHS.getOperand(0);
16450 RHS = AndLHS.getOperand(1);
16453 // Use BT if the immediate can't be encoded in a TEST instruction.
16454 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
16456 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
16461 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
16466 // Convert (truncate (srl X, N) to i1) to (bt X, N)
16467 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
16468 const SDLoc &dl, SelectionDAG &DAG) {
16470 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
16471 "Expected TRUNCATE to i1 node");
16473 if (Op.getOperand(0).getOpcode() != ISD::SRL)
16476 SDValue ShiftRight = Op.getOperand(0);
16477 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
16481 /// Result of 'and' or 'trunc to i1' is compared against zero.
16482 /// Change to a BT node if possible.
16483 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
16484 const SDLoc &dl, SelectionDAG &DAG) const {
16485 if (Op.getOpcode() == ISD::AND)
16486 return LowerAndToBT(Op, CC, dl, DAG);
16487 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
16488 return LowerTruncateToBT(Op, CC, dl, DAG);
16494 /// Turns an ISD::CondCode into a value suitable for an SSE floating-point mask CC.
16494 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
16499 // SSE Condition code mapping:
16508 switch (SetCCOpcode) {
16509 default: llvm_unreachable("Unexpected SETCC condition");
16511 case ISD::SETEQ: SSECC = 0; break;
16513 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
16515 case ISD::SETOLT: SSECC = 1; break;
16517 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
16519 case ISD::SETOLE: SSECC = 2; break;
16520 case ISD::SETUO: SSECC = 3; break;
16522 case ISD::SETNE: SSECC = 4; break;
16523 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
16524 case ISD::SETUGE: SSECC = 5; break;
16525 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
16526 case ISD::SETUGT: SSECC = 6; break;
16527 case ISD::SETO: SSECC = 7; break;
16529 case ISD::SETONE: SSECC = 8; break;
16532 std::swap(Op0, Op1);
16537 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
16538 /// concatenate the result back.
16539 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
16540 MVT VT = Op.getSimpleValueType();
16542 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
16543 "Unsupported value type for operation");
16545 unsigned NumElems = VT.getVectorNumElements();
16547 SDValue CC = Op.getOperand(2);
16549 // Extract the LHS vectors
16550 SDValue LHS = Op.getOperand(0);
16551 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
16552 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
16554 // Extract the RHS vectors
16555 SDValue RHS = Op.getOperand(1);
16556 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
16557 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
16559 // Issue the operation on the smaller types and concatenate the result back
16560 MVT EltVT = VT.getVectorElementType();
16561 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
16562 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
16563 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
16564 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
16567 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16568 SDValue Op0 = Op.getOperand(0);
16569 SDValue Op1 = Op.getOperand(1);
16570 SDValue CC = Op.getOperand(2);
16571 MVT VT = Op.getSimpleValueType();
16574 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
16575 "Unexpected type for boolean compare operation");
16576 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16577 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
16578 DAG.getConstant(-1, dl, VT));
16579 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
16580 DAG.getConstant(-1, dl, VT));
16581 switch (SetCCOpcode) {
16582 default: llvm_unreachable("Unexpected SETCC condition");
16584 // (x == y) -> ~(x ^ y)
16585 return DAG.getNode(ISD::XOR, dl, VT,
16586 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
16587 DAG.getConstant(-1, dl, VT));
16589 // (x != y) -> (x ^ y)
16590 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
16593 // (x > y) -> (x & ~y)
16594 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
16597 // (x < y) -> (~x & y)
16598 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
16601 // (x <= y) -> (~x | y)
16602 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
16605 // (x >= y) -> (x | ~y)
16606 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
16610 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
16612 SDValue Op0 = Op.getOperand(0);
16613 SDValue Op1 = Op.getOperand(1);
16614 SDValue CC = Op.getOperand(2);
16615 MVT VT = Op.getSimpleValueType();
16618 assert(VT.getVectorElementType() == MVT::i1 &&
16619 "Cannot set masked compare for this operation");
16621 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16623 bool Unsigned = false;
16626 switch (SetCCOpcode) {
16627 default: llvm_unreachable("Unexpected SETCC condition");
16628 case ISD::SETNE: SSECC = 4; break;
16629 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
16630 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
16631 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
16632 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
16633 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
16634 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
16635 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
16636 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
16637 case ISD::SETLE: SSECC = 2; break;
16641 std::swap(Op0, Op1);
16643 return DAG.getNode(Opc, dl, VT, Op0, Op1);
16644 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
16645 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16646 DAG.getConstant(SSECC, dl, MVT::i8));
16649 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
16650 /// operand \p Op1. If non-trivial (for example because it's not constant)
16651 /// return an empty value.
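/// For example, (x u< <4, 4, 4, 4>) becomes (x u<= <3, 3, 3, 3>); the rewrite
/// is rejected if any constant element is zero, since subtracting one would
/// underflow.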
16652 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
16653 SelectionDAG &DAG) {
16654 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
16658 MVT VT = Op1.getSimpleValueType();
16659 MVT EVT = VT.getVectorElementType();
16660 unsigned n = VT.getVectorNumElements();
16661 SmallVector<SDValue, 8> ULTOp1;
16663 for (unsigned i = 0; i < n; ++i) {
16664 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
16665 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
16668 // Avoid underflow.
16669 APInt Val = Elt->getAPIntValue();
16673 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
16676 return DAG.getBuildVector(VT, dl, ULTOp1);
16679 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
16680 SelectionDAG &DAG) {
16681 SDValue Op0 = Op.getOperand(0);
16682 SDValue Op1 = Op.getOperand(1);
16683 SDValue CC = Op.getOperand(2);
16684 MVT VT = Op.getSimpleValueType();
16685 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
16686 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
16691 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
16692 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
16696 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
16697 assert(VT.getVectorNumElements() <= 16);
16698 Opc = X86ISD::CMPM;
16700 Opc = X86ISD::CMPP;
16701 // The SSE/AVX packed FP comparison nodes are defined with a
16702 // floating-point vector result that matches the operand type. This allows
16703 // them to work with an SSE1 target (integer vector types are not legal).
16704 VT = Op0.getSimpleValueType();
16707 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
16708 // emit two comparisons and a logic op to tie them together.
16709 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is available.
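// Roughly: SETUEQ lowers to (CMPUNORD | CMPEQ) and SETONE lowers to
// (CMPORD & CMPNEQ).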
16712 unsigned SSECC = translateX86FSETCC(SetCCOpcode, Op0, Op1);
16714 // LLVM predicate is SETUEQ or SETONE.
16716 unsigned CombineOpc;
16717 if (SetCCOpcode == ISD::SETUEQ) {
16720 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
16721 static_cast<unsigned>(ISD::OR);
16723 assert(SetCCOpcode == ISD::SETONE);
16726 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
16727 static_cast<unsigned>(ISD::AND);
16730 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16731 DAG.getConstant(CC0, dl, MVT::i8));
16732 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
16733 DAG.getConstant(CC1, dl, MVT::i8));
16734 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
16736 // Handle all other FP comparisons here.
16737 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
16738 DAG.getConstant(SSECC, dl, MVT::i8));
16741 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
16742 // result type of SETCC. The bitcast is expected to be optimized away
16743 // during combining/isel.
16744 if (Opc == X86ISD::CMPP)
16745 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
16750 MVT VTOp0 = Op0.getSimpleValueType();
16751 assert(VTOp0 == Op1.getSimpleValueType() &&
16752 "Expected operands with same type!");
16753 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
16754 "Invalid number of packed elements for source and destination!");
16756 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
16757 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
16758 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
16759 // legalizer firstly checks if the first operand in input to the setcc has
16760 // a legal type. If so, then it promotes the return type to that same type.
16761 // Otherwise, the return type is promoted to the 'next legal type' which,
16762 // for a vector of MVT::i1 is always a 128-bit integer vector type.
16764 // We reach this code only if the following two conditions are met:
16765 // 1. Both return type and operand type have been promoted to wider types
16766 // by the type legalizer.
16767 // 2. The original operand type has been promoted to a 256-bit vector.
16769 // Note that condition 2. only applies for AVX targets.
16770 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, SetCCOpcode);
16771 return DAG.getZExtOrTrunc(NewOp, dl, VT);
16774 // The non-AVX512 code below works under the assumption that source and
16775 // destination types are the same.
16776 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
16777 "Value types for source and destination must be the same!");
16779 // Break 256-bit integer vector compare into smaller ones.
16780 if (VT.is256BitVector() && !Subtarget.hasInt256())
16781 return Lower256IntVSETCC(Op, DAG);
16783 // Operands are boolean (vectors of i1)
16784 MVT OpVT = Op1.getSimpleValueType();
16785 if (OpVT.getVectorElementType() == MVT::i1)
16786 return LowerBoolVSETCC_AVX512(Op, DAG);
16788 // The result is boolean, but operands are int/float
16789 if (VT.getVectorElementType() == MVT::i1) {
16790 // In the AVX-512 architecture, setcc returns a mask with i1 elements,
16791 // but there is no compare instruction for i8 and i16 elements in KNL.
16792 // In this case, use an SSE compare.
16793 bool UseAVX512Inst =
16794 (OpVT.is512BitVector() ||
16795 OpVT.getScalarSizeInBits() >= 32 ||
16796 (Subtarget.hasBWI() && Subtarget.hasVLX()));
16799 return LowerIntVSETCC_AVX512(Op, DAG);
16801 return DAG.getNode(ISD::TRUNCATE, dl, VT,
16802 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
16805 // Lower using XOP integer comparisons.
16806 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
16807 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
16808 // Translate compare code to XOP PCOM compare mode.
16809 unsigned CmpMode = 0;
16810 switch (SetCCOpcode) {
16811 default: llvm_unreachable("Unexpected SETCC condition");
16813 case ISD::SETLT: CmpMode = 0x00; break;
16815 case ISD::SETLE: CmpMode = 0x01; break;
16817 case ISD::SETGT: CmpMode = 0x02; break;
16819 case ISD::SETGE: CmpMode = 0x03; break;
16820 case ISD::SETEQ: CmpMode = 0x04; break;
16821 case ISD::SETNE: CmpMode = 0x05; break;
16824 // Are we comparing unsigned or signed integers?
16825 unsigned Opc = ISD::isUnsignedIntSetCC(SetCCOpcode)
16826 ? X86ISD::VPCOMU : X86ISD::VPCOM;
16828 return DAG.getNode(Opc, dl, VT, Op0, Op1,
16829 DAG.getConstant(CmpMode, dl, MVT::i8));
16832 // We are handling one of the integer comparisons here. Since SSE only has
16833 // GT and EQ comparisons for integers, swapping operands and multiple
16834 // operations may be required for some comparisons.
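// For example, SETLE is emitted as NOT (PCMPGT Op0, Op1), and SETULT swaps
// the operands and flips the sign bit of both inputs so that a signed PCMPGT
// produces the unsigned result.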
16836 bool Swap = false, Invert = false, FlipSigns = false, MinMax = false;
16837 bool Subus = false;
16839 switch (SetCCOpcode) {
16840 default: llvm_unreachable("Unexpected SETCC condition");
16841 case ISD::SETNE: Invert = true;
16842 case ISD::SETEQ: Opc = X86ISD::PCMPEQ; break;
16843 case ISD::SETLT: Swap = true;
16844 case ISD::SETGT: Opc = X86ISD::PCMPGT; break;
16845 case ISD::SETGE: Swap = true;
16846 case ISD::SETLE: Opc = X86ISD::PCMPGT;
16847 Invert = true; break;
16848 case ISD::SETULT: Swap = true;
16849 case ISD::SETUGT: Opc = X86ISD::PCMPGT;
16850 FlipSigns = true; break;
16851 case ISD::SETUGE: Swap = true;
16852 case ISD::SETULE: Opc = X86ISD::PCMPGT;
16853 FlipSigns = true; Invert = true; break;
16856 // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool hasMinMax =
      (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
      (Subtarget.hasSSE2() && (VET == MVT::i8));

  if (hasMinMax) {
    switch (SetCCOpcode) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }

    if (MinMax) { Swap = false; Invert = false; FlipSigns = false; }
  }
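  // With min/max, (setule x, y) later becomes (pcmpeq (umin x, y), x), since
  // x <=u y exactly when min(x, y) == x; no sign flipping or swap is needed.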
16872 bool hasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
16873 if (!MinMax && hasSubus) {
16874 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
16876 // t = psubus Op0, Op1
16877 // pcmpeq t, <0..0>
16878 switch (SetCCOpcode) {
16880 case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in a register is no longer clobbered
      // as the destination operand and can be hoisted out of a loop.
      // Only do this pre-AVX, since the VEX-encoded vpcmp* forms are not
      // destructive anyway.
      if (Subtarget.hasAVX())
        break;
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
16894 // Psubus is better than flip-sign because it requires no inversion.
16895 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
16896 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);
16908 // Check that the operation in question is available (most are plain SSE2,
16909 // but PCMPGTQ and PCMPEQQ have different requirements).
16910 if (VT == MVT::v2i64) {
16911 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
16912 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
16914 // First cast everything to the right type.
16915 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16916 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16918 // Since SSE has no unsigned integer comparisons, we need to flip the sign
16919 // bits of the inputs before performing those operations. The lower
16920 // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
16932 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
16933 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
16934 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
16936 // Create masks for only the low parts/high parts of the 64 bit integers.
16937 static const int MaskHi[] = { 1, 1, 3, 3 };
16938 static const int MaskLo[] = { 0, 0, 2, 2 };
16939 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
16940 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
16941 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
16943 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
16944 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
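      // Each 32-bit element of Result now holds its lane's 64-bit greater-than
      // answer, duplicated into both halves of the lane.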
      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);
16949 return DAG.getBitcast(VT, Result);
16952 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
16953 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
16954 // pcmpeqd + pshufd + pand.
16955 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
16957 // First cast everything to the right type.
16958 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
16959 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
16962 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
16964 // Make sure the lower and upper halves are both all-ones.
16965 static const int Mask[] = { 1, 0, 3, 2 };
16966 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
16967 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
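    // Swapping the dwords within each lane and ANDing leaves a 64-bit lane
    // all-ones only if both of its 32-bit halves compared equal.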
    if (Invert)
      Result = DAG.getNOT(dl, Result, MVT::v4i32);
16972 return DAG.getBitcast(VT, Result);
  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SB = DAG.getConstant(APInt::getSignBit(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SB);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SB);
  }
16986 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}
17002 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17004 MVT VT = Op.getSimpleValueType();
17006 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17008 assert(((!Subtarget.hasAVX512() && VT == MVT::i8) || (VT == MVT::i1))
17009 && "SetCC type must be 8-bit or 1-bit integer");
17010 SDValue Op0 = Op.getOperand(0);
17011 SDValue Op1 = Op.getOperand(1);
17013 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17015 // Optimize to BT if possible.
17016 // Lower (X & (1 << N)) == 0 to BT(X, N).
17017 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17018 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17019 // Lower (trunc (X >> N) to i1) to BT(X, N).
17020 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17021 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17022 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }
  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
17031 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17032 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17034 // If the input is a setcc, then reuse the input setcc or use a new one with
17035 // the inverted condition.
17036 if (Op0.getOpcode() == X86ISD::SETCC) {
17037 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17038 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
17043 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
17049 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17050 if (isOneConstant(Op1)) {
17051 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17052 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17054 if (!isNullConstant(Op1)) {
17055 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17056 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17060 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17061 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();
17065 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17066 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17067 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}
17073 SDValue X86TargetLowering::LowerSETCCE(SDValue Op, SelectionDAG &DAG) const {
17074 SDValue LHS = Op.getOperand(0);
17075 SDValue RHS = Op.getOperand(1);
17076 SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);
17080 assert(LHS.getSimpleValueType().isInteger() && "SETCCE is integer only.");
17081 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17083 assert(Carry.getOpcode() != ISD::CARRY_FALSE);
17084 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
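  // X86ISD::SBB computes LHS - RHS - Carry and produces EFLAGS as a second
  // result; the SETCC below then materializes the requested condition from
  // those flags.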
17085 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry);
17086 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17087 if (Op.getSimpleValueType() == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  return SetCC;
}
17092 /// Return true if opcode is a X86 logical comparison.
17093 static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;
  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;
  return false;
}
17111 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;
17115 SDValue VOp0 = V.getOperand(0);
17116 unsigned InBits = VOp0.getValueSizeInBits();
17117 unsigned Bits = V.getValueSizeInBits();
17118 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17121 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17122 bool AddTest = true;
17123 SDValue Cond = Op.getOperand(0);
17124 SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;
17130 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17131 // are available or VBLENDV if AVX is available.
17132 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17133 if (Cond.getOpcode() == ISD::SETCC &&
17134 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17135 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17136 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17137 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17138 int SSECC = translateX86FSETCC(
17139 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
    if (SSECC != 8) {
      if (Subtarget.hasAVX512()) {
17143 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CondOp0,
17144 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17145 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17146 DL, VT, Cmp, Op1, Op2);
17149 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17150 DAG.getConstant(SSECC, DL, MVT::i8));
17152 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17153 // of 3 logic instructions for size savings and potentially speed.
17154 // Unfortunately, there is no scalar form of VBLENDV.
17156 // If either operand is a constant, don't try this. We can expect to
17157 // optimize away at least one of the logic instructions later in that
17158 // case, so that sequence would be faster than a variable blend.
17160 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17161 // uses XMM0 as the selection register. That may need just as many
      // instructions as the AND/ANDN/OR sequence due to register moves, so
      // don't bother.
17165 if (Subtarget.hasAVX() &&
17166 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17168 // Convert to vectors, do a VSELECT, and convert back to scalar.
17169 // All of the conversions should be optimized away.
17171 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17172 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17173 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17174 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17176 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17177 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17179 SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
17181 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17182 VSel, DAG.getIntPtrConstant(0, DL));
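      // Without AVX, fall back to the classic three-instruction bit select:
      // (Cmp & Op1) | (~Cmp & Op2), using the all-ones/all-zeros compare mask.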
17184 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17185 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17186 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17190 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17191 if (Cond.getValueType() == MVT::i1 && (VT == MVT::f64 || VT == MVT::f32) &&
17192 Subtarget.hasAVX512())
17193 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cond, Op1, Op2);
17195 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17198 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17199 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17200 Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17203 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17204 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17205 Op2Scalar = Op2.getOperand(0);
17206 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17207 SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
17208 Op1Scalar.getValueType(),
17209 Cond, Op1Scalar, Op2Scalar);
17210 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17211 return DAG.getBitcast(VT, newSelect);
17212 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17213 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17214 DAG.getIntPtrConstant(0, DL));
17218 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17219 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17220 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17221 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17222 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17223 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
                                    Cond, Op1, Op2);
17226 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17229 if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
17232 // If the condition was updated, it's possible that the operands of the
17233 // select were also updated (for example, EmitTest has a RAUW). Refresh
17234 // the local references to the select operands in case they got stale.
17235 Op1 = Op.getOperand(1);
17236 Op2 = Op.getOperand(2);
17240 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17241 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17242 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17243 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17244 if (Cond.getOpcode() == X86ISD::SETCC &&
17245 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17246 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17247 SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode = cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17251 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17252 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17253 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17255 SDValue CmpOp0 = Cmp.getOperand(0);
17256 // Apply further optimizations for special cases
17257 // (select (x != 0), -1, 0) -> neg & sbb
17258 // (select (x == 0), 0, -1) -> neg & sbb
17259 if (isNullConstant(Y) &&
17260 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17261 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17262 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs,
17263 DAG.getConstant(0, DL,
                                                  CmpOp0.getValueType()),
                                  CmpOp0);
17266 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17267 DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }
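      // General case: compare x against 1 so that CF is set exactly when
      // x == 0, then let SETCC_CARRY (an SBB of a register with itself)
      // broadcast CF into an all-ones or all-zeros value, which is optionally
      // inverted and OR'ed with Y below.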
17272 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17273 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17274 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17276 SDValue Res = // Res = 0 or -1.
17277 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17278 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17280 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17281 Res = DAG.getNOT(DL, Res, Res.getValueType());
17283 if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    }
  }
17289 // Look past (and (setcc_carry (cmp ...)), 1).
17290 if (Cond.getOpcode() == ISD::AND &&
17291 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17292 isOneConstant(Cond.getOperand(1)))
17293 Cond = Cond.getOperand(0);
17295 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17296 // setting operand in place of the X86ISD::SETCC.
17297 unsigned CondOpcode = Cond.getOpcode();
17298 if (CondOpcode == X86ISD::SETCC ||
17299 CondOpcode == X86ISD::SETCC_CARRY) {
17300 CC = Cond.getOperand(0);
17302 SDValue Cmp = Cond.getOperand(1);
17303 unsigned Opc = Cmp.getOpcode();
17304 MVT VT = Op.getSimpleValueType();
17306 bool IllegalFPCMov = false;
17307 if (VT.isFloatingPoint() && !VT.isVector() &&
17308 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17309 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17311 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
17316 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17317 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17318 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17319 Cond.getOperand(0).getValueType() != MVT::i8)) {
17320 SDValue LHS = Cond.getOperand(0);
17321 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
17325 switch (CondOpcode) {
17326 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17327 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17328 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17329 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17330 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17331 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17332 default: llvm_unreachable("unexpected overflowing operator");
17334 if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17340 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17342 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);
    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
17352 // Look past the truncate if the high bits are known zero.
17353 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17354 Cond = Cond.getOperand(0);
    // We know the result of AND is compared against zero. Try to match
    // it to BT.
17358 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17359 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17360 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }
17372 // a < b ? -1 : 0 -> RES = ~setcc_carry
17373 // a < b ? 0 : -1 -> RES = setcc_carry
17374 // a >= b ? -1 : 0 -> RES = setcc_carry
17375 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17376 if (Cond.getOpcode() == X86ISD::SUB) {
17377 Cond = ConvertCmpIfNecessary(Cond, DAG);
17378 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17380 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17381 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17382 (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }
17392 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17393 // widen the cmov and push the truncate through. This avoids introducing a new
17394 // branch during isel and doesn't add any extensions.
17395 if (Op.getValueType() == MVT::i8 &&
17396 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17397 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17398 if (T1.getValueType() == T2.getValueType() &&
17399 // Blacklist CopyFromReg to avoid partial register stalls.
17400 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17401 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17402 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17403 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17407 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17408 // condition is true.
17409 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17410 SDValue Ops[] = { Op2, Op1, CC, Cond };
17411 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17414 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
17415 const X86Subtarget &Subtarget,
17416 SelectionDAG &DAG) {
17417 MVT VT = Op->getSimpleValueType(0);
17418 SDValue In = Op->getOperand(0);
17419 MVT InVT = In.getSimpleValueType();
17420 MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);
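  // With BWI (i8/i16 elements) or DQI (i32/i64 elements), plus VLX for the
  // sub-512-bit cases, an i1 mask can be sign-extended with a single VSEXT
  // (vpmovm2*) node.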
17425 if ((InVTElt == MVT::i1) &&
17426 (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
17427 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
17429 ((Subtarget.hasBWI() && VT.is512BitVector() &&
17430 VTElt.getSizeInBits() <= 16)) ||
17432 ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
17433 VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
17435 ((Subtarget.hasDQI() && VT.is512BitVector() &&
17436 VTElt.getSizeInBits() >= 32))))
17437 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
17439 unsigned NumElts = VT.getVectorNumElements();
17441 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
17442 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
17443 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
17444 return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
17445 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
  if (InVTElt != MVT::i1)
    return SDValue();

  MVT ExtVT = VT;
17452 if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue V;
17456 if (Subtarget.hasDQI()) {
17457 V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
    assert(!VT.is512BitVector() && "Unexpected vector type");
  } else {
17460 SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
17461 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
    V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
    if (ExtVT == VT)
      return V;
  }
17467 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
17470 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
17471 // For sign extend this needs to handle all vector sizes and SSE4.1 and
17472 // non-SSE4.1 targets. For zero extend this should only handle inputs of
17473 // MVT::v64i8 when BWI is not supported, but AVX512 is.
17474 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
17475 const X86Subtarget &Subtarget,
17476 SelectionDAG &DAG) {
17477 SDValue In = Op->getOperand(0);
17478 MVT VT = Op->getSimpleValueType(0);
17479 MVT InVT = In.getSimpleValueType();
17480 assert(VT.getSizeInBits() == InVT.getSizeInBits());
17482 MVT SVT = VT.getVectorElementType();
17483 MVT InSVT = InVT.getVectorElementType();
17484 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);
17497 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
17498 // For 512-bit vectors, we need 128-bits or 256-bits.
17499 if (VT.getSizeInBits() > 128) {
17500 // Input needs to be at least the same number of elements as output, and
17501 // at least 128-bits.
17502 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
17503 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
17506 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
17507 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
17509 // SSE41 targets can use the pmovsx* instructions directly.
17510 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
17511 X86ISD::VSEXT : X86ISD::VZEXT;
17512 if (Subtarget.hasSSE41())
17513 return DAG.getNode(ExtOpc, dl, VT, In);
17515 // We should only get here for sign extend.
17516 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
17517 "Unexpected opcode!");
  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;
17523 // As SRAI is only available on i16/i32 types, we expand only up to i32
17524 // and handle i64 separately.
17525 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
17526 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
17527 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
17528 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
17529 Curr = DAG.getBitcast(CurrVT, Curr);
17532 SDValue SignExt = Curr;
17533 if (CurrVT != InVT) {
17534 unsigned SignExtShift =
17535 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
17536 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17537 DAG.getConstant(SignExtShift, dl, MVT::i8));
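  // For v2i64 there is no 64-bit SRAI; instead, build the high dwords from
  // the sign bits and interleave them with the sign-extended low dwords.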
17543 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
17544 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
17545 DAG.getConstant(31, dl, MVT::i8));
17546 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return DAG.getBitcast(VT, SignExt);
}
17553 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
17554 SelectionDAG &DAG) {
17555 MVT VT = Op->getSimpleValueType(0);
17556 SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);
17560 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
17561 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
17563 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
17564 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();
17568 if (Subtarget.hasInt256())
17569 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
  // Optimize vectors in AVX mode:
  // Sign extend v8i16 to v8i32 and v4i32 to v4i64.
  //
  // Divide the input vector into two parts:
  // for v4i32 the shuffle masks will be { 0, 1, -1, -1 } and { 2, 3, -1, -1 },
  // use the vpmovsx instruction to extend v4i32 -> v2i64 and v8i16 -> v4i32,
  // then concat the results back to the original VT.
17580 unsigned NumElems = InVT.getVectorNumElements();
17581 SDValue Undef = DAG.getUNDEF(InVT);
17583 SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;
17587 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
17589 SmallVector<int,8> ShufMask2(NumElems, -1);
17590 for (unsigned i = 0; i != NumElems/2; ++i)
17591 ShufMask2[i] = i + NumElems/2;
17593 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
17595 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
17596 VT.getVectorNumElements() / 2);
17598 OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
17599 OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
17601 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
17604 // Lower truncating store. We need a special lowering to vXi1 vectors
17605 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
17606 SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
  SDLoc dl(St);
17609 EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
17611 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
17612 "Expected truncstore of i1 vector");
17614 SDValue Op = St->getValue();
17615 MVT OpVT = Op.getValueType().getSimpleVT();
17616 unsigned NumElts = OpVT.getVectorNumElements();
17617 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17619 // Truncate and store - everything is legal
17620 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
17621 if (MemVT.getSizeInBits() < 8)
17622 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
17623 DAG.getUNDEF(MVT::v8i1), Op,
17624 DAG.getIntPtrConstant(0, dl));
17625 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17626 St->getMemOperand());
17629 // A subset, assume that we have only AVX-512F
17630 if (NumElts <= 8) {
17632 // Extend to 8-elts vector
17633 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
17634 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
17635 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
17637 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
17638 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
17639 St->getMemOperand());
17642 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
17643 // Divide the vector into 2 parts and store each part separately
17644 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17645 DAG.getIntPtrConstant(0, dl));
17646 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
17647 SDValue BasePtr = St->getBasePtr();
17648 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
17649 St->getMemOperand());
17650 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
17651 DAG.getIntPtrConstant(16, dl));
17652 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
17654 SDValue BasePtrHi =
17655 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17656 DAG.getConstant(2, dl, BasePtr.getValueType()));
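  // The upper 16 mask bits are stored 2 bytes (16 x i1) past the base pointer.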
17658 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
17659 BasePtrHi, St->getMemOperand());
17660 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
17663 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
17664 const X86Subtarget &Subtarget,
17665 SelectionDAG &DAG) {
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Op);
17669 EVT MemVT = Ld->getMemoryVT();
17670 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
17671 "Expected i1 vector load");
17672 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
17673 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
17674 MVT VT = Op.getValueType().getSimpleVT();
17675 unsigned NumElts = VT.getVectorNumElements();
17677 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
17679 // Load and extend - everything is legal
17681 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
                                 Ld->getBasePtr(),
                                 Ld->getMemOperand());
17684 // Replace chain users with the new chain.
17685 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17686 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17687 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17688 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
17690 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17691 DAG.getIntPtrConstant(0, dl));
17693 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
                             Ld->getBasePtr(),
                             Ld->getMemOperand());
17696 // Replace chain users with the new chain.
17697 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17698 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17700 // Finally, do a normal sign-extend to the desired register.
17701 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
17704 if (NumElts <= 8) {
17705 // A subset, assume that we have only AVX-512F
17706 unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
17707 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
17708 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
17711 // Replace chain users with the new chain.
17712 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17713 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17715 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
17716 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
    if (NumElts == 8)
      return DAG.getNode(ExtOpcode, dl, VT, BitVec);
    // For v4i1 and v2i1 we need to extend from the v8i1 mask and then
    // extract the low subvector.
17723 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
17724 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
17725 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
17726 DAG.getIntPtrConstant(0, dl));
17729 assert(VT == MVT::v32i8 && "Unexpected extload type");
17731 SmallVector<SDValue, 2> Chains;
17733 SDValue BasePtr = Ld->getBasePtr();
17734 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               BasePtr,
                               Ld->getMemOperand());
17737 Chains.push_back(LoadLo.getValue(1));
17739 SDValue BasePtrHi =
17740 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
17741 DAG.getConstant(2, dl, BasePtr.getValueType()));
17743 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               BasePtrHi,
                               Ld->getMemOperand());
17746 Chains.push_back(LoadHi.getValue(1));
17747 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17748 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
17750 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
17751 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
17752 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
17755 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
17756 // may emit an illegal shuffle but the expansion is still better than scalar
17757 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
17759 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
17760 // TODO: It is possible to support ZExt by zeroing the undef values during
17761 // the shuffle phase or after the shuffle.
17762 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
17763 SelectionDAG &DAG) {
17764 MVT RegVT = Op.getSimpleValueType();
17765 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
17766 assert(RegVT.isInteger() &&
17767 "We only custom lower integer vector sext loads.");
17769 // Nothing useful we can do without SSE2 shuffles.
17770 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
17774 EVT MemVT = Ld->getMemoryVT();
17775 if (MemVT.getScalarType() == MVT::i1)
17776 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
17778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17779 unsigned RegSz = RegVT.getSizeInBits();
17781 ISD::LoadExtType Ext = Ld->getExtensionType();
17783 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
17784 && "Only anyext and sext are currently implemented.");
17785 assert(MemVT != RegVT && "Cannot extend to the same type");
17786 assert(MemVT.isVector() && "Must load a vector from memory");
17788 unsigned NumElems = RegVT.getVectorNumElements();
17789 unsigned MemSz = MemVT.getSizeInBits();
17790 assert(RegSz > MemSz && "Register size must be greater than the mem size");
17792 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
17793 // The only way in which we have a legal 256-bit vector result but not the
17794 // integer 256-bit operations needed to directly lower a sextload is if we
17795 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
17796 // a 128-bit vector and a normal sign_extend to 256-bits that should get
17797 // correctly legalized. We do this late to allow the canonical form of
17798 // sextload to persist throughout the rest of the DAG combiner -- it wants
17799 // to fold together any extensions it can, and so will fuse a sign_extend
17800 // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
      // Just switch this to a normal load.
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
                                       "it must be a legal 128-bit vector "
                                       "type!");
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
17808 Ld->getPointerInfo(), Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    } else {
17811 assert(MemSz < 128 &&
17812 "Can't extend a type wider than 128 bits to a 256 bit vector!");
17813 // Do an sext load to a 128-bit vector type. We want to use the same
17814 // number of elements, but elements half as wide. This will end up being
17815 // recursively lowered by this routine, but will succeed as we definitely
17816 // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
17819 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
17822 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
17823 Ld->getMemOperand()->getFlags());
17826 // Replace chain users with the new chain.
17827 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
17828 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
17830 // Finally, do a normal sign-extend to the desired register.
17831 return DAG.getSExtOrTrunc(Load, dl, RegVT);
17834 // All sizes must be a power of two.
17835 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
17836 "Non-power-of-two elements are not custom lowered!");
17838 // Attempt to load the original value using scalar loads.
17839 // Find the largest scalar type that divides the total loaded size.
17840 MVT SclrLoadTy = MVT::i8;
17841 for (MVT Tp : MVT::integer_valuetypes()) {
    if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
      SclrLoadTy = Tp;
    }
  }
17847 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
  if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
      (64 <= MemSz))
    SclrLoadTy = MVT::f64;
17852 // Calculate the number of scalar loads that we need to perform
17853 // in order to load our vector from memory.
17854 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
17856 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
17857 "Can only lower sext loads with a single scalar load!");
17859 unsigned loadRegZize = RegSz;
  if (Ext == ISD::SEXTLOAD && RegSz >= 256)
    loadRegZize = 128;
17863 // Represent our vector as a sequence of elements which are the
17864 // largest scalar that we can load.
17865 EVT LoadUnitVecVT = EVT::getVectorVT(
17866 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
  // Represent the data using the same element type that is stored in
  // memory. In practice, we "widen" MemVT.
  EVT WideVecVT =
      EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
                       loadRegZize / MemVT.getScalarSizeInBits());
17874 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
17875 "Invalid vector type");
17877 // We can't shuffle using an illegal type.
17878 assert(TLI.isTypeLegal(WideVecVT) &&
17879 "We only lower types that form legal widened vector types");
17881 SmallVector<SDValue, 8> Chains;
17882 SDValue Ptr = Ld->getBasePtr();
17883 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
17884 TLI.getPointerTy(DAG.getDataLayout()));
17885 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
17887 for (unsigned i = 0; i < NumLoads; ++i) {
17888 // Perform a single load.
17889 SDValue ScalarLoad =
17890 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
17891 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
17892 Chains.push_back(ScalarLoad.getValue(1));
17893 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
17894 // another round of DAGCombining.
    if (i == 0)
      Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
    else
      Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
                        ScalarLoad, DAG.getIntPtrConstant(i, dl));
17901 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
17904 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
17906 // Bitcast the loaded value to a vector of the original element type, in
17907 // the size of the target vector type.
17908 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
17909 unsigned SizeRatio = RegSz / MemSz;
17911 if (Ext == ISD::SEXTLOAD) {
17912 // If we have SSE4.1, we can directly emit a VSEXT node.
17913 if (Subtarget.hasSSE41()) {
17914 SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
      return Sext;
    }
    // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
    // lanes.
17921 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
17922 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
17924 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
    return Shuff;
  }
17929 // Redistribute the loaded elements into the different locations.
17930 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
17931 for (unsigned i = 0; i != NumElems; ++i)
17932 ShuffleVec[i * SizeRatio] = i;
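  // For example, an extload from v4i16 to v4i32 has SizeRatio 2 and uses the
  // mask { 0, -1, 1, -1, 2, -1, 3, -1 }; the undef odd elements become the
  // (ignored) high halves of each widened element.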
17934 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
17935 DAG.getUNDEF(WideVecVT), ShuffleVec);
17937 // Bitcast to the requested type.
17938 Shuff = DAG.getBitcast(RegVT, Shuff);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
  return Shuff;
}
17943 /// Return true if node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes
17944 /// each of which has no other use apart from the AND / OR.
17945 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
17946 Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
17949 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
17950 Op.getOperand(0).hasOneUse() &&
17951 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
17952 Op.getOperand(1).hasOneUse());
17955 /// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
17956 /// SETCC node has a single use.
17957 static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  if (isOneConstant(Op.getOperand(1)))
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}
17966 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
17967 bool addTest = true;
17968 SDValue Chain = Op.getOperand(0);
17969 SDValue Cond = Op.getOperand(1);
  SDValue Dest = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
17973 bool Inverted = false;
17975 if (Cond.getOpcode() == ISD::SETCC) {
17976 // Check for setcc([su]{add,sub,mul}o == 0).
17977 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
17978 isNullConstant(Cond.getOperand(1)) &&
17979 Cond.getOperand(0).getResNo() == 1 &&
17980 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
17981 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
17982 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
17983 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
17984 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
17985 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
        Cond = NewCond;
    }
  }
17994 // FIXME: LowerXALUO doesn't handle these!!
17995 else if (Cond.getOpcode() == X86ISD::ADD ||
17996 Cond.getOpcode() == X86ISD::SUB ||
17997 Cond.getOpcode() == X86ISD::SMUL ||
17998 Cond.getOpcode() == X86ISD::UMUL)
17999 Cond = LowerXALUO(Cond, DAG);
  // Look past (and (setcc_carry (cmp ...)), 1).
18003 if (Cond.getOpcode() == ISD::AND &&
18004 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18005 isOneConstant(Cond.getOperand(1)))
18006 Cond = Cond.getOperand(0);
18008 // If condition flag is set by a X86ISD::CMP, then use it as the condition
18009 // setting operand in place of the X86ISD::SETCC.
18010 unsigned CondOpcode = Cond.getOpcode();
18011 if (CondOpcode == X86ISD::SETCC ||
18012 CondOpcode == X86ISD::SETCC_CARRY) {
18013 CC = Cond.getOperand(0);
18015 SDValue Cmp = Cond.getOperand(1);
18016 unsigned Opc = Cmp.getOpcode();
18017 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);
        addTest = false;
        break;
      }
    }
  }
18034 CondOpcode = Cond.getOpcode();
18035 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18036 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18037 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18038 Cond.getOperand(0).getValueType() != MVT::i8)) {
18039 SDValue LHS = Cond.getOperand(0);
18040 SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
18044 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18045 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18047 switch (CondOpcode) {
18048 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18055 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18062 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18063 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18064 default: llvm_unreachable("unexpected overflowing operator");
    }

    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18068 if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18074 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18076 if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
18085 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18086 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18087 if (CondOpc == ISD::OR) {
18088 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
      // two branches instead of an explicit OR instruction with a
      // separate test.
18091 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18092 isX86LogicalCmp(Cmp)) {
18093 CC = Cond.getOperand(0).getOperand(0);
18094 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18095 Chain, Dest, CC, Cmp);
        CC = Cond.getOperand(1).getOperand(0);
        Cond = Cmp;
        addTest = false;
      }
18100 } else { // ISD::AND
18101 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18102 // two branches instead of an explicit AND instruction with a
18103 // separate test. However, we only do this if this block doesn't
18104 // have a fall-through edge, because this requires an explicit
18105 // jmp when the condition is false.
18106 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18107 isX86LogicalCmp(Cmp) &&
18108 Op.getNode()->hasOneUse()) {
18109 X86::CondCode CCode =
18110 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18111 CCode = X86::GetOppositeBranchCondition(CCode);
18112 CC = DAG.getConstant(CCode, dl, MVT::i8);
18113 SDNode *User = *Op.getNode()->use_begin();
18114 // Look for an unconditional branch following this conditional branch.
18115 // We need this because we need to reverse the successors in order
18116 // to implement FCMP_OEQ.
18117 if (User->getOpcode() == ISD::BR) {
18118 SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User); (void)NewBR;
          Dest = FalseBB;
18125 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18126 Chain, Dest, CC, Cmp);
18127 X86::CondCode CCode =
18128 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18129 CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
18136 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
    // Recognize (xorb (setcc), 1) patterns. The xor inverts the condition.
    // The DAG combiner should normally transform these, except when the
    // condition is set by an arithmetic-with-overflow node.
18140 X86::CondCode CCode =
18141 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18142 CCode = X86::GetOppositeBranchCondition(CCode);
18143 CC = DAG.getConstant(CCode, dl, MVT::i8);
    Cond = Cond.getOperand(0).getOperand(1);
    addTest = false;
18146 } else if (Cond.getOpcode() == ISD::SETCC &&
18147 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18148 // For FCMP_OEQ, we can emit
18149 // two branches instead of an explicit AND instruction with a
18150 // separate test. However, we only do this if this block doesn't
18151 // have a fall-through edge, because this requires an explicit
18152 // jmp when the condition is false.
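    // With ucomiss/ucomisd, OEQ means ZF set and PF clear, so the lowered
    // sequence jumps to the false block on NE and then again on P.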
18153 if (Op.getNode()->hasOneUse()) {
18154 SDNode *User = *Op.getNode()->use_begin();
18155 // Look for an unconditional branch following this conditional branch.
18156 // We need this because we need to reverse the successors in order
18157 // to implement FCMP_OEQ.
18158 if (User->getOpcode() == ISD::BR) {
18159 SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User); (void)NewBR;
        Dest = FalseBB;
18166 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18167 Cond.getOperand(0), Cond.getOperand(1));
18168 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18169 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18170 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18171 Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
18177 } else if (Cond.getOpcode() == ISD::SETCC &&
18178 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18179 // For FCMP_UNE, we can emit
18180 // two branches instead of an explicit AND instruction with a
18181 // separate test. However, we only do this if this block doesn't
18182 // have a fall-through edge, because this requires an explicit
18183 // jmp when the condition is false.
18184 if (Op.getNode()->hasOneUse()) {
18185 SDNode *User = *Op.getNode()->use_begin();
18186 // Look for an unconditional branch following this conditional branch.
18187 // We need this because we need to reverse the successors in order
18188 // to implement FCMP_UNE.
18189 if (User->getOpcode() == ISD::BR) {
18190 SDValue FalseBB = User->getOperand(1);
        SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
        assert(NewBR == User); (void)NewBR;
18196 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18197 Cond.getOperand(0), Cond.getOperand(1));
18198 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18199 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18200 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18201 Chain, Dest, CC, Cmp);
        CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
        Cond = Cmp;
        addTest = false;
        Dest = FalseBB;
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
18213 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18214 Cond = Cond.getOperand(0);
18216 // We know the result is compared against zero. Try to match it to BT.
18217 if (Cond.hasOneUse()) {
18218 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18219 CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
18227 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18228 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18229 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18231 Cond = ConvertCmpIfNecessary(Cond, DAG);
18232 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18233 Chain, Dest, CC, Cond);
18236 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18237 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18238 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18239 // that the guard pages used by the OS virtual memory manager are allocated in
18240 // correct sequence.
18242 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18243 SelectionDAG &DAG) const {
18244 MachineFunction &MF = DAG.getMachineFunction();
18245 bool SplitStack = MF.shouldSplitStack();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack;
  SDLoc dl(Op);
18251 SDNode *Node = Op.getNode();
18252 SDValue Chain = Op.getOperand(0);
18253 SDValue Size = Op.getOperand(1);
18254 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18255 EVT VT = Node->getValueType(0);
18257 // Chain the dynamic stack allocation so that it doesn't modify the stack
18258 // pointer when other instructions are using the stack.
18259 Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
18261 bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;

  if (!Lower) {
18266 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18267 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18268 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18269 " not tell us which reg is the stack pointer!");
18271 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18272 Chain = SP.getValue(1);
18273 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18274 unsigned StackAlign = TFI.getStackAlignment();
18275 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18276 if (Align > StackAlign)
18277 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18278 DAG.getConstant(-(uint64_t)Align, dl, VT));
18279 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18280 } else if (SplitStack) {
18281 MachineRegisterInfo &MRI = MF.getRegInfo();
    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
18286 const Function *F = MF.getFunction();
18287 for (const auto &A : F->args()) {
18288 if (A.hasNestAttr())
18289 report_fatal_error("Cannot use segmented stacks with functions that "
18290 "have nested arguments.");
18294 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18295 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18296 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18297 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
18300 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18301 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18302 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18304 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18305 unsigned SPReg = RegInfo->getStackRegister();
18306 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
18310 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18311 DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }
18318 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18319 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18321 SDValue Ops[2] = {Result, Chain};
18322 return DAG.getMergeValues(Ops, dl);
18325 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18326 MachineFunction &MF = DAG.getMachineFunction();
18327 auto PtrVT = getPointerTy(MF.getDataLayout());
18328 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);
18333 if (!Subtarget.is64Bit() ||
18334 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18335 // vastart just stores the address of the VarArgsFrameIndex slot into the
18336 // memory location argument.
18337 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18338 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18339 MachinePointerInfo(SV));
  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
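  // In C terms:
  //   struct __va_list_tag {
  //     unsigned gp_offset;
  //     unsigned fp_offset;
  //     void *overflow_arg_area;
  //     void *reg_save_area;
  //   };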
18347 SmallVector<SDValue, 8> MemOps;
18348 SDValue FIN = Op.getOperand(1);
18350 SDValue Store = DAG.getStore(
18351 Op.getOperand(0), DL,
18352 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18353 MachinePointerInfo(SV));
18354 MemOps.push_back(Store);
18357 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18358 Store = DAG.getStore(
18359 Op.getOperand(0), DL,
18360 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18361 MachinePointerInfo(SV, 4));
18362 MemOps.push_back(Store);
18364 // Store ptr to overflow_arg_area
18365 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18366 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18369 MemOps.push_back(Store);
18371 // Store ptr to reg_save_area.
18372 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18373 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18374 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18375 Store = DAG.getStore(
18376 Op.getOperand(0), DL, RSFIN, FIN,
18377 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18378 MemOps.push_back(Store);
18379 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18382 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18383 assert(Subtarget.is64Bit() &&
18384 "LowerVAARG only handles 64-bit va_arg!");
18385 assert(Op.getNumOperands() == 4);
18387 MachineFunction &MF = DAG.getMachineFunction();
18388 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18389 // The Win64 ABI uses char* instead of a structure.
18390 return DAG.expandVAArg(Op.getNode());
18392 SDValue Chain = Op.getOperand(0);
18393 SDValue SrcPtr = Op.getOperand(1);
18394 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18395 unsigned Align = Op.getConstantOperandVal(3);
18398 EVT ArgVT = Op.getNode()->getValueType(0);
18399 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18400 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18403 // Decide which area this value should be read from.
18404 // TODO: Implement the AMD64 ABI in its entirety. This simple
18405 // selection mechanism works only for the basic types.
18406 if (ArgVT == MVT::f80) {
18407 llvm_unreachable("va_arg for f80 not yet implemented");
18408 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18409 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18410 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18411 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18413 llvm_unreachable("Unhandled argument type in LowerVAARG");
18416 if (ArgMode == 2) {
18417 // Sanity Check: Make sure using fp_offset makes sense.
18418 assert(!Subtarget.useSoftFloat() &&
18419 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
18420 Subtarget.hasSSE1());
18423 // Insert the VAARG_64 node into the DAG.
18424 // VAARG_64 returns two values: the variable argument address and the chain.
18425 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
18426 DAG.getConstant(ArgMode, dl, MVT::i8),
18427 DAG.getConstant(Align, dl, MVT::i32)};
18428 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
18429 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
18430 VTs, InstOps, MVT::i64,
18431 MachinePointerInfo(SV),
18433 /*Volatile=*/false,
18435 /*WriteMem=*/true);
18436 Chain = VAARG.getValue(1);
18438 // Load the next argument and return it
18439 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
18442 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
18443 SelectionDAG &DAG) {
18444 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
18445 // where a va_list is still an i8*.
18446 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
18447 if (Subtarget.isCallingConvWin64(
18448 DAG.getMachineFunction().getFunction()->getCallingConv()))
18449 // Probably a Win64 va_copy.
18450 return DAG.expandVACopy(Op.getNode());
18452 SDValue Chain = Op.getOperand(0);
18453 SDValue DstPtr = Op.getOperand(1);
18454 SDValue SrcPtr = Op.getOperand(2);
18455 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
18456 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
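// The 24-byte copy length below covers the whole __va_list_tag described
// above: i32 gp_offset + i32 fp_offset + i8* overflow_arg_area +
// i8* reg_save_area (4 + 4 + 8 + 8 bytes).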
18459 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
18460 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
18462 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
18465 /// Handle vector element shifts where the shift amount is a constant.
18466 /// Takes the immediate version of the shift as input.
18467 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
18468 SDValue SrcOp, uint64_t ShiftAmt,
18469 SelectionDAG &DAG) {
18470 MVT ElementType = VT.getVectorElementType();
18472 // Fold this packed shift into its first operand if ShiftAmt is 0.
18476 // Check for ShiftAmt >= element width
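// A logical shift by the element width or more yields zero, while an
// arithmetic right shift by that much still only replicates the sign bit,
// which is why VSRAI is clamped to (width - 1) below instead of folding to
// a constant zero.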
18477 if (ShiftAmt >= ElementType.getSizeInBits()) {
18478 if (Opc == X86ISD::VSRAI)
18479 ShiftAmt = ElementType.getSizeInBits() - 1;
18481 return DAG.getConstant(0, dl, VT);
18484 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
18485 && "Unknown target vector shift-by-constant node");
18487 // Fold this packed vector shift into a build vector if SrcOp is a
18488 // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
18489 if (VT == SrcOp.getSimpleValueType() &&
18490 ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
18491 SmallVector<SDValue, 8> Elts;
18492 unsigned NumElts = SrcOp->getNumOperands();
18493 ConstantSDNode *ND;
18496 default: llvm_unreachable("Unknown opcode!");
18497 case X86ISD::VSHLI:
18498 for (unsigned i=0; i!=NumElts; ++i) {
18499 SDValue CurrentOp = SrcOp->getOperand(i);
18500 if (CurrentOp->isUndef()) {
18501 Elts.push_back(CurrentOp);
18504 ND = cast<ConstantSDNode>(CurrentOp);
18505 const APInt &C = ND->getAPIntValue();
18506 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
18509 case X86ISD::VSRLI:
18510 for (unsigned i=0; i!=NumElts; ++i) {
18511 SDValue CurrentOp = SrcOp->getOperand(i);
18512 if (CurrentOp->isUndef()) {
18513 Elts.push_back(CurrentOp);
18516 ND = cast<ConstantSDNode>(CurrentOp);
18517 const APInt &C = ND->getAPIntValue();
18518 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
18521 case X86ISD::VSRAI:
18522 for (unsigned i=0; i!=NumElts; ++i) {
18523 SDValue CurrentOp = SrcOp->getOperand(i);
18524 if (CurrentOp->isUndef()) {
18525 Elts.push_back(CurrentOp);
18528 ND = cast<ConstantSDNode>(CurrentOp);
18529 const APInt &C = ND->getAPIntValue();
18530 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
18535 return DAG.getBuildVector(VT, dl, Elts);
18538 return DAG.getNode(Opc, dl, VT, SrcOp,
18539 DAG.getConstant(ShiftAmt, dl, MVT::i8));
18542 /// Handle vector element shifts where the shift amount may or may not be a
18543 /// constant. Takes immediate version of shift as input.
18544 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
18545 SDValue SrcOp, SDValue ShAmt,
18546 const X86Subtarget &Subtarget,
18547 SelectionDAG &DAG) {
18548 MVT SVT = ShAmt.getSimpleValueType();
18549 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
18551 // Catch shift-by-constant.
18552 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
18553 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
18554 CShAmt->getZExtValue(), DAG);
18556 // Change opcode to non-immediate version
18558 default: llvm_unreachable("Unknown target vector shift node");
18559 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
18560 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
18561 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
18564 // Need to build a vector containing the shift amount.
18565 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
18566 // +=================+============+=======================================+
18567 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
18568 // +=================+============+=======================================+
18569 // | i64 | Yes, No | Use ShAmt as lowest elt |
18570 // | i32 | Yes | zero-extend in-reg |
18571 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
18572 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
18573 // +=================+============+=======================================+
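// Illustrative example (shapes assumed for exposition only): shifting a
// v4i32 value by an i32 amount on a target without SSE4.1 follows the last
// row of the table above, i.e.
//   ShAmt = build_vector(Amt, 0, undef, undef) : v4i32
// which is then bitcast to the 128-bit shift-amount type consumed by the
// non-immediate X86ISD::VSHL/VSRL/VSRA node built at the end of this
// function.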
18575 if (SVT == MVT::i64)
18576 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18577 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
18578 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
18579 ShAmt = ShAmt.getOperand(0);
18580 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
18581 ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18582 } else if (Subtarget.hasSSE41() &&
18583 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
18584 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
18585 ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
18587 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
18588 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
18589 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
18592 // The return type has to be a 128-bit type with the same element
18593 // type as the input type.
18594 MVT EltVT = VT.getVectorElementType();
18595 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
18597 ShAmt = DAG.getBitcast(ShVT, ShAmt);
18598 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
18601 /// \brief Return Mask with the necessary casting or extending
18602 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
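/// For example (illustrative only): an i8 mask used with MaskVT == v2i1 is
/// bitcast to v8i1 and its low 2 lanes are extracted via EXTRACT_SUBVECTOR,
/// while an i64 mask on a 32-bit target with MaskVT == v64i1 is split into
/// two i32 halves, bitcast to v32i1, and concatenated back together.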
18603 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
18604 const X86Subtarget &Subtarget, SelectionDAG &DAG,
18607 if (isAllOnesConstant(Mask))
18608 return DAG.getTargetConstant(1, dl, MaskVT);
18609 if (X86::isZeroNode(Mask))
18610 return DAG.getTargetConstant(0, dl, MaskVT);
18612 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
18613 // The mask should be extended.
18614 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
18615 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
18618 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
18619 if (MaskVT == MVT::v64i1) {
18620 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
18621 // In 32-bit mode, a bitcast of i64 is illegal; extend/split it instead.
18623 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18624 DAG.getConstant(0, dl, MVT::i32));
18625 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
18626 DAG.getConstant(1, dl, MVT::i32));
18628 Lo = DAG.getBitcast(MVT::v32i1, Lo);
18629 Hi = DAG.getBitcast(MVT::v32i1, Hi);
18631 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
18633 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case),
18635 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
18636 return DAG.getBitcast(MaskVT,
18637 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
18641 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
18642 Mask.getSimpleValueType().getSizeInBits());
18643 // In the case when MaskVT equals v2i1 or v4i1, the lower 2 or 4 elements
18644 // are extracted by EXTRACT_SUBVECTOR.
18645 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
18646 DAG.getBitcast(BitcastVT, Mask),
18647 DAG.getIntPtrConstant(0, dl));
18651 /// \brief Return (and \p Op, \p Mask) for compare instructions or
18652 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
18653 /// necessary casting or extending for \p Mask when lowering masking intrinsics
18654 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
18655 SDValue PreservedSrc,
18656 const X86Subtarget &Subtarget,
18657 SelectionDAG &DAG) {
18658 MVT VT = Op.getSimpleValueType();
18659 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
18660 unsigned OpcodeSelect = ISD::VSELECT;
18663 if (isAllOnesConstant(Mask))
18666 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
18668 switch (Op.getOpcode()) {
18670 case X86ISD::PCMPEQM:
18671 case X86ISD::PCMPGTM:
18673 case X86ISD::CMPMU:
18674 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
18675 case X86ISD::VFPCLASS:
18676 case X86ISD::VFPCLASSS:
18677 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
18678 case X86ISD::VTRUNC:
18679 case X86ISD::VTRUNCS:
18680 case X86ISD::VTRUNCUS:
18681 case X86ISD::CVTPS2PH:
18682 // We can't use ISD::VSELECT here because it is not always "Legal"
18683 // for the destination type. For example, vpmovqb requires only AVX512,
18684 // whereas a vselect that can operate on byte element types requires BWI.
18685 OpcodeSelect = X86ISD::SELECT;
18688 if (PreservedSrc.isUndef())
18689 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18690 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
18693 /// \brief Creates an SDNode for a predicated scalar operation.
18694 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
18695 /// The mask arrives as MVT::i8 and should be truncated
18696 /// to MVT::i1 while lowering masking intrinsics.
18697 /// The main difference between ScalarMaskingNode and VectorMaskingNode is the
18698 /// use of "X86select" instead of "vselect": we simply can't create a "vselect"
18699 /// node for a scalar instruction.
18700 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
18701 SDValue PreservedSrc,
18702 const X86Subtarget &Subtarget,
18703 SelectionDAG &DAG) {
18704 if (isAllOnesConstant(Mask))
18707 MVT VT = Op.getSimpleValueType();
18709 // The mask should be of type MVT::i1
18710 SDValue IMask = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Mask);
18712 if (Op.getOpcode() == X86ISD::FSETCCM ||
18713 Op.getOpcode() == X86ISD::FSETCCM_RND)
18714 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
18715 if (Op.getOpcode() == X86ISD::VFPCLASS ||
18716 Op.getOpcode() == X86ISD::VFPCLASSS)
18717 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
18719 if (PreservedSrc.isUndef())
18720 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
18721 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
18724 static int getSEHRegistrationNodeSize(const Function *Fn) {
18725 if (!Fn->hasPersonalityFn())
18726 report_fatal_error(
18727 "querying registration node size for function without personality");
18728 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
18729 // WinEHStatePass for the full struct definition.
18730 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
18731 case EHPersonality::MSVC_X86SEH: return 24;
18732 case EHPersonality::MSVC_CXX: return 16;
18735 report_fatal_error(
18736 "can only recover FP for 32-bit MSVC EH personality functions");
18739 /// When the MSVC runtime transfers control to us, either to an outlined
18740 /// function or when returning to a parent frame after catching an exception, we
18741 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
18742 /// Here's the math:
18743 /// RegNodeBase = EntryEBP - RegNodeSize
18744 /// ParentFP = RegNodeBase - ParentFrameOffset
18745 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
18746 /// subtracting the offset (negative on x86) takes us back to the parent FP.
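/// Worked example (illustrative numbers only): with the 32-bit MSVC C++
/// personality, RegNodeSize is 16, so if ParentFrameOffset happened to be -8
/// then ParentFP = (EntryEBP - 16) - (-8) = EntryEBP - 8.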
18747 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
18748 SDValue EntryEBP) {
18749 MachineFunction &MF = DAG.getMachineFunction();
18752 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18753 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
18755 // It's possible that the parent function no longer has a personality function
18756 // if the exceptional code was optimized away, in which case we just return
18757 // the incoming EBP.
18758 if (!Fn->hasPersonalityFn())
18761 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
18762 // registration, or the .set_setframe offset.
18763 MCSymbol *OffsetSym =
18764 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
18765 GlobalValue::getRealLinkageName(Fn->getName()));
18766 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
18767 SDValue ParentFrameOffset =
18768 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
18770 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
18771 // prologue to RBP in the parent function.
18772 const X86Subtarget &Subtarget =
18773 static_cast<const X86Subtarget &>(DAG.getSubtarget());
18774 if (Subtarget.is64Bit())
18775 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
18777 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
18778 // RegNodeBase = EntryEBP - RegNodeSize
18779 // ParentFP = RegNodeBase - ParentFrameOffset
18780 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
18781 DAG.getConstant(RegNodeSize, dl, PtrVT));
18782 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
18785 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
18786 SelectionDAG &DAG) {
18787 // Helper to detect whether the operand is the CUR_DIRECTION rounding mode.
18788 auto isRoundModeCurDirection = [](SDValue Rnd) {
18789 if (!isa<ConstantSDNode>(Rnd))
18792 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
18793 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
18797 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
18798 MVT VT = Op.getSimpleValueType();
18799 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
18801 switch(IntrData->Type) {
18802 case INTR_TYPE_1OP:
18803 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
18804 case INTR_TYPE_2OP:
18805 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18807 case INTR_TYPE_3OP:
18808 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18809 Op.getOperand(2), Op.getOperand(3));
18810 case INTR_TYPE_4OP:
18811 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
18812 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
18813 case INTR_TYPE_1OP_MASK_RM: {
18814 SDValue Src = Op.getOperand(1);
18815 SDValue PassThru = Op.getOperand(2);
18816 SDValue Mask = Op.getOperand(3);
18817 SDValue RoundingMode;
18818 // We always add rounding mode to the Node.
18819 // If the rounding mode is not specified, we add the
18820 // "current direction" mode.
18821 if (Op.getNumOperands() == 4)
18823 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18825 RoundingMode = Op.getOperand(4);
18826 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
18827 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
18829 Mask, PassThru, Subtarget, DAG);
18831 case INTR_TYPE_1OP_MASK: {
18832 SDValue Src = Op.getOperand(1);
18833 SDValue PassThru = Op.getOperand(2);
18834 SDValue Mask = Op.getOperand(3);
18835 // We add rounding mode to the Node when
18836 // - RM Opcode is specified and
18837 // - RM is not "current direction".
18838 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18839 if (IntrWithRoundingModeOpcode != 0) {
18840 SDValue Rnd = Op.getOperand(4);
18841 if (!isRoundModeCurDirection(Rnd)) {
18842 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18843 dl, Op.getValueType(),
18845 Mask, PassThru, Subtarget, DAG);
18848 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
18849 Mask, PassThru, Subtarget, DAG);
18851 case INTR_TYPE_SCALAR_MASK: {
18852 SDValue Src1 = Op.getOperand(1);
18853 SDValue Src2 = Op.getOperand(2);
18854 SDValue passThru = Op.getOperand(3);
18855 SDValue Mask = Op.getOperand(4);
18856 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
18857 Mask, passThru, Subtarget, DAG);
18859 case INTR_TYPE_SCALAR_MASK_RM: {
18860 SDValue Src1 = Op.getOperand(1);
18861 SDValue Src2 = Op.getOperand(2);
18862 SDValue Src0 = Op.getOperand(3);
18863 SDValue Mask = Op.getOperand(4);
18864 // There are 2 kinds of intrinsics in this group:
18865 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
18866 // (2) With both rounding mode and sae - 7 operands.
18867 if (Op.getNumOperands() == 6) {
18868 SDValue Sae = Op.getOperand(5);
18869 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18871 Mask, Src0, Subtarget, DAG);
18873 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
18874 SDValue RoundingMode = Op.getOperand(5);
18875 SDValue Sae = Op.getOperand(6);
18876 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
18877 RoundingMode, Sae),
18878 Mask, Src0, Subtarget, DAG);
18880 case INTR_TYPE_2OP_MASK:
18881 case INTR_TYPE_2OP_IMM8_MASK: {
18882 SDValue Src1 = Op.getOperand(1);
18883 SDValue Src2 = Op.getOperand(2);
18884 SDValue PassThru = Op.getOperand(3);
18885 SDValue Mask = Op.getOperand(4);
18887 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
18888 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
18890 // We specify 2 possible opcodes for intrinsics with rounding modes.
18891 // First, we check if the intrinsic may have a non-default rounding mode
18892 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18893 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18894 if (IntrWithRoundingModeOpcode != 0) {
18895 SDValue Rnd = Op.getOperand(5);
18896 if (!isRoundModeCurDirection(Rnd)) {
18897 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18898 dl, Op.getValueType(),
18900 Mask, PassThru, Subtarget, DAG);
18903 // TODO: Intrinsics should have fast-math-flags to propagate.
18904 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
18905 Mask, PassThru, Subtarget, DAG);
18907 case INTR_TYPE_2OP_MASK_RM: {
18908 SDValue Src1 = Op.getOperand(1);
18909 SDValue Src2 = Op.getOperand(2);
18910 SDValue PassThru = Op.getOperand(3);
18911 SDValue Mask = Op.getOperand(4);
18912 // We specify 2 possible modes for intrinsics, with/without rounding mode.
18914 // First, we check if the intrinsic has a rounding mode (6 operands);
18915 // if not, we set the rounding mode to "current".
18917 if (Op.getNumOperands() == 6)
18918 Rnd = Op.getOperand(5);
18920 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18921 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18923 Mask, PassThru, Subtarget, DAG);
18925 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
18926 SDValue Src1 = Op.getOperand(1);
18927 SDValue Src2 = Op.getOperand(2);
18928 SDValue Src3 = Op.getOperand(3);
18929 SDValue PassThru = Op.getOperand(4);
18930 SDValue Mask = Op.getOperand(5);
18931 SDValue Sae = Op.getOperand(6);
18933 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
18935 Mask, PassThru, Subtarget, DAG);
18937 case INTR_TYPE_3OP_MASK_RM: {
18938 SDValue Src1 = Op.getOperand(1);
18939 SDValue Src2 = Op.getOperand(2);
18940 SDValue Imm = Op.getOperand(3);
18941 SDValue PassThru = Op.getOperand(4);
18942 SDValue Mask = Op.getOperand(5);
18943 // We specify 2 possible modes for intrinsics, with/without rounding mode.
18945 // First, we check if the intrinsic has a rounding mode (7 operands);
18946 // if not, we set the rounding mode to "current".
18948 if (Op.getNumOperands() == 7)
18949 Rnd = Op.getOperand(6);
18951 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
18952 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18953 Src1, Src2, Imm, Rnd),
18954 Mask, PassThru, Subtarget, DAG);
18956 case INTR_TYPE_3OP_IMM8_MASK:
18957 case INTR_TYPE_3OP_MASK: {
18958 SDValue Src1 = Op.getOperand(1);
18959 SDValue Src2 = Op.getOperand(2);
18960 SDValue Src3 = Op.getOperand(3);
18961 SDValue PassThru = Op.getOperand(4);
18962 SDValue Mask = Op.getOperand(5);
18964 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
18965 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
18967 // We specify 2 possible opcodes for intrinsics with rounding modes.
18968 // First, we check if the intrinsic may have a non-default rounding mode
18969 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
18970 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
18971 if (IntrWithRoundingModeOpcode != 0) {
18972 SDValue Rnd = Op.getOperand(6);
18973 if (!isRoundModeCurDirection(Rnd)) {
18974 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
18975 dl, Op.getValueType(),
18976 Src1, Src2, Src3, Rnd),
18977 Mask, PassThru, Subtarget, DAG);
18980 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
18982 Mask, PassThru, Subtarget, DAG);
18984 case VPERM_2OP_MASK : {
18985 SDValue Src1 = Op.getOperand(1);
18986 SDValue Src2 = Op.getOperand(2);
18987 SDValue PassThru = Op.getOperand(3);
18988 SDValue Mask = Op.getOperand(4);
18990 // Swap Src1 and Src2 in the node creation
18991 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
18992 Mask, PassThru, Subtarget, DAG);
18994 case VPERM_3OP_MASKZ:
18995 case VPERM_3OP_MASK:{
18996 MVT VT = Op.getSimpleValueType();
18997 // Src2 is the PassThru
18998 SDValue Src1 = Op.getOperand(1);
18999 // PassThru needs to be the same type as the destination in order
19000 // to pattern match correctly.
19001 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19002 SDValue Src3 = Op.getOperand(3);
19003 SDValue Mask = Op.getOperand(4);
19004 SDValue PassThru = SDValue();
19006 // set PassThru element
19007 if (IntrData->Type == VPERM_3OP_MASKZ)
19008 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19012 // Swap Src1 and Src2 in the node creation
19013 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19014 dl, Op.getValueType(),
19016 Mask, PassThru, Subtarget, DAG);
19020 case FMA_OP_MASK: {
19021 SDValue Src1 = Op.getOperand(1);
19022 SDValue Src2 = Op.getOperand(2);
19023 SDValue Src3 = Op.getOperand(3);
19024 SDValue Mask = Op.getOperand(4);
19025 MVT VT = Op.getSimpleValueType();
19026 SDValue PassThru = SDValue();
19028 // set PassThru element
19029 if (IntrData->Type == FMA_OP_MASKZ)
19030 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19031 else if (IntrData->Type == FMA_OP_MASK3)
19036 // We specify 2 possible opcodes for intrinsics with rounding modes.
19037 // First, we check if the intrinsic may have a non-default rounding mode
19038 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19039 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19040 if (IntrWithRoundingModeOpcode != 0) {
19041 SDValue Rnd = Op.getOperand(5);
19042 if (!isRoundModeCurDirection(Rnd))
19043 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19044 dl, Op.getValueType(),
19045 Src1, Src2, Src3, Rnd),
19046 Mask, PassThru, Subtarget, DAG);
19048 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19049 dl, Op.getValueType(),
19051 Mask, PassThru, Subtarget, DAG);
19053 case FMA_OP_SCALAR_MASK:
19054 case FMA_OP_SCALAR_MASK3:
19055 case FMA_OP_SCALAR_MASKZ: {
19056 SDValue Src1 = Op.getOperand(1);
19057 SDValue Src2 = Op.getOperand(2);
19058 SDValue Src3 = Op.getOperand(3);
19059 SDValue Mask = Op.getOperand(4);
19060 MVT VT = Op.getSimpleValueType();
19061 SDValue PassThru = SDValue();
19063 // set PassThru element
19064 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19065 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19066 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19071 SDValue Rnd = Op.getOperand(5);
19072 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19073 Op.getValueType(), Src1, Src2,
19075 Mask, PassThru, Subtarget, DAG);
19077 case TERLOG_OP_MASK:
19078 case TERLOG_OP_MASKZ: {
19079 SDValue Src1 = Op.getOperand(1);
19080 SDValue Src2 = Op.getOperand(2);
19081 SDValue Src3 = Op.getOperand(3);
19082 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19083 SDValue Mask = Op.getOperand(5);
19084 MVT VT = Op.getSimpleValueType();
19085 SDValue PassThru = Src1;
19086 // Set PassThru element.
19087 if (IntrData->Type == TERLOG_OP_MASKZ)
19088 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19090 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19091 Src1, Src2, Src3, Src4),
19092 Mask, PassThru, Subtarget, DAG);
19095 // ISD::FP_ROUND has a second argument that indicates if the truncation
19096 // does not change the value. Set it to 0 since it can change.
19097 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19098 DAG.getIntPtrConstant(0, dl));
19099 case CVTPD2PS_MASK: {
19100 SDValue Src = Op.getOperand(1);
19101 SDValue PassThru = Op.getOperand(2);
19102 SDValue Mask = Op.getOperand(3);
19103 // We add rounding mode to the Node when
19104 // - RM Opcode is specified and
19105 // - RM is not "current direction".
19106 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19107 if (IntrWithRoundingModeOpcode != 0) {
19108 SDValue Rnd = Op.getOperand(4);
19109 if (!isRoundModeCurDirection(Rnd)) {
19110 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19111 dl, Op.getValueType(),
19113 Mask, PassThru, Subtarget, DAG);
19116 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19117 // ISD::FP_ROUND has a second argument that indicates if the truncation
19118 // does not change the value. Set it to 0 since it can change.
19119 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19120 DAG.getIntPtrConstant(0, dl)),
19121 Mask, PassThru, Subtarget, DAG);
19124 // FPclass intrinsics with mask
19125 SDValue Src1 = Op.getOperand(1);
19126 MVT VT = Src1.getSimpleValueType();
19127 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19128 SDValue Imm = Op.getOperand(2);
19129 SDValue Mask = Op.getOperand(3);
19130 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19131 Mask.getSimpleValueType().getSizeInBits());
19132 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19133 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19134 DAG.getTargetConstant(0, dl, MaskVT),
19136 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19137 DAG.getUNDEF(BitcastVT), FPclassMask,
19138 DAG.getIntPtrConstant(0, dl));
19139 return DAG.getBitcast(Op.getValueType(), Res);
19142 SDValue Src1 = Op.getOperand(1);
19143 SDValue Imm = Op.getOperand(2);
19144 SDValue Mask = Op.getOperand(3);
19145 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Imm);
19146 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19147 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19148 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, FPclassMask);
19151 case CMP_MASK_CC: {
19152 // Comparison intrinsics with masks.
19153 // Example of transformation:
19154 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19155 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19157 // (v8i1 (insert_subvector undef,
19158 // (v2i1 (and (PCMPEQM %a, %b),
19159 // (extract_subvector
19160 // (v8i1 (bitcast %mask)), 0))), 0))))
19161 MVT VT = Op.getOperand(1).getSimpleValueType();
19162 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19163 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19164 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19165 Mask.getSimpleValueType().getSizeInBits());
19167 if (IntrData->Type == CMP_MASK_CC) {
19168 SDValue CC = Op.getOperand(3);
19169 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19170 // We specify 2 possible opcodes for intrinsics with rounding modes.
19171 // First, we check if the intrinsic may have a non-default rounding mode
19172 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19173 if (IntrData->Opc1 != 0) {
19174 SDValue Rnd = Op.getOperand(5);
19175 if (!isRoundModeCurDirection(Rnd))
19176 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19177 Op.getOperand(2), CC, Rnd);
19179 // Default rounding mode.
19181 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19182 Op.getOperand(2), CC);
19185 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19186 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19189 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19190 DAG.getTargetConstant(0, dl,
19193 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19194 DAG.getUNDEF(BitcastVT), CmpMask,
19195 DAG.getIntPtrConstant(0, dl));
19196 return DAG.getBitcast(Op.getValueType(), Res);
19198 case CMP_MASK_SCALAR_CC: {
19199 SDValue Src1 = Op.getOperand(1);
19200 SDValue Src2 = Op.getOperand(2);
19201 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19202 SDValue Mask = Op.getOperand(4);
19205 if (IntrData->Opc1 != 0) {
19206 SDValue Rnd = Op.getOperand(5);
19207 if (!isRoundModeCurDirection(Rnd))
19208 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::i1, Src1, Src2, CC, Rnd);
19210 // Default rounding mode.
19212 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::i1, Src1, Src2, CC);
19214 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19215 DAG.getTargetConstant(0, dl,
19219 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, CmpMask);
19221 case COMI: { // Comparison intrinsics
19222 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19223 SDValue LHS = Op.getOperand(1);
19224 SDValue RHS = Op.getOperand(2);
19225 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19226 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19229 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19230 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19231 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19232 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19235 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19236 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19237 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19238 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19241 case ISD::SETGT: // (CF = 0 and ZF = 0)
19242 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19244 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19245 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19248 case ISD::SETGE: // CF = 0
19249 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19251 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19252 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19255 llvm_unreachable("Unexpected illegal condition!");
19257 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19259 case COMI_RM: { // Comparison intrinsics with Sae
19260 SDValue LHS = Op.getOperand(1);
19261 SDValue RHS = Op.getOperand(2);
19262 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19263 SDValue Sae = Op.getOperand(4);
19266 if (isRoundModeCurDirection(Sae))
19267 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::i1, LHS, RHS,
19268 DAG.getConstant(CondVal, dl, MVT::i8));
19270 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::i1, LHS, RHS,
19271 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19272 // AnyExt just uses KMOVW %kreg, %r32; ZeroExt emits "and $1, %reg"
19273 return DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, FCmp);
19276 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19277 Op.getOperand(1), Op.getOperand(2), Subtarget,
19279 case COMPRESS_EXPAND_IN_REG: {
19280 SDValue Mask = Op.getOperand(3);
19281 SDValue DataToCompress = Op.getOperand(1);
19282 SDValue PassThru = Op.getOperand(2);
19283 if (isAllOnesConstant(Mask)) // return data as is
19284 return Op.getOperand(1);
19286 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19288 Mask, PassThru, Subtarget, DAG);
19291 SDValue Mask = Op.getOperand(1);
19292 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19293 Mask.getSimpleValueType().getSizeInBits());
19294 Mask = DAG.getBitcast(MaskVT, Mask);
19295 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19298 MVT VT = Op.getSimpleValueType();
19299 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19301 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19302 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19303 // Arguments should be swapped.
19304 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19305 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19307 return DAG.getBitcast(VT, Res);
19310 case FIXUPIMMS_MASKZ:
19312 case FIXUPIMM_MASKZ:{
19313 SDValue Src1 = Op.getOperand(1);
19314 SDValue Src2 = Op.getOperand(2);
19315 SDValue Src3 = Op.getOperand(3);
19316 SDValue Imm = Op.getOperand(4);
19317 SDValue Mask = Op.getOperand(5);
19318 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19319 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19320 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19322 // First, we check if the intrinsic has a rounding mode (7 operands);
19323 // if not, we set the rounding mode to "current".
19325 if (Op.getNumOperands() == 7)
19326 Rnd = Op.getOperand(6);
19328 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19329 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19330 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19331 Src1, Src2, Src3, Imm, Rnd),
19332 Mask, Passthru, Subtarget, DAG);
19333 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19334 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19335 Src1, Src2, Src3, Imm, Rnd),
19336 Mask, Passthru, Subtarget, DAG);
19338 case CONVERT_TO_MASK: {
19339 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19340 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19341 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19343 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19345 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19346 DAG.getUNDEF(BitcastVT), CvtMask,
19347 DAG.getIntPtrConstant(0, dl));
19348 return DAG.getBitcast(Op.getValueType(), Res);
19350 case CONVERT_MASK_TO_VEC: {
19351 SDValue Mask = Op.getOperand(1);
19352 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19353 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19354 return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
19356 case BRCST_SUBVEC_TO_VEC: {
19357 SDValue Src = Op.getOperand(1);
19358 SDValue Passthru = Op.getOperand(2);
19359 SDValue Mask = Op.getOperand(3);
19360 EVT resVT = Passthru.getValueType();
19361 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19362 DAG.getUNDEF(resVT), Src,
19363 DAG.getIntPtrConstant(0, dl));
19365 if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
19366 immVal = DAG.getConstant(0x44, dl, MVT::i8);
19368 immVal = DAG.getConstant(0, dl, MVT::i8);
19369 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19370 subVec, subVec, immVal),
19371 Mask, Passthru, Subtarget, DAG);
19373 case BRCST32x2_TO_VEC: {
19374 SDValue Src = Op.getOperand(1);
19375 SDValue PassThru = Op.getOperand(2);
19376 SDValue Mask = Op.getOperand(3);
19378 assert((VT.getScalarType() == MVT::i32 ||
19379 VT.getScalarType() == MVT::f32) && "Unexpected type!");
19380 // Bitcast Src to a vector of packed 64-bit elements.
19381 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19382 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19383 Src = DAG.getBitcast(BitcastVT, Src);
19385 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19386 Mask, PassThru, Subtarget, DAG);
19394 default: return SDValue(); // Don't custom lower most intrinsics.
19396 case Intrinsic::x86_avx2_permd:
19397 case Intrinsic::x86_avx2_permps:
19398 // Operands intentionally swapped. Mask is last operand to intrinsic,
19399 // but second operand for node/instruction.
19400 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
19401 Op.getOperand(2), Op.getOperand(1));
19403 // ptest and testp intrinsics. The intrinsics these come from are designed to
19404 // return an integer value, not just an instruction, so lower them to the ptest
19405 // or testp pattern and a setcc for the result.
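// Illustrative shape of the resulting DAG (sketch): int_x86_sse41_ptestz
// becomes (zext i32 (setcc COND_E (X86ISD::PTEST LHS, RHS))), with the
// condition code taken from the switch below.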
19406 case Intrinsic::x86_sse41_ptestz:
19407 case Intrinsic::x86_sse41_ptestc:
19408 case Intrinsic::x86_sse41_ptestnzc:
19409 case Intrinsic::x86_avx_ptestz_256:
19410 case Intrinsic::x86_avx_ptestc_256:
19411 case Intrinsic::x86_avx_ptestnzc_256:
19412 case Intrinsic::x86_avx_vtestz_ps:
19413 case Intrinsic::x86_avx_vtestc_ps:
19414 case Intrinsic::x86_avx_vtestnzc_ps:
19415 case Intrinsic::x86_avx_vtestz_pd:
19416 case Intrinsic::x86_avx_vtestc_pd:
19417 case Intrinsic::x86_avx_vtestnzc_pd:
19418 case Intrinsic::x86_avx_vtestz_ps_256:
19419 case Intrinsic::x86_avx_vtestc_ps_256:
19420 case Intrinsic::x86_avx_vtestnzc_ps_256:
19421 case Intrinsic::x86_avx_vtestz_pd_256:
19422 case Intrinsic::x86_avx_vtestc_pd_256:
19423 case Intrinsic::x86_avx_vtestnzc_pd_256: {
19424 bool IsTestPacked = false;
19425 X86::CondCode X86CC;
19427 default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
19428 case Intrinsic::x86_avx_vtestz_ps:
19429 case Intrinsic::x86_avx_vtestz_pd:
19430 case Intrinsic::x86_avx_vtestz_ps_256:
19431 case Intrinsic::x86_avx_vtestz_pd_256:
19432 IsTestPacked = true;
19434 case Intrinsic::x86_sse41_ptestz:
19435 case Intrinsic::x86_avx_ptestz_256:
19437 X86CC = X86::COND_E;
19439 case Intrinsic::x86_avx_vtestc_ps:
19440 case Intrinsic::x86_avx_vtestc_pd:
19441 case Intrinsic::x86_avx_vtestc_ps_256:
19442 case Intrinsic::x86_avx_vtestc_pd_256:
19443 IsTestPacked = true;
19445 case Intrinsic::x86_sse41_ptestc:
19446 case Intrinsic::x86_avx_ptestc_256:
19448 X86CC = X86::COND_B;
19450 case Intrinsic::x86_avx_vtestnzc_ps:
19451 case Intrinsic::x86_avx_vtestnzc_pd:
19452 case Intrinsic::x86_avx_vtestnzc_ps_256:
19453 case Intrinsic::x86_avx_vtestnzc_pd_256:
19454 IsTestPacked = true;
19456 case Intrinsic::x86_sse41_ptestnzc:
19457 case Intrinsic::x86_avx_ptestnzc_256:
19459 X86CC = X86::COND_A;
19463 SDValue LHS = Op.getOperand(1);
19464 SDValue RHS = Op.getOperand(2);
19465 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
19466 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
19467 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19468 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19470 case Intrinsic::x86_avx512_kortestz_w:
19471 case Intrinsic::x86_avx512_kortestc_w: {
19472 X86::CondCode X86CC =
19473 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
19474 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
19475 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
19476 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
19477 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
19478 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19481 case Intrinsic::x86_sse42_pcmpistria128:
19482 case Intrinsic::x86_sse42_pcmpestria128:
19483 case Intrinsic::x86_sse42_pcmpistric128:
19484 case Intrinsic::x86_sse42_pcmpestric128:
19485 case Intrinsic::x86_sse42_pcmpistrio128:
19486 case Intrinsic::x86_sse42_pcmpestrio128:
19487 case Intrinsic::x86_sse42_pcmpistris128:
19488 case Intrinsic::x86_sse42_pcmpestris128:
19489 case Intrinsic::x86_sse42_pcmpistriz128:
19490 case Intrinsic::x86_sse42_pcmpestriz128: {
19492 X86::CondCode X86CC;
19494 default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
19495 case Intrinsic::x86_sse42_pcmpistria128:
19496 Opcode = X86ISD::PCMPISTRI;
19497 X86CC = X86::COND_A;
19499 case Intrinsic::x86_sse42_pcmpestria128:
19500 Opcode = X86ISD::PCMPESTRI;
19501 X86CC = X86::COND_A;
19503 case Intrinsic::x86_sse42_pcmpistric128:
19504 Opcode = X86ISD::PCMPISTRI;
19505 X86CC = X86::COND_B;
19507 case Intrinsic::x86_sse42_pcmpestric128:
19508 Opcode = X86ISD::PCMPESTRI;
19509 X86CC = X86::COND_B;
19511 case Intrinsic::x86_sse42_pcmpistrio128:
19512 Opcode = X86ISD::PCMPISTRI;
19513 X86CC = X86::COND_O;
19515 case Intrinsic::x86_sse42_pcmpestrio128:
19516 Opcode = X86ISD::PCMPESTRI;
19517 X86CC = X86::COND_O;
19519 case Intrinsic::x86_sse42_pcmpistris128:
19520 Opcode = X86ISD::PCMPISTRI;
19521 X86CC = X86::COND_S;
19523 case Intrinsic::x86_sse42_pcmpestris128:
19524 Opcode = X86ISD::PCMPESTRI;
19525 X86CC = X86::COND_S;
19527 case Intrinsic::x86_sse42_pcmpistriz128:
19528 Opcode = X86ISD::PCMPISTRI;
19529 X86CC = X86::COND_E;
19531 case Intrinsic::x86_sse42_pcmpestriz128:
19532 Opcode = X86ISD::PCMPESTRI;
19533 X86CC = X86::COND_E;
19536 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19537 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19538 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
19539 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
19540 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19543 case Intrinsic::x86_sse42_pcmpistri128:
19544 case Intrinsic::x86_sse42_pcmpestri128: {
19546 if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
19547 Opcode = X86ISD::PCMPISTRI;
19549 Opcode = X86ISD::PCMPESTRI;
19551 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
19552 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
19553 return DAG.getNode(Opcode, dl, VTs, NewOps);
19556 case Intrinsic::eh_sjlj_lsda: {
19557 MachineFunction &MF = DAG.getMachineFunction();
19558 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19559 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19560 auto &Context = MF.getMMI().getContext();
19561 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
19562 Twine(MF.getFunctionNumber()));
19563 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
19566 case Intrinsic::x86_seh_lsda: {
19567 // Compute the symbol for the LSDA. We know it'll get emitted later.
19568 MachineFunction &MF = DAG.getMachineFunction();
19569 SDValue Op1 = Op.getOperand(1);
19570 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
19571 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
19572 GlobalValue::getRealLinkageName(Fn->getName()));
19574 // Generate a simple absolute symbol reference. This intrinsic is only
19575 // supported on 32-bit Windows, which isn't PIC.
19576 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
19577 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
19580 case Intrinsic::x86_seh_recoverfp: {
19581 SDValue FnOp = Op.getOperand(1);
19582 SDValue IncomingFPOp = Op.getOperand(2);
19583 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
19584 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
19586 report_fatal_error(
19587 "llvm.x86.seh.recoverfp must take a function as the first argument");
19588 return recoverFramePointer(DAG, Fn, IncomingFPOp);
19591 case Intrinsic::localaddress: {
19592 // Returns one of the stack, base, or frame pointer registers, depending on
19593 // which is used to reference local variables.
19594 MachineFunction &MF = DAG.getMachineFunction();
19595 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
19597 if (RegInfo->hasBasePointer(MF))
19598 Reg = RegInfo->getBaseRegister();
19599 else // This function handles the SP or FP case.
19600 Reg = RegInfo->getPtrSizedFrameRegister(MF);
19601 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
19606 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19607 SDValue Src, SDValue Mask, SDValue Base,
19608 SDValue Index, SDValue ScaleOp, SDValue Chain,
19609 const X86Subtarget &Subtarget) {
19611 auto *C = cast<ConstantSDNode>(ScaleOp);
19612 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19613 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19614 Index.getSimpleValueType().getVectorNumElements());
19616 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19617 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
19618 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19619 SDValue Segment = DAG.getRegister(0, MVT::i32);
19621 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
19622 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
19623 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19624 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
19625 return DAG.getMergeValues(RetOps, dl);
19628 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19629 SDValue Src, SDValue Mask, SDValue Base,
19630 SDValue Index, SDValue ScaleOp, SDValue Chain,
19631 const X86Subtarget &Subtarget) {
19633 auto *C = cast<ConstantSDNode>(ScaleOp);
19634 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19635 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19636 SDValue Segment = DAG.getRegister(0, MVT::i32);
19637 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19638 Index.getSimpleValueType().getVectorNumElements());
19640 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19641 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
19642 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
19643 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
19644 return SDValue(Res, 1);
19647 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
19648 SDValue Mask, SDValue Base, SDValue Index,
19649 SDValue ScaleOp, SDValue Chain,
19650 const X86Subtarget &Subtarget) {
19652 auto *C = cast<ConstantSDNode>(ScaleOp);
19653 SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
19654 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
19655 SDValue Segment = DAG.getRegister(0, MVT::i32);
19657 MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
19658 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19659 //SDVTList VTs = DAG.getVTList(MVT::Other);
19660 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
19661 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
19662 return SDValue(Res, 0);
19665 /// Handles the lowering of builtin intrinsics that return the value
19666 /// of the extended control register.
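/// The XGETBV instruction reads the XCR selected by ECX into EDX:EAX; the
/// code below merges the two halves into a single i64 result (a shift/or on
/// 64-bit targets, a BUILD_PAIR on 32-bit targets).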
19667 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
19669 const X86Subtarget &Subtarget,
19670 SmallVectorImpl<SDValue> &Results) {
19671 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19672 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19675 // The ECX register is used to select the index of the XCR register to
19678 DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
19679 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
19680 Chain = SDValue(N1, 0);
19682 // Reads the content of XCR and returns it in registers EDX:EAX.
19683 if (Subtarget.is64Bit()) {
19684 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
19685 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19688 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
19689 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19692 Chain = HI.getValue(1);
19694 if (Subtarget.is64Bit()) {
19695 // Merge the two 32-bit values into a 64-bit one.
19696 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19697 DAG.getConstant(32, DL, MVT::i8));
19698 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19699 Results.push_back(Chain);
19703 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19704 SDValue Ops[] = { LO, HI };
19705 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19706 Results.push_back(Pair);
19707 Results.push_back(Chain);
19710 /// Handles the lowering of builtin intrinsics that read performance monitor
19711 /// counters (x86_rdpmc).
19712 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
19714 const X86Subtarget &Subtarget,
19715 SmallVectorImpl<SDValue> &Results) {
19716 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19717 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19720 // The ECX register is used to select the index of the performance counter
19722 SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
19724 SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
19726 // Reads the content of a 64-bit performance counter and returns it in the
19727 // registers EDX:EAX.
19728 if (Subtarget.is64Bit()) {
19729 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19730 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19733 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19734 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19737 Chain = HI.getValue(1);
19739 if (Subtarget.is64Bit()) {
19740 // The EAX register is loaded with the low-order 32 bits. The EDX register
19741 // is loaded with the supported high-order bits of the counter.
19742 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19743 DAG.getConstant(32, DL, MVT::i8));
19744 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19745 Results.push_back(Chain);
19749 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19750 SDValue Ops[] = { LO, HI };
19751 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19752 Results.push_back(Pair);
19753 Results.push_back(Chain);
19756 /// Handles the lowering of builtin intrinsics that read the time stamp counter
19757 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
19758 /// READCYCLECOUNTER nodes.
19759 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
19761 const X86Subtarget &Subtarget,
19762 SmallVectorImpl<SDValue> &Results) {
19763 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
19764 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
19767 // The processor's time-stamp counter (a 64-bit MSR) is stored into the
19768 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
19769 // and the EAX register is loaded with the low-order 32 bits.
19770 if (Subtarget.is64Bit()) {
19771 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
19772 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
19775 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
19776 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
19779 SDValue Chain = HI.getValue(1);
19781 if (Opcode == X86ISD::RDTSCP_DAG) {
19782 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
19784 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
19785 // the ECX register. Add 'ecx' explicitly to the chain.
19786 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
19788 // Explicitly store the content of ECX at the location passed as input
19789 // to the 'rdtscp' intrinsic.
19790 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
19791 MachinePointerInfo());
19794 if (Subtarget.is64Bit()) {
19795 // The EDX register is loaded with the high-order 32 bits of the MSR, and
19796 // the EAX register is loaded with the low-order 32 bits.
19797 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
19798 DAG.getConstant(32, DL, MVT::i8));
19799 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
19800 Results.push_back(Chain);
19804 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
19805 SDValue Ops[] = { LO, HI };
19806 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
19807 Results.push_back(Pair);
19808 Results.push_back(Chain);
19811 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
19812 SelectionDAG &DAG) {
19813 SmallVector<SDValue, 2> Results;
19815 getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
19817 return DAG.getMergeValues(Results, DL);
19820 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
19821 MachineFunction &MF = DAG.getMachineFunction();
19822 SDValue Chain = Op.getOperand(0);
19823 SDValue RegNode = Op.getOperand(2);
19824 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19826 report_fatal_error("EH registrations only live in functions using WinEH");
19828 // Cast the operand to an alloca, and remember the frame index.
19829 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
19831 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
19832 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
19834 // Return the chain operand without making any DAG nodes.
19838 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
19839 MachineFunction &MF = DAG.getMachineFunction();
19840 SDValue Chain = Op.getOperand(0);
19841 SDValue EHGuard = Op.getOperand(2);
19842 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
19844 report_fatal_error("EHGuard only live in functions using WinEH");
19846 // Cast the operand to an alloca, and remember the frame index.
19847 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
19849 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
19850 EHInfo->EHGuardFrameIndex = FINode->getIndex();
19852 // Return the chain operand without making any DAG nodes.
19856 /// Emit Truncating Store with signed or unsigned saturation.
19858 EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
19859 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
19860 SelectionDAG &DAG) {
19862 SDVTList VTs = DAG.getVTList(MVT::Other);
19863 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
19864 SDValue Ops[] = { Chain, Val, Ptr, Undef };
19866 DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19867 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19870 /// Emit Masked Truncating Store with signed or unsigned saturation.
19872 EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
19873 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
19874 MachineMemOperand *MMO, SelectionDAG &DAG) {
19876 SDVTList VTs = DAG.getVTList(MVT::Other);
19877 SDValue Ops[] = { Chain, Ptr, Mask, Val };
19879 DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
19880 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
19883 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19884 SelectionDAG &DAG) {
19885 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
19887 const IntrinsicData* IntrData = getIntrinsicWithChain(IntNo);
19888 if (!IntrData) {
19889 if (IntNo == llvm::Intrinsic::x86_seh_ehregnode)
19890 return MarkEHRegistrationNode(Op, DAG);
19891 if (IntNo == llvm::Intrinsic::x86_seh_ehguard)
19892 return MarkEHGuard(Op, DAG);
19893 if (IntNo == llvm::Intrinsic::x86_flags_read_u32 ||
19894 IntNo == llvm::Intrinsic::x86_flags_read_u64 ||
19895 IntNo == llvm::Intrinsic::x86_flags_write_u32 ||
19896 IntNo == llvm::Intrinsic::x86_flags_write_u64) {
19897 // We need a frame pointer because this will get lowered to a PUSH/POP
19898 // sequence.
19899 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
19900 MFI.setHasCopyImplyingStackAdjustment(true);
19901 // Don't do anything here, we will expand these intrinsics out later
19902 // during ExpandISelPseudos in EmitInstrWithCustomInserter.
19903 return SDValue();
19904 }
19905 return SDValue();
19906 }
19908 SDLoc dl(Op);
19909 switch(IntrData->Type) {
19910 default: llvm_unreachable("Unknown Intrinsic Type");
19911 case RDSEED:
19912 case RDRAND: {
19913 // Emit the node with the right value type.
19914 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
19915 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19917 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
19918 // Otherwise return the value from Rand, which is always 0, casted to i32.
19919 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
19920 DAG.getConstant(1, dl, Op->getValueType(1)),
19921 DAG.getConstant(X86::COND_B, dl, MVT::i32),
19922 SDValue(Result.getNode(), 1) };
19923 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
19924 DAG.getVTList(Op->getValueType(1), MVT::Glue),
19925 Ops);
19927 // Return { result, isValid, chain }.
19928 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
19929 SDValue(Result.getNode(), 2));
19930 }
19931 case GATHER: {
19932 // gather(v1, mask, index, base, scale);
19933 SDValue Chain = Op.getOperand(0);
19934 SDValue Src = Op.getOperand(2);
19935 SDValue Base = Op.getOperand(3);
19936 SDValue Index = Op.getOperand(4);
19937 SDValue Mask = Op.getOperand(5);
19938 SDValue Scale = Op.getOperand(6);
19939 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
19940 Chain, Subtarget);
19941 }
19942 case SCATTER: {
19943 // scatter(base, mask, index, v1, scale);
19944 SDValue Chain = Op.getOperand(0);
19945 SDValue Base = Op.getOperand(2);
19946 SDValue Mask = Op.getOperand(3);
19947 SDValue Index = Op.getOperand(4);
19948 SDValue Src = Op.getOperand(5);
19949 SDValue Scale = Op.getOperand(6);
19950 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
19951 Scale, Chain, Subtarget);
19952 }
19953 case PREFETCH: {
19954 SDValue Hint = Op.getOperand(6);
19955 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
19956 assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
19957 unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
19958 SDValue Chain = Op.getOperand(0);
19959 SDValue Mask = Op.getOperand(2);
19960 SDValue Index = Op.getOperand(3);
19961 SDValue Base = Op.getOperand(4);
19962 SDValue Scale = Op.getOperand(5);
19963 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
19964 Subtarget);
19965 }
19966 // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
19967 case RDTSC: {
19968 SmallVector<SDValue, 2> Results;
19969 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
19970 Results);
19971 return DAG.getMergeValues(Results, dl);
19972 }
19973 // Read Performance Monitoring Counters.
19974 case RDPMC: {
19975 SmallVector<SDValue, 2> Results;
19976 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
19977 return DAG.getMergeValues(Results, dl);
19978 }
19979 // Get Extended Control Register.
19980 case XGETBV: {
19981 SmallVector<SDValue, 2> Results;
19982 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
19983 return DAG.getMergeValues(Results, dl);
19984 }
19985 // XTEST intrinsics.
19986 case XTEST: {
19987 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19988 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
19990 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
19991 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
19992 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
19993 Ret, SDValue(InTrans.getNode(), 1));
19994 }
19995 // ADC/ADCX/SBB
19996 case ADX: {
19997 SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
19998 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
19999 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20000 DAG.getConstant(-1, dl, MVT::i8));
20001 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20002 Op.getOperand(4), GenCF.getValue(1));
20003 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20004 Op.getOperand(5), MachinePointerInfo());
20005 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20006 SDValue Results[] = { SetCC, Store };
20007 return DAG.getMergeValues(Results, dl);
20009 case COMPRESS_TO_MEM: {
20010 SDValue Mask = Op.getOperand(4);
20011 SDValue DataToCompress = Op.getOperand(3);
20012 SDValue Addr = Op.getOperand(2);
20013 SDValue Chain = Op.getOperand(0);
20014 MVT VT = DataToCompress.getSimpleValueType();
20016 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20017 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20019 if (isAllOnesConstant(Mask)) // return just a store
20020 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20021 MemIntr->getMemOperand());
20023 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20024 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20026 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20027 MemIntr->getMemOperand(),
20028 false /* truncating */, true /* compressing */);
20030 case TRUNCATE_TO_MEM_VI8:
20031 case TRUNCATE_TO_MEM_VI16:
20032 case TRUNCATE_TO_MEM_VI32: {
20033 SDValue Mask = Op.getOperand(4);
20034 SDValue DataToTruncate = Op.getOperand(3);
20035 SDValue Addr = Op.getOperand(2);
20036 SDValue Chain = Op.getOperand(0);
20038 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20039 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20041 EVT MemVT = MemIntr->getMemoryVT();
20043 uint16_t TruncationOp = IntrData->Opc0;
20044 switch (TruncationOp) {
20045 case X86ISD::VTRUNC: {
20046 if (isAllOnesConstant(Mask)) // return just a truncate store
20047 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20048 MemIntr->getMemOperand());
20050 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20051 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20053 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20054 MemIntr->getMemOperand(), true /* truncating */);
20056 case X86ISD::VTRUNCUS:
20057 case X86ISD::VTRUNCS: {
20058 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20059 if (isAllOnesConstant(Mask))
20060 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20061 MemIntr->getMemOperand(), DAG);
20063 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20064 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20066 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20067 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20068 }
20069 default:
20070 llvm_unreachable("Unsupported truncstore intrinsic");
20071 }
20072 }
20074 case EXPAND_FROM_MEM: {
20075 SDValue Mask = Op.getOperand(4);
20076 SDValue PassThru = Op.getOperand(3);
20077 SDValue Addr = Op.getOperand(2);
20078 SDValue Chain = Op.getOperand(0);
20079 MVT VT = Op.getSimpleValueType();
20081 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20082 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20084 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20085 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20086 if (X86::isZeroNode(Mask))
20087 return DAG.getUNDEF(VT);
20089 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20090 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20091 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20092 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20093 true /* expanding */);
20098 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20099 SelectionDAG &DAG) const {
20100 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20101 MFI.setReturnAddressIsTaken(true);
20103 if (verifyReturnAddressArgumentIsConstant(Op, DAG))
20104 return SDValue();
20106 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20107 SDLoc dl(Op);
20108 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20110 if (Depth > 0) {
20111 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20112 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20113 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20114 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20115 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20116 MachinePointerInfo());
20117 }
20119 // Just load the return address.
20120 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20121 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20122 MachinePointerInfo());
20125 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20126 SelectionDAG &DAG) const {
20127 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20128 return getReturnAddressFrameIndex(DAG);
20131 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20132 MachineFunction &MF = DAG.getMachineFunction();
20133 MachineFrameInfo &MFI = MF.getFrameInfo();
20134 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20135 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20136 EVT VT = Op.getValueType();
20138 MFI.setFrameAddressIsTaken(true);
20140 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
20141 // Depth > 0 makes no sense on targets which use Windows unwind codes. It
20142 // is not possible to crawl up the stack without looking at the unwind codes
20143 // simultaneously.
20144 int FrameAddrIndex = FuncInfo->getFAIndex();
20145 if (!FrameAddrIndex) {
20146 // Set up a frame object for the return address.
20147 unsigned SlotSize = RegInfo->getSlotSize();
20148 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20149 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20150 FuncInfo->setFAIndex(FrameAddrIndex);
20151 }
20152 return DAG.getFrameIndex(FrameAddrIndex, VT);
20153 }
20155 unsigned FrameReg =
20156 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20157 SDLoc dl(Op); // FIXME probably not meaningful
20158 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20159 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20160 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20161 "Invalid Frame Register!");
20162 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
20163 while (Depth--)
20164 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
20165 MachinePointerInfo());
20166 return FrameAddr;
20167 }
20169 // FIXME? Maybe this could be a TableGen attribute on some registers and
20170 // this table could be generated automatically from RegInfo.
20171 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20172 SelectionDAG &DAG) const {
20173 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20174 const MachineFunction &MF = DAG.getMachineFunction();
20176 unsigned Reg = StringSwitch<unsigned>(RegName)
20177 .Case("esp", X86::ESP)
20178 .Case("rsp", X86::RSP)
20179 .Case("ebp", X86::EBP)
20180 .Case("rbp", X86::RBP)
20183 if (Reg == X86::EBP || Reg == X86::RBP) {
20184 if (!TFI.hasFP(MF))
20185 report_fatal_error("register " + StringRef(RegName) +
20186 " is allocatable: function has no frame pointer");
20187 #ifndef NDEBUG
20188 else {
20189 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20190 unsigned FrameReg =
20191 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20192 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20193 "Invalid Frame Register!");
20194 }
20195 #endif
20196 }
20198 if (Reg)
20199 return Reg;
20201 report_fatal_error("Invalid register name global variable");
20202 }
20204 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20205 SelectionDAG &DAG) const {
20206 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20207 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20210 unsigned X86TargetLowering::getExceptionPointerRegister(
20211 const Constant *PersonalityFn) const {
20212 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20213 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20215 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20218 unsigned X86TargetLowering::getExceptionSelectorRegister(
20219 const Constant *PersonalityFn) const {
20220 // Funclet personalities don't use selectors (the runtime does the selection).
20221 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20222 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20225 bool X86TargetLowering::needsFixedCatchObjects() const {
20226 return Subtarget.isTargetWin64();
20229 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20230 SDValue Chain = Op.getOperand(0);
20231 SDValue Offset = Op.getOperand(1);
20232 SDValue Handler = Op.getOperand(2);
20234 SDLoc dl(Op);
20235 EVT PtrVT = getPointerTy(DAG.getDataLayout());
20236 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20237 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20238 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20239 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20240 "Invalid Frame Register!");
20241 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20242 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20244 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
20245 DAG.getIntPtrConstant(RegInfo->getSlotSize(),
20246 dl));
20247 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20248 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20249 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20251 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20252 DAG.getRegister(StoreAddrReg, PtrVT));
20255 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20256 SelectionDAG &DAG) const {
20258 // If the subtarget is not 64bit, we may need the global base reg
20259 // after isel expand pseudo, i.e., after CGBR pass ran.
20260 // Therefore, ask for the GlobalBaseReg now, so that the pass
20261 // inserts the code for us in case we need it.
20262 // Otherwise, we will end up in a situation where we will
20263 // reference a virtual register that is not defined!
20264 if (!Subtarget.is64Bit()) {
20265 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20266 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20268 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20269 DAG.getVTList(MVT::i32, MVT::Other),
20270 Op.getOperand(0), Op.getOperand(1));
20273 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20274 SelectionDAG &DAG) const {
20275 SDLoc DL(Op);
20276 return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20277 Op.getOperand(0), Op.getOperand(1));
20280 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20281 SelectionDAG &DAG) const {
20282 SDLoc DL(Op);
20283 return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
20284 Op.getOperand(0));
20285 }
20287 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20288 return Op.getOperand(0);
20291 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20292 SelectionDAG &DAG) const {
20293 SDValue Root = Op.getOperand(0);
20294 SDValue Trmp = Op.getOperand(1); // trampoline
20295 SDValue FPtr = Op.getOperand(2); // nested function
20296 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
20298 SDLoc dl(Op);
20299 const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20300 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20302 if (Subtarget.is64Bit()) {
20303 SDValue OutChains[6];
20305 // Large code-model.
20306 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20307 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20309 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20310 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
20312 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
20314 // Load the pointer to the nested function into R11.
20315 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
20316 SDValue Addr = Trmp;
20317 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20318 Addr, MachinePointerInfo(TrmpAddr));
20320 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20321 DAG.getConstant(2, dl, MVT::i64));
20322 OutChains[1] =
20323 DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
20324 /* Alignment = */ 2);
20326 // Load the 'nest' parameter value into R10.
20327 // R10 is specified in X86CallingConv.td
20328 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
20329 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20330 DAG.getConstant(10, dl, MVT::i64));
20331 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20332 Addr, MachinePointerInfo(TrmpAddr, 10));
20334 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20335 DAG.getConstant(12, dl, MVT::i64));
20336 OutChains[3] =
20337 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
20338 /* Alignment = */ 2);
20340 // Jump to the nested function.
20341 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
20342 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20343 DAG.getConstant(20, dl, MVT::i64));
20344 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
20345 Addr, MachinePointerInfo(TrmpAddr, 20));
20347 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
20348 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
20349 DAG.getConstant(22, dl, MVT::i64));
20350 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
20351 Addr, MachinePointerInfo(TrmpAddr, 22));
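// Illustrative note (not extra code): assuming the usual encodings of 2 for
// R10 and 3 for R11 computed above, the bytes stored by OutChains[0..5] form:
//   offset  0: 49 BB <imm64>   movabsq $<nested function>, %r11
//   offset 10: 49 BA <imm64>   movabsq $<nest value>, %r10
//   offset 20: 49 FF E3        jmpq   *%r11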
20353 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20354 } else {
20355 const Function *Func =
20356 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
20357 CallingConv::ID CC = Func->getCallingConv();
20358 unsigned NestReg;
20360 switch (CC) {
20361 default:
20362 llvm_unreachable("Unsupported calling convention");
20363 case CallingConv::C:
20364 case CallingConv::X86_StdCall: {
20365 // Pass 'nest' parameter in ECX.
20366 // Must be kept in sync with X86CallingConv.td
20367 NestReg = X86::ECX;
20369 // Check that ECX wasn't needed by an 'inreg' parameter.
20370 FunctionType *FTy = Func->getFunctionType();
20371 const AttributeSet &Attrs = Func->getAttributes();
20373 if (!Attrs.isEmpty() && !Func->isVarArg()) {
20374 unsigned InRegCount = 0;
20375 unsigned Idx = 1;
20377 for (FunctionType::param_iterator I = FTy->param_begin(),
20378 E = FTy->param_end(); I != E; ++I, ++Idx)
20379 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
20380 auto &DL = DAG.getDataLayout();
20381 // FIXME: should only count parameters that are lowered to integers.
20382 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
20383 }
20385 if (InRegCount > 2) {
20386 report_fatal_error("Nest register in use - reduce number of inreg"
20387 " parameters!");
20388 }
20389 }
20390 break;
20391 }
20392 case CallingConv::X86_FastCall:
20393 case CallingConv::X86_ThisCall:
20394 case CallingConv::Fast:
20395 // Pass 'nest' parameter in EAX.
20396 // Must be kept in sync with X86CallingConv.td
20397 NestReg = X86::EAX;
20398 break;
20399 }
20401 SDValue OutChains[4];
20402 SDValue Addr, Disp;
20404 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20405 DAG.getConstant(10, dl, MVT::i32));
20406 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
20408 // This is storing the opcode for MOV32ri.
20409 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
20410 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
20411 OutChains[0] =
20412 DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
20413 Trmp, MachinePointerInfo(TrmpAddr));
20415 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20416 DAG.getConstant(1, dl, MVT::i32));
20417 OutChains[1] =
20418 DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
20419 /* Alignment = */ 1);
20421 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
20422 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20423 DAG.getConstant(5, dl, MVT::i32));
20424 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
20425 Addr, MachinePointerInfo(TrmpAddr, 5),
20426 /* Alignment = */ 1);
20428 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
20429 DAG.getConstant(6, dl, MVT::i32));
20430 OutChains[3] =
20431 DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
20432 /* Alignment = */ 1);
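// Illustrative note (not extra code): with NestReg = ECX the 10-byte
// trampoline written above decodes as
//   offset 0: B9 <imm32>   movl $<nest value>, %ecx
//   offset 5: E9 <rel32>   jmp  <nested function>   (rel32 = FPtr - (Trmp + 10))
// EAX differs only in the first opcode byte (B8).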
20434 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
20435 }
20436 }
20438 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
20439 SelectionDAG &DAG) const {
20440 /*
20441 The rounding mode is in bits 11:10 of FPSR, and has the following
20442 settings:
20443 00 Round to nearest
20444 01 Round to -inf
20445 10 Round to +inf
20446 11 Round to 0
20448 FLT_ROUNDS, on the other hand, expects the following:
20449 -1 Undefined
20450 0 Round to 0
20451 1 Round to nearest
20452 2 Round to +inf
20453 3 Round to -inf
20455 To perform the conversion, we do:
20456 (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
20457 */
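// Worked example of the conversion above (RC = FPSR bits 11:10):
//   RC=00 (nearest): ((0|0)+1)&3 = 1    RC=01 (-inf): ((0|2)+1)&3 = 3
//   RC=10 (+inf):    ((1|0)+1)&3 = 2    RC=11 (0):    ((1|2)+1)&3 = 0
// i.e. the two RC bits are swapped, incremented, and masked to two bits.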
20459 MachineFunction &MF = DAG.getMachineFunction();
20460 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20461 unsigned StackAlignment = TFI.getStackAlignment();
20462 MVT VT = Op.getSimpleValueType();
20463 SDLoc DL(Op);
20465 // Save FP Control Word to stack slot
20466 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
20467 SDValue StackSlot =
20468 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
20470 MachineMemOperand *MMO =
20471 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
20472 MachineMemOperand::MOStore, 2, 2);
20474 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
20475 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
20476 DAG.getVTList(MVT::Other),
20477 Ops, MVT::i16, MMO);
20479 // Load FP Control Word from stack slot
20480 SDValue CWD =
20481 DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
20483 // Transform as necessary
20484 SDValue CWD1 =
20485 DAG.getNode(ISD::SRL, DL, MVT::i16,
20486 DAG.getNode(ISD::AND, DL, MVT::i16,
20487 CWD, DAG.getConstant(0x800, DL, MVT::i16)),
20488 DAG.getConstant(11, DL, MVT::i8));
20489 SDValue CWD2 =
20490 DAG.getNode(ISD::SRL, DL, MVT::i16,
20491 DAG.getNode(ISD::AND, DL, MVT::i16,
20492 CWD, DAG.getConstant(0x400, DL, MVT::i16)),
20493 DAG.getConstant(9, DL, MVT::i8));
20495 SDValue RetVal =
20496 DAG.getNode(ISD::AND, DL, MVT::i16,
20497 DAG.getNode(ISD::ADD, DL, MVT::i16,
20498 DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
20499 DAG.getConstant(1, DL, MVT::i16)),
20500 DAG.getConstant(3, DL, MVT::i16));
20502 return DAG.getNode((VT.getSizeInBits() < 16 ?
20503 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
20504 }
20506 /// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
20507 //
20508 // 1. i32/i64 128/256-bit vectors (native support requires VLX) are expanded
20509 // to a 512-bit vector.
20510 // 2. i8/i16 vectors are implemented using a dword LZCNT vector instruction
20511 // ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
20512 // split the vector, perform the operation on its Lo and Hi parts and
20513 // concatenate the results.
20514 static SDValue LowerVectorCTLZ_AVX512(SDValue Op, SelectionDAG &DAG) {
20515 assert(Op.getOpcode() == ISD::CTLZ);
20516 SDLoc dl(Op);
20517 MVT VT = Op.getSimpleValueType();
20518 MVT EltVT = VT.getVectorElementType();
20519 unsigned NumElems = VT.getVectorNumElements();
20521 if (EltVT == MVT::i64 || EltVT == MVT::i32) {
20522 // Extend to 512 bit vector.
20523 assert((VT.is256BitVector() || VT.is128BitVector()) &&
20524 "Unsupported value type for operation");
20526 MVT NewVT = MVT::getVectorVT(EltVT, 512 / VT.getScalarSizeInBits());
20527 SDValue Vec512 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
20528 DAG.getUNDEF(NewVT),
20529 Op.getOperand(0),
20530 DAG.getIntPtrConstant(0, dl));
20531 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Vec512);
20533 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, CtlzNode,
20534 DAG.getIntPtrConstant(0, dl));
20537 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
20538 "Unsupported element type");
20540 if (16 < NumElems) {
20541 // Split the vector; its Lo and Hi parts will be handled in the next iteration.
20542 SDValue Lo, Hi;
20543 std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
20544 MVT OutVT = MVT::getVectorVT(EltVT, NumElems/2);
20546 Lo = DAG.getNode(ISD::CTLZ, dl, OutVT, Lo);
20547 Hi = DAG.getNode(ISD::CTLZ, dl, OutVT, Hi);
20549 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
20552 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
20554 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
20555 "Unsupported value type for operation");
20557 // Use native supported vector instruction vplzcntd.
20558 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
20559 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
20560 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
20561 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
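// For example, for a v16i8 input each byte is zero-extended to 32 bits, so the
// dword lzcnt reports 32 - 8 = 24 extra leading zeros; Delta subtracts them to
// leave the 8-bit leading-zero count.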
20563 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
20566 // Lower CTLZ using a PSHUFB lookup table implementation.
20567 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
20568 const X86Subtarget &Subtarget,
20569 SelectionDAG &DAG) {
20570 MVT VT = Op.getSimpleValueType();
20571 int NumElts = VT.getVectorNumElements();
20572 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
20573 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
20575 // Per-nibble leading zero PSHUFB lookup table.
20576 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
20577 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
20578 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
20579 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
20581 SmallVector<SDValue, 64> LUTVec;
20582 for (int i = 0; i < NumBytes; ++i)
20583 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
20584 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
20586 // Begin by bitcasting the input to a byte vector, then split those bytes
20587 // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
20588 // If the hi input nibble is zero then we add both results together, otherwise
20589 // we just take the hi result (by masking the lo result to zero before the
20590 // add).
20591 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
20592 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
20594 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
20595 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
20596 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
20597 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
20598 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
20600 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
20601 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
20602 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
20603 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
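// Per-byte example: for the byte 0x05 the hi nibble is 0, so HiZ is all-ones
// and Res = LUT[5] + LUT[0] = 1 + 4 = 5 leading zeros; for 0x30 the hi nibble
// is 3, HiZ is zero, and Res = LUT[3] = 2 leading zeros.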
20605 // Merge result back from vXi8 back to VT, working on the lo/hi halves
20606 // of the current vector width in the same way we did for the nibbles.
20607 // If the upper half of the input element is zero then add the halves'
20608 // leading zero counts together, otherwise just use the upper half's.
20609 // Double the width of the result until we are at target width.
20610 while (CurrVT != VT) {
20611 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
20612 int CurrNumElts = CurrVT.getVectorNumElements();
20613 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
20614 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
20615 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
20617 // Check if the upper half of the input element is zero.
20618 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
20619 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
20620 HiZ = DAG.getBitcast(NextVT, HiZ);
20622 // Move the upper/lower halves to the lower bits as we'll be extending to
20623 // NextVT. Mask the lower result to zero if HiZ is true and add the results
20624 // together.
20625 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
20626 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
20627 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
20628 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
20629 Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
20631 CurrVT = NextVT;
20632 }
20634 return Res;
20635 }
20636 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
20637 const X86Subtarget &Subtarget,
20638 SelectionDAG &DAG) {
20639 MVT VT = Op.getSimpleValueType();
20640 SDValue Op0 = Op.getOperand(0);
20642 if (Subtarget.hasAVX512())
20643 return LowerVectorCTLZ_AVX512(Op, DAG);
20645 // Decompose 256-bit ops into smaller 128-bit ops.
20646 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
20647 unsigned NumElems = VT.getVectorNumElements();
20649 // Extract each 128-bit vector, perform ctlz and concat the result.
20650 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
20651 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
20653 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
20654 DAG.getNode(ISD::CTLZ, DL, LHS.getValueType(), LHS),
20655 DAG.getNode(ISD::CTLZ, DL, RHS.getValueType(), RHS));
20658 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
20659 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
20662 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
20663 SelectionDAG &DAG) {
20664 MVT VT = Op.getSimpleValueType();
20665 MVT OpVT = VT;
20666 unsigned NumBits = VT.getSizeInBits();
20667 SDLoc dl(Op);
20668 unsigned Opc = Op.getOpcode();
20670 if (VT.isVector())
20671 return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
20673 Op = Op.getOperand(0);
20674 if (VT == MVT::i8) {
20675 // Zero extend to i32 since there is not an i8 bsr.
20676 OpVT = MVT::i32;
20677 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
20678 }
20680 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
20681 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
20682 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
20684 if (Opc == ISD::CTLZ) {
20685 // If src is zero (i.e. bsr sets ZF), returns NumBits.
20686 SDValue Ops[] = {
20687 Op,
20688 DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
20689 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20690 Op.getValue(1)
20691 };
20692 Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
20693 }
20695 // Finally xor with NumBits-1.
20696 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
20697 DAG.getConstant(NumBits - 1, dl, OpVT));
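// BSR returns the index of the highest set bit, so for a power-of-two width
// ctlz == (NumBits - 1) - index == index ^ (NumBits - 1). E.g. for i32 with
// Op = 0x00010000, BSR gives 16 and 16 ^ 31 = 15, the leading-zero count.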
20699 if (VT == MVT::i8)
20700 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
20701 return Op;
20702 }
20704 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
20705 MVT VT = Op.getSimpleValueType();
20706 unsigned NumBits = VT.getScalarSizeInBits();
20707 SDLoc dl(Op);
20709 if (VT.isVector()) {
20710 SDValue N0 = Op.getOperand(0);
20711 SDValue Zero = DAG.getConstant(0, dl, VT);
20713 // lsb(x) = (x & -x)
20714 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
20715 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
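// Example on an i8 lane: for x = 0b01101000, LSB = x & -x = 0b00001000;
// ctlz(LSB) = 4 so cttz_undef = 7 - 4 = 3, and ctpop(LSB - 1) =
// ctpop(0b00000111) = 3 gives the same trailing-zero count.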
20717 // cttz_undef(x) = (width - 1) - ctlz(lsb)
20718 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
20719 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
20720 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
20721 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
20724 // cttz(x) = ctpop(lsb - 1)
20725 SDValue One = DAG.getConstant(1, dl, VT);
20726 return DAG.getNode(ISD::CTPOP, dl, VT,
20727 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
20730 assert(Op.getOpcode() == ISD::CTTZ &&
20731 "Only scalar CTTZ requires custom lowering");
20733 // Issue a bsf (scan bits forward) which also sets EFLAGS.
20734 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
20735 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
20737 // If src is zero (i.e. bsf sets ZF), returns NumBits.
20740 DAG.getConstant(NumBits, dl, VT),
20741 DAG.getConstant(X86::COND_E, dl, MVT::i8),
20744 return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
20747 /// Break a 256-bit integer operation into two new 128-bit ones and then
20748 /// concatenate the result back.
20749 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
20750 MVT VT = Op.getSimpleValueType();
20752 assert(VT.is256BitVector() && VT.isInteger() &&
20753 "Unsupported value type for operation");
20755 unsigned NumElems = VT.getVectorNumElements();
20757 SDLoc dl(Op);
20758 // Extract the LHS vectors
20759 SDValue LHS = Op.getOperand(0);
20760 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
20761 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
20763 // Extract the RHS vectors
20764 SDValue RHS = Op.getOperand(1);
20765 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
20766 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
20768 MVT EltVT = VT.getVectorElementType();
20769 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20771 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20772 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20773 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20776 /// Break a 512-bit integer operation into two new 256-bit ones and then
20777 /// concatenate the result back.
20778 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
20779 MVT VT = Op.getSimpleValueType();
20781 assert(VT.is512BitVector() && VT.isInteger() &&
20782 "Unsupported value type for operation");
20784 unsigned NumElems = VT.getVectorNumElements();
20786 SDLoc dl(Op);
20787 // Extract the LHS vectors
20788 SDValue LHS = Op.getOperand(0);
20789 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
20790 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
20792 // Extract the RHS vectors
20793 SDValue RHS = Op.getOperand(1);
20794 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
20795 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
20797 MVT EltVT = VT.getVectorElementType();
20798 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
20800 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
20801 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
20802 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
20805 static SDValue LowerADD(SDValue Op, SelectionDAG &DAG) {
20806 if (Op.getValueType() == MVT::i1)
20807 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20808 Op.getOperand(0), Op.getOperand(1));
20809 assert(Op.getSimpleValueType().is256BitVector() &&
20810 Op.getSimpleValueType().isInteger() &&
20811 "Only handle AVX 256-bit vector integer operation");
20812 return Lower256IntArith(Op, DAG);
20815 static SDValue LowerSUB(SDValue Op, SelectionDAG &DAG) {
20816 if (Op.getValueType() == MVT::i1)
20817 return DAG.getNode(ISD::XOR, SDLoc(Op), Op.getValueType(),
20818 Op.getOperand(0), Op.getOperand(1));
20819 assert(Op.getSimpleValueType().is256BitVector() &&
20820 Op.getSimpleValueType().isInteger() &&
20821 "Only handle AVX 256-bit vector integer operation");
20822 return Lower256IntArith(Op, DAG);
20825 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
20826 assert(Op.getSimpleValueType().is256BitVector() &&
20827 Op.getSimpleValueType().isInteger() &&
20828 "Only handle AVX 256-bit vector integer operation");
20829 return Lower256IntArith(Op, DAG);
20832 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
20833 SelectionDAG &DAG) {
20834 SDLoc dl(Op);
20835 MVT VT = Op.getSimpleValueType();
20837 if (VT == MVT::i1)
20838 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
20840 // Decompose 256-bit ops into smaller 128-bit ops.
20841 if (VT.is256BitVector() && !Subtarget.hasInt256())
20842 return Lower256IntArith(Op, DAG);
20844 SDValue A = Op.getOperand(0);
20845 SDValue B = Op.getOperand(1);
20847 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
20848 // vector pairs, multiply and truncate.
20849 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
20850 if (Subtarget.hasInt256()) {
20851 // For 512-bit vectors, split into 256-bit vectors to allow the
20852 // sign-extension to occur.
20853 if (VT == MVT::v64i8)
20854 return Lower512IntArith(Op, DAG);
20856 // For 256-bit vectors, split into 128-bit vectors to allow the
20857 // sign-extension to occur. We don't need this on AVX512BW as we can
20858 // safely sign-extend to v32i16.
20859 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
20860 return Lower256IntArith(Op, DAG);
20862 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
20863 return DAG.getNode(
20864 ISD::TRUNCATE, dl, VT,
20865 DAG.getNode(ISD::MUL, dl, ExVT,
20866 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
20867 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
20870 assert(VT == MVT::v16i8 &&
20871 "Pre-AVX2 support only supports v16i8 multiplication");
20872 MVT ExVT = MVT::v8i16;
20874 // Extract the lo parts and sign extend to i16
20875 SDValue ALo, BLo;
20876 if (Subtarget.hasSSE41()) {
20877 ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
20878 BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
20879 } else {
20880 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
20881 -1, 4, -1, 5, -1, 6, -1, 7};
20882 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20883 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20884 ALo = DAG.getBitcast(ExVT, ALo);
20885 BLo = DAG.getBitcast(ExVT, BLo);
20886 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
20887 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
20888 }
20890 // Extract the hi parts and sign extend to i16
20891 SDValue AHi, BHi;
20892 if (Subtarget.hasSSE41()) {
20893 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
20894 -1, -1, -1, -1, -1, -1, -1, -1};
20895 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20896 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20897 AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
20898 BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
20899 } else {
20900 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
20901 -1, 12, -1, 13, -1, 14, -1, 15};
20902 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
20903 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
20904 AHi = DAG.getBitcast(ExVT, AHi);
20905 BHi = DAG.getBitcast(ExVT, BHi);
20906 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
20907 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
20908 }
20910 // Multiply, mask the lower 8bits of the lo/hi results and pack
20911 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
20912 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
20913 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
20914 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
20915 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
20918 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
20919 if (VT == MVT::v4i32) {
20920 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
20921 "Should not custom lower when pmuldq is available!");
20923 // Extract the odd parts.
20924 static const int UnpackMask[] = { 1, -1, 3, -1 };
20925 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
20926 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
20928 // Multiply the even parts.
20929 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
20930 // Now multiply odd parts.
20931 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
20933 Evens = DAG.getBitcast(VT, Evens);
20934 Odds = DAG.getBitcast(VT, Odds);
20936 // Merge the two vectors back together with a shuffle. This expands into 2
20937 // shuffles.
20938 static const int ShufMask[] = { 0, 4, 2, 6 };
20939 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
20942 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
20943 "Only know how to lower V2I64/V4I64/V8I64 multiply");
20945 // 32-bit vector types used for MULDQ/MULUDQ.
20946 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
20948 // MULDQ returns the 64-bit result of the signed multiplication of the lower
20949 // 32-bits. We can lower with this if the sign bits stretch that far.
20950 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
20951 DAG.ComputeNumSignBits(B) > 32) {
20952 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
20953 DAG.getBitcast(MulVT, B));
20956 // Ahi = psrlqi(a, 32);
20957 // Bhi = psrlqi(b, 32);
20959 // AloBlo = pmuludq(a, b);
20960 // AloBhi = pmuludq(a, Bhi);
20961 // AhiBlo = pmuludq(Ahi, b);
20963 // Hi = psllqi(AloBhi + AhiBlo, 32);
20964 // return AloBlo + Hi;
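// This is just the schoolbook identity taken modulo 2^64:
//   a * b = (Ahi*2^32 + Alo) * (Bhi*2^32 + Blo)
//         = Alo*Blo + ((Alo*Bhi + Ahi*Blo) << 32)   (the Ahi*Bhi term overflows away)
// with each 32x32->64 partial product produced by PMULUDQ.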
20965 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
20966 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
20967 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
20969 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
20970 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
20971 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
20973 // Bit cast to 32-bit vectors for MULUDQ.
20974 SDValue Alo = DAG.getBitcast(MulVT, A);
20975 SDValue Blo = DAG.getBitcast(MulVT, B);
20977 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
20979 // Only multiply lo/hi halves that aren't known to be zero.
20980 SDValue AloBlo = Zero;
20981 if (!ALoIsZero && !BLoIsZero)
20982 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
20984 SDValue AloBhi = Zero;
20985 if (!ALoIsZero && !BHiIsZero) {
20986 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
20987 Bhi = DAG.getBitcast(MulVT, Bhi);
20988 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
20991 SDValue AhiBlo = Zero;
20992 if (!AHiIsZero && !BLoIsZero) {
20993 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
20994 Ahi = DAG.getBitcast(MulVT, Ahi);
20995 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
20998 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
20999 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21001 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21004 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21005 SelectionDAG &DAG) {
21006 SDLoc dl(Op);
21007 MVT VT = Op.getSimpleValueType();
21009 // Decompose 256-bit ops into smaller 128-bit ops.
21010 if (VT.is256BitVector() && !Subtarget.hasInt256())
21011 return Lower256IntArith(Op, DAG);
21013 // Only i8 vectors should need custom lowering after this.
21014 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21015 "Unsupported vector type");
21017 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21018 // logical shift down the upper half and pack back to i8.
21019 SDValue A = Op.getOperand(0);
21020 SDValue B = Op.getOperand(1);
21022 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21023 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21024 unsigned Opcode = Op.getOpcode();
21025 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21026 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21028 // AVX2 implementations - extend xmm subvectors to ymm.
21029 if (Subtarget.hasInt256()) {
21030 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21031 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21033 if (VT == MVT::v32i8) {
21034 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21035 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21036 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21037 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21038 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21039 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21040 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21041 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21042 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21043 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21044 DAG.getConstant(8, dl, MVT::v16i16));
21045 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21046 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21047 DAG.getConstant(8, dl, MVT::v16i16));
21048 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21049 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21050 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21051 16, 17, 18, 19, 20, 21, 22, 23};
21052 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21053 24, 25, 26, 27, 28, 29, 30, 31};
21054 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21055 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21056 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21059 SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
21060 SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
21061 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21062 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21063 DAG.getConstant(8, dl, MVT::v16i16));
21064 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21065 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21066 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21069 assert(VT == MVT::v16i8 &&
21070 "Pre-AVX2 support only supports v16i8 multiplication");
21071 MVT ExVT = MVT::v8i16;
21073 // Extract the lo parts and zero/sign extend to i16.
21074 SDValue ALo, BLo;
21075 if (Subtarget.hasSSE41()) {
21076 ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
21077 BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
21078 } else {
21079 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21080 -1, 4, -1, 5, -1, 6, -1, 7};
21081 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21082 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21083 ALo = DAG.getBitcast(ExVT, ALo);
21084 BLo = DAG.getBitcast(ExVT, BLo);
21085 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21086 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21087 }
21089 // Extract the hi parts and zero/sign extend to i16.
21090 SDValue AHi, BHi;
21091 if (Subtarget.hasSSE41()) {
21092 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21093 -1, -1, -1, -1, -1, -1, -1, -1};
21094 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21095 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21096 AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
21097 BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
21098 } else {
21099 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21100 -1, 12, -1, 13, -1, 14, -1, 15};
21101 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21102 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21103 AHi = DAG.getBitcast(ExVT, AHi);
21104 BHi = DAG.getBitcast(ExVT, BHi);
21105 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21106 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21107 }
21109 // Multiply, lshr the upper 8bits to the lower 8bits of the lo/hi results and
21110 // pack back to v16i8.
21111 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21112 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21113 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21114 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
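// E.g. for MULHU on a byte lane: 200 * 3 = 600 = 0x258; after the logical
// shift right by 8 the lane holds 0x02, the high byte of the 16-bit product.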
21115 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21118 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21119 assert(Subtarget.isTargetWin64() && "Unexpected target");
21120 EVT VT = Op.getValueType();
21121 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21122 "Unexpected return type for lowering");
21124 RTLIB::Libcall LC;
21125 bool isSigned;
21126 switch (Op->getOpcode()) {
21127 default: llvm_unreachable("Unexpected request for libcall!");
21128 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21129 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21130 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21131 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21132 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21133 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21134 }
21136 SDLoc dl(Op);
21137 SDValue InChain = DAG.getEntryNode();
21139 TargetLowering::ArgListTy Args;
21140 TargetLowering::ArgListEntry Entry;
21141 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21142 EVT ArgVT = Op->getOperand(i).getValueType();
21143 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21144 "Unexpected argument type for lowering");
21145 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21146 Entry.Node = StackPtr;
21147 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21148 MachinePointerInfo(), /* Alignment = */ 16);
21149 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21150 Entry.Ty = PointerType::get(ArgTy,0);
21151 Entry.isSExt = false;
21152 Entry.isZExt = false;
21153 Args.push_back(Entry);
21156 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21157 getPointerTy(DAG.getDataLayout()));
21159 TargetLowering::CallLoweringInfo CLI(DAG);
21160 CLI.setDebugLoc(dl).setChain(InChain)
21161 .setCallee(getLibcallCallingConv(LC),
21162 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
21163 Callee, std::move(Args))
21164 .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
21166 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21167 return DAG.getBitcast(VT, CallInfo.first);
21170 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21171 SelectionDAG &DAG) {
21172 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21173 MVT VT = Op0.getSimpleValueType();
21174 SDLoc dl(Op);
21176 // Decompose 256-bit ops into smaller 128-bit ops.
21177 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21178 unsigned Opcode = Op.getOpcode();
21179 unsigned NumElems = VT.getVectorNumElements();
21180 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21181 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21182 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21183 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21184 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21185 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21186 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21187 SDValue Ops[] = {
21188 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21189 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21190 };
21191 return DAG.getMergeValues(Ops, dl);
21194 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21195 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21197 // PMULxD operations multiply each even value (starting at 0) of LHS with
21198 // the related value of RHS and produce a widened result.
21199 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21200 // => <2 x i64> <ae|cg>
21202 // In other words, to have all the results, we need to perform two PMULxD:
21203 // 1. one with the even values.
21204 // 2. one with the odd values.
21205 // To achieve #2, we need to place the odd values at an even position.
21207 // Place the odd value at an even position (basically, shift all values 1
21208 // step to the left):
21209 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21210 // <a|b|c|d> => <b|undef|d|undef>
21211 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21212 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21213 // <e|f|g|h> => <f|undef|h|undef>
21214 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21215 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
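// E.g. for v4i32: Mul1 = PMULxD(<a|b|c|d>, <e|f|g|h>) = <ae|cg> and
//                 Mul2 = PMULxD(<b|?|d|?>, <f|?|h|?>) = <bf|dh>;
// the Low/High shuffles below then interleave the 32-bit halves of those
// 64-bit products into the final low and high result vectors.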
21217 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21218 // ints.
21219 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21220 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21221 unsigned Opcode =
21222 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21223 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21224 // => <2 x i64> <ae|cg>
21225 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21226 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21227 // => <2 x i64> <bf|dh>
21228 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21230 // Shuffle it back into the right order.
21231 SDValue Highs, Lows;
21232 if (VT == MVT::v8i32) {
21233 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21234 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21235 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21236 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21238 const int HighMask[] = {1, 5, 3, 7};
21239 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21240 const int LowMask[] = {0, 4, 2, 6};
21241 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21244 // If we have a signed multiply but no PMULDQ fix up the high parts of an
21245 // unsigned multiply.
21246 if (IsSigned && !Subtarget.hasSSE41()) {
21247 SDValue ShAmt = DAG.getConstant(
21248 31, dl,
21249 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21250 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21251 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21252 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21253 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21255 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21256 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21259 // The first result of MUL_LOHI is actually the low value, followed by the
21260 // high value.
21261 SDValue Ops[] = {Lows, Highs};
21262 return DAG.getMergeValues(Ops, dl);
21265 // Return true if the required (according to Opcode) shift-imm form is natively
21266 // supported by the Subtarget
21267 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21268 unsigned Opcode) {
21269 if (VT.getScalarSizeInBits() < 16)
21270 return false;
21272 if (VT.is512BitVector() &&
21273 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21274 return true;
21276 bool LShift = VT.is128BitVector() ||
21277 (VT.is256BitVector() && Subtarget.hasInt256());
21279 bool AShift = LShift && (Subtarget.hasVLX() ||
21280 (VT != MVT::v2i64 && VT != MVT::v4i64));
21281 return (Opcode == ISD::SRA) ? AShift : LShift;
21284 // The shift amount is a variable, but it is the same for all vector lanes.
21285 // These instructions are defined together with shift-immediate.
21286 static
21287 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21288 unsigned Opcode) {
21289 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21292 // Return true if the required (according to Opcode) variable-shift form is
21293 // natively supported by the Subtarget
21294 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21295 unsigned Opcode) {
21297 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21298 return false;
21300 // vXi16 supported only on AVX-512, BWI
21301 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21302 return false;
21304 if (VT.is512BitVector() || Subtarget.hasVLX())
21305 return true;
21307 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21308 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21309 return (Opcode == ISD::SRA) ? AShift : LShift;
21312 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21313 const X86Subtarget &Subtarget) {
21314 MVT VT = Op.getSimpleValueType();
21315 SDLoc dl(Op);
21316 SDValue R = Op.getOperand(0);
21317 SDValue Amt = Op.getOperand(1);
21319 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21320 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21322 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21323 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21324 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21325 SDValue Ex = DAG.getBitcast(ExVT, R);
21327 if (ShiftAmt >= 32) {
21328 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21329 SDValue Upper =
21330 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
21331 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21332 ShiftAmt - 32, DAG);
21333 if (VT == MVT::v2i64)
21334 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
21335 if (VT == MVT::v4i64)
21336 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21337 {9, 1, 11, 3, 13, 5, 15, 7});
21338 } else {
21339 // SRA upper i32, SHL whole i64 and select lower i32.
21340 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
21341 ShiftAmt, DAG);
21342 SDValue Lower =
21343 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
21344 Lower = DAG.getBitcast(ExVT, Lower);
21345 if (VT == MVT::v2i64)
21346 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
21347 if (VT == MVT::v4i64)
21348 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
21349 {8, 1, 10, 3, 12, 5, 14, 7});
21351 return DAG.getBitcast(VT, Ex);
21354 // Optimize shl/srl/sra with constant shift amount.
21355 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
21356 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
21357 uint64_t ShiftAmt = ShiftConst->getZExtValue();
21359 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21360 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21362 // i64 SRA needs to be performed as partial shifts.
21363 if ((VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
21364 Op.getOpcode() == ISD::SRA && !Subtarget.hasXOP())
21365 return ArithmeticShiftRight64(ShiftAmt);
21367 if (VT == MVT::v16i8 ||
21368 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
21369 VT == MVT::v64i8) {
21370 unsigned NumElts = VT.getVectorNumElements();
21371 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
21373 // Simple i8 add case
21374 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
21375 return DAG.getNode(ISD::ADD, dl, VT, R, R);
21377 // ashr(R, 7) === cmp_slt(R, 0)
21378 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
21379 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
21380 if (VT.is512BitVector()) {
21381 assert(VT == MVT::v64i8 && "Unexpected element type!");
21382 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
21383 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
21385 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
21388 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
21389 if (VT == MVT::v16i8 && Subtarget.hasXOP())
21392 if (Op.getOpcode() == ISD::SHL) {
21393 // Make a large shift.
21394 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
21396 SHL = DAG.getBitcast(VT, SHL);
21397 // Zero out the rightmost bits.
21398 return DAG.getNode(ISD::AND, dl, VT, SHL,
21399 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
21401 if (Op.getOpcode() == ISD::SRL) {
21402 // Make a large shift.
21403 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
21405 SRL = DAG.getBitcast(VT, SRL);
21406 // Zero out the leftmost bits.
21407 return DAG.getNode(ISD::AND, dl, VT, SRL,
21408 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
21410 if (Op.getOpcode() == ISD::SRA) {
21411 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
21412 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21414 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
21415 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
21416 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
21419 llvm_unreachable("Unknown shift opcode.");
21424 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21425 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
21426 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
21427 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
21429 // Peek through any splat that was introduced for i64 shift vectorization.
21430 int SplatIndex = -1;
21431 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
21432 if (SVN->isSplat()) {
21433 SplatIndex = SVN->getSplatIndex();
21434 Amt = Amt.getOperand(0);
21435 assert(SplatIndex < (int)VT.getVectorNumElements() &&
21436 "Splat shuffle referencing second operand");
21439 if (Amt.getOpcode() != ISD::BITCAST ||
21440 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
21443 Amt = Amt.getOperand(0);
21444 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21445 VT.getVectorNumElements();
21446 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
21447 uint64_t ShiftAmt = 0;
21448 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
21449 for (unsigned i = 0; i != Ratio; ++i) {
21450 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
21454 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
21457 // Check remaining shift amounts (if not a splat).
21458 if (SplatIndex < 0) {
21459 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21460 uint64_t ShAmt = 0;
21461 for (unsigned j = 0; j != Ratio; ++j) {
21462 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
21466 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
21468 if (ShAmt != ShiftAmt)
21473 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
21474 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
21476 if (Op.getOpcode() == ISD::SRA)
21477 return ArithmeticShiftRight64(ShiftAmt);
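// In the common 32-bit case above, a v2i64 shift amount arrives as a bitcast
// v4i32 build_vector, so Ratio = 2 and RatioInLog2 = 1; the two i32 halves
// are then OR'd into ShiftAmt at bit offsets i * (1 << 5), i.e. 0 and 32,
// reassembling the original 64-bit immediate before re-checking support.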
21483 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
21484 const X86Subtarget &Subtarget) {
21485 MVT VT = Op.getSimpleValueType();
21487 SDValue R = Op.getOperand(0);
21488 SDValue Amt = Op.getOperand(1);
21490 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21491 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21493 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
21494 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
21496 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
21498 MVT EltVT = VT.getVectorElementType();
21500 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
21501 // Check if this build_vector node is doing a splat.
21502 // If so, then set BaseShAmt equal to the splat value.
21503 BaseShAmt = BV->getSplatValue();
21504 if (BaseShAmt && BaseShAmt.isUndef())
21505 BaseShAmt = SDValue();
21507 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
21508 Amt = Amt.getOperand(0);
21510 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
21511 if (SVN && SVN->isSplat()) {
21512 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
21513 SDValue InVec = Amt.getOperand(0);
21514 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
21515 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
21516 "Unexpected shuffle index found!");
21517 BaseShAmt = InVec.getOperand(SplatIdx);
21518 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
21519 if (ConstantSDNode *C =
21520 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
21521 if (C->getZExtValue() == SplatIdx)
21522 BaseShAmt = InVec.getOperand(1);
21527 // Avoid introducing an extract element from a shuffle.
21528 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
21529 DAG.getIntPtrConstant(SplatIdx, dl));
21533 if (BaseShAmt.getNode()) {
21534 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
21535 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
21536 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
21537 else if (EltVT.bitsLT(MVT::i32))
21538 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
21540 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
21544 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
21545 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
21546 Amt.getOpcode() == ISD::BITCAST &&
21547 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
21548 Amt = Amt.getOperand(0);
21549 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
21550 VT.getVectorNumElements();
21551 std::vector<SDValue> Vals(Ratio);
21552 for (unsigned i = 0; i != Ratio; ++i)
21553 Vals[i] = Amt.getOperand(i);
21554 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
21555 for (unsigned j = 0; j != Ratio; ++j)
21556 if (Vals[j] != Amt.getOperand(i + j))
21560 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
21561 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
21566 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
21567 SelectionDAG &DAG) {
21568 MVT VT = Op.getSimpleValueType();
21570 SDValue R = Op.getOperand(0);
21571 SDValue Amt = Op.getOperand(1);
21572 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21574 assert(VT.isVector() && "Custom lowering only for vector shifts!");
21575 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
21577 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
21580 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
21583 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
21586 // XOP has 128-bit variable logical/arithmetic shifts.
21587 // +ve/-ve Amt = shift left/right.
21588 if (Subtarget.hasXOP() &&
21589 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
21590 VT == MVT::v8i16 || VT == MVT::v16i8)) {
21591 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
21592 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21593 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
21595 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
21596 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
21597 if (Op.getOpcode() == ISD::SRA)
21598 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
21601 // v2i64 vector logical shifts can efficiently avoid scalarization - do the
21602 // shifts per-lane and then shuffle the partial results back together.
21603 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
21604 // Splat the shift amounts so the scalar shifts above will catch it.
21605 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
21606 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
21607 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
21608 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
21609 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
21612 // i64 vector arithmetic shift can be emulated with the transform:
21613 // M = lshr(SIGN_BIT, Amt)
21614 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
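// A quick check of the identity at byte width (a sketch; the code itself
// works per i64 lane with SIGN_BIT = 1 << 63): for R = 0xF0 (-16) and
// Amt = 3, lshr(R, 3) = 0x1E and M = lshr(0x80, 3) = 0x10; the xor gives
// 0x0E and the sub gives 0x0E - 0x10 = 0xFE = -2, matching -16 >> 3
// arithmetically.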
21615 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
21616 Op.getOpcode() == ISD::SRA) {
21617 SDValue S = DAG.getConstant(APInt::getSignBit(64), dl, VT);
21618 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
21619 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
21620 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
21621 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
21625 // If possible, lower this packed shift into a vector multiply instead of
21626 // expanding it into a sequence of scalar shifts.
21627 // Do this only if the vector shift count is a constant build_vector.
21628 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
21629 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
21630 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
21631 SmallVector<SDValue, 8> Elts;
21632 MVT SVT = VT.getVectorElementType();
21633 unsigned SVTBits = SVT.getSizeInBits();
21634 APInt One(SVTBits, 1);
21635 unsigned NumElems = VT.getVectorNumElements();
21637 for (unsigned i=0; i !=NumElems; ++i) {
21638 SDValue Op = Amt->getOperand(i);
21639 if (Op->isUndef()) {
21640 Elts.push_back(Op);
21644 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
21645 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
21646 uint64_t ShAmt = C.getZExtValue();
21647 if (ShAmt >= SVTBits) {
21648 Elts.push_back(DAG.getUNDEF(SVT));
21651 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
21653 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
21654 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
21657 // Lower SHL with variable shift amount.
21658 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
21659 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
21661 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
21662 DAG.getConstant(0x3f800000U, dl, VT));
21663 Op = DAG.getBitcast(MVT::v4f32, Op);
21664 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
21665 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
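// A sketch of the trick above: the shift amount lands in the f32 exponent
// field (bits 23..30) and adding the bias 0x3f800000 (1.0f) builds 2^Amt per
// lane; FP_TO_SINT then yields the integer 2^Amt, so the final MUL computes
// R << Amt. For Amt = 5: (5 << 23) + 0x3f800000 = 0x42000000 = 32.0f, giving
// the multiplier 32.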
21668 // If possible, lower this shift as a sequence of two shifts by
21669 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
21671 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
21673 // Could be rewritten as:
21674 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
21676 // The advantage is that the two shifts from the example would be
21677 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
21678 // the vector shift into four scalar shifts plus four pairs of vector
21679 // insert/extract.
21680 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
21681 unsigned TargetOpcode = X86ISD::MOVSS;
21682 bool CanBeSimplified;
21683 // The splat value for the first packed shift (the 'X' from the example).
21684 SDValue Amt1 = Amt->getOperand(0);
21685 // The splat value for the second packed shift (the 'Y' from the example).
21686 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
21688 // See if it is possible to replace this node with a sequence of
21689 // two shifts followed by a MOVSS/MOVSD/PBLEND.
21690 if (VT == MVT::v4i32) {
21691 // Check if it is legal to use a MOVSS.
21692 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
21693 Amt2 == Amt->getOperand(3);
21694 if (!CanBeSimplified) {
21695 // Otherwise, check if we can still simplify this node using a MOVSD.
21696 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
21697 Amt->getOperand(2) == Amt->getOperand(3);
21698 TargetOpcode = X86ISD::MOVSD;
21699 Amt2 = Amt->getOperand(2);
21702 // Do similar checks for the case where the machine value type
21703 // is MVT::v8i16.
21704 CanBeSimplified = Amt1 == Amt->getOperand(1);
21705 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
21706 CanBeSimplified = Amt2 == Amt->getOperand(i);
21708 if (!CanBeSimplified) {
21709 TargetOpcode = X86ISD::MOVSD;
21710 CanBeSimplified = true;
21711 Amt2 = Amt->getOperand(4);
21712 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
21713 CanBeSimplified = Amt1 == Amt->getOperand(i);
21714 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
21715 CanBeSimplified = Amt2 == Amt->getOperand(j);
21719 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
21720 isa<ConstantSDNode>(Amt2)) {
21721 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
21722 MVT CastVT = MVT::v4i32;
21724 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
21725 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
21727 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
21728 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
21729 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
21730 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
21731 if (TargetOpcode == X86ISD::MOVSD)
21732 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21733 BitCast2, {0, 1, 6, 7}));
21734 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
21735 BitCast2, {0, 5, 6, 7}));
21739 // v4i32 Non Uniform Shifts.
21740 // If the shift amount is constant we can shift each lane using the SSE2
21741 // immediate shifts, else we need to zero-extend each lane to the lower i64
21742 // and shift using the SSE2 variable shifts.
21743 // The separate results can then be blended together.
21744 if (VT == MVT::v4i32) {
21745 unsigned Opc = Op.getOpcode();
21746 SDValue Amt0, Amt1, Amt2, Amt3;
21747 if (ConstantAmt) {
21748 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
21749 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
21750 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
21751 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
21752 } else {
21753 // ISD::SHL is handled above but we include it here for completeness.
21754 switch (Opc) {
21755 default:
21756 llvm_unreachable("Unknown target vector shift node");
21757 case ISD::SHL:
21758 Opc = X86ISD::VSHL;
21759 break;
21760 case ISD::SRL:
21761 Opc = X86ISD::VSRL;
21762 break;
21763 case ISD::SRA:
21764 Opc = X86ISD::VSRA;
21765 break;
21766 }
21767 // The SSE2 shifts use the lower i64 as the same shift amount for
21768 // all lanes and the upper i64 is ignored. These shuffle masks
21769 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
21770 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21771 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
21772 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
21773 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
21774 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
21775 }
21777 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
21778 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
21779 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
21780 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
21781 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
21782 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
21783 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
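// Note on the blends above: the {0, 4, -1, -1} style shuffles leave each
// per-lane amount zero-extended in the low i64, which is all the SSE2 shift
// instructions read; the final two blends then pick lane 0 from R0, lane 1
// from R1, lane 2 from R2 and lane 3 from R3 to reassemble the result.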
21786 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
21787 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
21788 // make the existing SSE solution better.
21789 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
21790 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
21791 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
21792 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
21793 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
21794 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
21796 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
21797 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
21798 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
21799 return DAG.getNode(ISD::TRUNCATE, dl, VT,
21800 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
21803 if (VT == MVT::v16i8 ||
21804 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
21805 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
21806 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
21807 unsigned ShiftOpcode = Op->getOpcode();
21809 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
21810 if (VT.is512BitVector()) {
21811 // On AVX512BW targets we make use of the fact that VSELECT lowers
21812 // to a masked blend which selects bytes based just on the sign bit
21813 // extracted to a mask.
21814 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
21815 V0 = DAG.getBitcast(VT, V0);
21816 V1 = DAG.getBitcast(VT, V1);
21817 Sel = DAG.getBitcast(VT, Sel);
21818 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
21819 return DAG.getBitcast(SelVT,
21820 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
21821 } else if (Subtarget.hasSSE41()) {
21822 // On SSE41 targets we make use of the fact that VSELECT lowers
21823 // to PBLENDVB which selects bytes based just on the sign bit.
21824 V0 = DAG.getBitcast(VT, V0);
21825 V1 = DAG.getBitcast(VT, V1);
21826 Sel = DAG.getBitcast(VT, Sel);
21827 return DAG.getBitcast(SelVT,
21828 DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1));
21830 // On pre-SSE41 targets we test for the sign bit by comparing to
21831 // zero - a negative value will set all bits of the lanes to true
21832 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
21833 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
21834 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
21835 return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1);
21838 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
21839 // We can safely do this using i16 shifts as we're only interested in
21840 // the 3 lower bits of each byte.
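// After this shift the three amount bits sit in bits 5..7 of each byte, so
// the sign-bit selects below apply shift-by-4, then shift-by-2, then
// shift-by-1 whenever the corresponding bit is set, composing any per-byte
// shift of 0..7.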
21841 Amt = DAG.getBitcast(ExtVT, Amt);
21842 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
21843 Amt = DAG.getBitcast(VT, Amt);
21845 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
21846 // r = VSELECT(r, shift(r, 4), a);
21848 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21849 R = SignBitSelect(VT, Amt, M, R);
21852 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21854 // r = VSELECT(r, shift(r, 2), a);
21855 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21856 R = SignBitSelect(VT, Amt, M, R);
21859 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21861 // return VSELECT(r, shift(r, 1), a);
21862 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
21863 R = SignBitSelect(VT, Amt, M, R);
21867 if (Op->getOpcode() == ISD::SRA) {
21868 // For SRA we need to unpack each byte to the higher byte of an i16 vector
21869 // so we can correctly sign extend. We don't care what happens to the
21870 // lower byte.
21871 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
21872 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
21873 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
21874 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
21875 ALo = DAG.getBitcast(ExtVT, ALo);
21876 AHi = DAG.getBitcast(ExtVT, AHi);
21877 RLo = DAG.getBitcast(ExtVT, RLo);
21878 RHi = DAG.getBitcast(ExtVT, RHi);
21880 // r = VSELECT(r, shift(r, 4), a);
21881 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21882 DAG.getConstant(4, dl, ExtVT));
21883 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21884 DAG.getConstant(4, dl, ExtVT));
21885 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21886 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21889 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21890 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21892 // r = VSELECT(r, shift(r, 2), a);
21893 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21894 DAG.getConstant(2, dl, ExtVT));
21895 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21896 DAG.getConstant(2, dl, ExtVT));
21897 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21898 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21901 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
21902 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
21904 // r = VSELECT(r, shift(r, 1), a);
21905 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
21906 DAG.getConstant(1, dl, ExtVT));
21907 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
21908 DAG.getConstant(1, dl, ExtVT));
21909 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
21910 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
21912 // Logical shift the result back to the lower byte, leaving a zero upper
21914 // byte, meaning that we can safely pack with PACKUSWB.
21916 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
21918 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
21919 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21923 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
21924 MVT ExtVT = MVT::v8i32;
21925 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
21926 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
21927 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
21928 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
21929 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
21930 ALo = DAG.getBitcast(ExtVT, ALo);
21931 AHi = DAG.getBitcast(ExtVT, AHi);
21932 RLo = DAG.getBitcast(ExtVT, RLo);
21933 RHi = DAG.getBitcast(ExtVT, RHi);
21934 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
21935 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
21936 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
21937 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
21938 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
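// The idea here (roughly): each i16 sits in the upper half of an i32 lane, so
// a single variable i32 shift of any of the three kinds leaves the 16-bit
// result in bits 16..31; the SRL by 16 moves it back down with a zero upper
// half, letting PACKUS renarrow both halves to v16i16. On AVX2 the i32 shifts
// typically select to vpsllvd/vpsrlvd/vpsravd.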
21941 if (VT == MVT::v8i16) {
21942 unsigned ShiftOpcode = Op->getOpcode();
21944 // If we have a constant shift amount, the non-SSE41 path is best as
21945 // avoiding bitcasts makes it easier to constant fold and reduce to PBLENDW.
21946 bool UseSSE41 = Subtarget.hasSSE41() &&
21947 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
21949 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
21950 // On SSE41 targets we make use of the fact that VSELECT lowers
21951 // to PBLENDVB which selects bytes based just on the sign bit.
21953 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
21954 V0 = DAG.getBitcast(ExtVT, V0);
21955 V1 = DAG.getBitcast(ExtVT, V1);
21956 Sel = DAG.getBitcast(ExtVT, Sel);
21957 return DAG.getBitcast(
21958 VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1));
21960 // On pre-SSE41 targets we splat the sign bit - a negative value will
21961 // set all bits of the lanes to true and VSELECT uses that in
21962 // its OR(AND(V0,C),AND(V1,~C)) lowering.
21964 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
21965 return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1);
21968 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
21969 if (UseSSE41) {
21970 // On SSE41 targets we need to replicate the shift mask in both
21971 // bytes for PBLENDVB.
21972 Amt = DAG.getNode(
21973 ISD::OR, dl, VT,
21974 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
21975 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
21976 } else {
21977 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
21978 }
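// The four amount bits now sit in bits 12..15 of each lane (replicated into
// the high byte for PBLENDVB on SSE41), so the selects below apply
// shift-by-8, then 4, 2 and 1 as each amount bit reaches the sign-bit
// position, composing any per-lane shift of 0..15.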
21980 // r = VSELECT(r, shift(r, 8), a);
21981 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
21982 R = SignBitSelect(Amt, M, R);
21985 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21987 // r = VSELECT(r, shift(r, 4), a);
21988 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
21989 R = SignBitSelect(Amt, M, R);
21992 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
21994 // r = VSELECT(r, shift(r, 2), a);
21995 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
21996 R = SignBitSelect(Amt, M, R);
21999 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22001 // return VSELECT(r, shift(r, 1), a);
22002 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22003 R = SignBitSelect(Amt, M, R);
22007 // Decompose 256-bit shifts into smaller 128-bit shifts.
22008 if (VT.is256BitVector())
22009 return Lower256IntArith(Op, DAG);
22014 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22015 SelectionDAG &DAG) {
22016 MVT VT = Op.getSimpleValueType();
22018 SDValue R = Op.getOperand(0);
22019 SDValue Amt = Op.getOperand(1);
22021 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22022 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22023 assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported");
22025 // XOP has 128-bit vector variable + immediate rotates.
22026 // +ve/-ve Amt = rotate left/right.
22028 // Split 256-bit integers.
22029 if (VT.is256BitVector())
22030 return Lower256IntArith(Op, DAG);
22032 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22034 // Attempt to rotate by immediate.
22035 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22036 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22037 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22038 assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range");
22039 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22040 DAG.getConstant(RotateAmt, DL, MVT::i8));
22044 // Use general rotate by variable (per-element).
22045 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22048 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22049 // Lower the "add/sub/mul with overflow" instruction into a regular arithmetic op plus
22050 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22051 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22052 // has only one use.
22053 SDNode *N = Op.getNode();
22054 SDValue LHS = N->getOperand(0);
22055 SDValue RHS = N->getOperand(1);
22056 unsigned BaseOp = 0;
22057 X86::CondCode Cond;
22059 switch (Op.getOpcode()) {
22060 default: llvm_unreachable("Unknown ovf instruction!");
22062 // An add of one will be selected as an INC. Note that INC doesn't
22063 // set CF, so we can't do this for UADDO.
22064 if (isOneConstant(RHS)) {
22065 BaseOp = X86ISD::INC;
22066 Cond = X86::COND_O;
22069 BaseOp = X86ISD::ADD;
22070 Cond = X86::COND_O;
22073 BaseOp = X86ISD::ADD;
22074 Cond = X86::COND_B;
22077 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22078 // set CF, so we can't do this for USUBO.
22079 if (isOneConstant(RHS)) {
22080 BaseOp = X86ISD::DEC;
22081 Cond = X86::COND_O;
22084 BaseOp = X86ISD::SUB;
22085 Cond = X86::COND_O;
22088 BaseOp = X86ISD::SUB;
22089 Cond = X86::COND_B;
22092 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22093 Cond = X86::COND_O;
22095 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22096 if (N->getValueType(0) == MVT::i8) {
22097 BaseOp = X86ISD::UMUL8;
22098 Cond = X86::COND_O;
22101 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22103 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22105 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22107 if (N->getValueType(1) == MVT::i1)
22108 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22110 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22114 // Also sets EFLAGS.
22115 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22116 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22118 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22120 if (N->getValueType(1) == MVT::i1)
22121 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22123 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22126 /// Returns true if the operand type is exactly twice the native width, and
22127 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22128 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22129 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22130 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22131 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22134 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22135 else if (OpWidth == 128)
22136 return Subtarget.hasCmpxchg16b();
22141 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22142 return needsCmpXchgNb(SI->getValueOperand()->getType());
22145 // Note: this turns large loads into lock cmpxchg8b/16b.
22146 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22147 TargetLowering::AtomicExpansionKind
22148 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22149 auto PTy = cast<PointerType>(LI->getPointerOperand()->getType());
22150 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22151 : AtomicExpansionKind::None;
22154 TargetLowering::AtomicExpansionKind
22155 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22156 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22157 Type *MemType = AI->getType();
22159 // If the operand is too big, we must see if cmpxchg8/16b is available
22160 // and default to library calls otherwise.
22161 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22162 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22163 : AtomicExpansionKind::None;
22166 AtomicRMWInst::BinOp Op = AI->getOperation();
22169 llvm_unreachable("Unknown atomic operation");
22170 case AtomicRMWInst::Xchg:
22171 case AtomicRMWInst::Add:
22172 case AtomicRMWInst::Sub:
22173 // It's better to use xadd, xsub or xchg for these in all cases.
22174 return AtomicExpansionKind::None;
22175 case AtomicRMWInst::Or:
22176 case AtomicRMWInst::And:
22177 case AtomicRMWInst::Xor:
22178 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22179 // prefix to a normal instruction for these operations.
22180 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22181 : AtomicExpansionKind::None;
22182 case AtomicRMWInst::Nand:
22183 case AtomicRMWInst::Max:
22184 case AtomicRMWInst::Min:
22185 case AtomicRMWInst::UMax:
22186 case AtomicRMWInst::UMin:
22187 // These always require a non-trivial set of data operations on x86. We must
22188 // use a cmpxchg loop.
22189 return AtomicExpansionKind::CmpXChg;
22194 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22195 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22196 Type *MemType = AI->getType();
22197 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22198 // there is no benefit in turning such RMWs into loads, and it is actually
22199 // harmful as it introduces a mfence.
22200 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22203 auto Builder = IRBuilder<>(AI);
22204 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22205 auto SynchScope = AI->getSynchScope();
22206 // We must restrict the ordering to avoid generating loads with Release or
22207 // ReleaseAcquire orderings.
22208 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22209 auto Ptr = AI->getPointerOperand();
22211 // Before the load we need a fence. Here is an example lifted from
22212 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22213 // is required:
22214 // Thread 0:
22215 //   x.store(1, relaxed);
22216 //   r1 = y.fetch_add(0, release);
22217 // Thread 1:
22218 //   y.fetch_add(42, acquire);
22219 //   r2 = x.load(relaxed);
22220 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22221 // lowered to just a load without a fence. A mfence flushes the store buffer,
22222 // making the optimization clearly correct.
22223 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22224 // otherwise, we might be able to be more aggressive on relaxed idempotent
22225 // rmw. In practice, they do not look useful, so we don't try to be
22226 // especially clever.
22227 if (SynchScope == SingleThread)
22228 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22229 // the IR level, so we must wrap it in an intrinsic.
22230 return nullptr;
22232 if (!Subtarget.hasMFence())
22233 // FIXME: it might make sense to use a locked operation here but on a
22234 // different cache-line to prevent cache-line bouncing. In practice it
22235 // is probably a small win, and x86 processors without mfence are rare
22236 // enough that we do not bother.
22237 return nullptr;
22239 Function *MFence =
22240 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22241 Builder.CreateCall(MFence, {});
22243 // Finally we can emit the atomic load.
22244 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22245 AI->getType()->getPrimitiveSizeInBits());
22246 Loaded->setAtomic(Order, SynchScope);
22247 AI->replaceAllUsesWith(Loaded);
22248 AI->eraseFromParent();
22252 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22253 SelectionDAG &DAG) {
22255 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22256 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22257 SynchronizationScope FenceScope = static_cast<SynchronizationScope>(
22258 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22260 // The only fence that needs an instruction is a sequentially-consistent
22261 // cross-thread fence.
22262 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22263 FenceScope == CrossThread) {
22264 if (Subtarget.hasMFence())
22265 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
22267 SDValue Chain = Op.getOperand(0);
22268 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22270 DAG.getRegister(X86::ESP, MVT::i32), // Base
22271 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
22272 DAG.getRegister(0, MVT::i32), // Index
22273 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
22274 DAG.getRegister(0, MVT::i32), // Segment.
22278 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
22279 return SDValue(Res, 0);
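// In effect, on targets without MFENCE the node built above emits the classic
// fence substitute: a LOCK-prefixed OR of zero into the word at the top of
// the stack, whose locked read-modify-write acts as a full memory barrier
// without changing the stack slot.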
22282 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
22283 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
22286 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
22287 SelectionDAG &DAG) {
22288 MVT T = Op.getSimpleValueType();
22292 switch(T.SimpleTy) {
22293 default: llvm_unreachable("Invalid value type!");
22294 case MVT::i8: Reg = X86::AL; size = 1; break;
22295 case MVT::i16: Reg = X86::AX; size = 2; break;
22296 case MVT::i32: Reg = X86::EAX; size = 4; break;
22298 assert(Subtarget.is64Bit() && "Node not type legal!");
22299 Reg = X86::RAX; size = 8;
22302 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
22303 Op.getOperand(2), SDValue());
22304 SDValue Ops[] = { cpIn.getValue(0),
22307 DAG.getTargetConstant(size, DL, MVT::i8),
22308 cpIn.getValue(1) };
22309 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
22310 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
22311 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
22315 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
22316 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
22317 MVT::i32, cpOut.getValue(2));
22318 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
22320 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
22321 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
22322 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
22326 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
22327 SelectionDAG &DAG) {
22328 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
22329 MVT DstVT = Op.getSimpleValueType();
22331 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
22332 SrcVT == MVT::i64) {
22333 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
22334 if (DstVT != MVT::f64)
22335 // This conversion needs to be expanded.
22338 SDValue Op0 = Op->getOperand(0);
22339 SmallVector<SDValue, 16> Elts;
22343 if (SrcVT.isVector()) {
22344 NumElts = SrcVT.getVectorNumElements();
22345 SVT = SrcVT.getVectorElementType();
22347 // Widen the input vector in the case of MVT::v2i32.
22348 // Example: from MVT::v2i32 to MVT::v4i32.
22349 for (unsigned i = 0, e = NumElts; i != e; ++i)
22350 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
22351 DAG.getIntPtrConstant(i, dl)));
22353 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
22354 "Unexpected source type in LowerBITCAST");
22355 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22356 DAG.getIntPtrConstant(0, dl)));
22357 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
22358 DAG.getIntPtrConstant(1, dl)));
22362 // Explicitly mark the extra elements as Undef.
22363 Elts.append(NumElts, DAG.getUNDEF(SVT));
22365 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
22366 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
22367 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
22368 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
22369 DAG.getIntPtrConstant(0, dl));
22372 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
22373 Subtarget.hasMMX() && "Unexpected custom BITCAST");
22374 assert((DstVT == MVT::i64 ||
22375 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
22376 "Unexpected custom BITCAST");
22377 // i64 <=> MMX conversions are Legal.
22378 if (SrcVT==MVT::i64 && DstVT.isVector())
22380 if (DstVT==MVT::i64 && SrcVT.isVector())
22382 // MMX <=> MMX conversions are Legal.
22383 if (SrcVT.isVector() && DstVT.isVector())
22385 // All other conversions need to be expanded.
22389 /// Compute the horizontal sum of bytes in V for the elements of VT.
22391 /// Requires V to be a byte vector and VT to be an integer vector type with
22392 /// wider elements than V's type. The width of the elements of VT determines
22393 /// how many bytes of V are summed horizontally to produce each element of the
22394 /// result.
22395 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
22396 const X86Subtarget &Subtarget,
22397 SelectionDAG &DAG) {
22399 MVT ByteVecVT = V.getSimpleValueType();
22400 MVT EltVT = VT.getVectorElementType();
22401 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
22402 "Expected value to have byte element type.");
22403 assert(EltVT != MVT::i8 &&
22404 "Horizontal byte sum only makes sense for wider elements!");
22405 unsigned VecSize = VT.getSizeInBits();
22406 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
22408 // The PSADBW instruction horizontally adds all bytes and leaves the result in
22409 // i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
22410 if (EltVT == MVT::i64) {
22411 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22412 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22413 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
22414 return DAG.getBitcast(VT, V);
22417 if (EltVT == MVT::i32) {
22418 // We unpack the low half and high half into i32s interleaved with zeros so
22419 // that we can use PSADBW to horizontally sum them. The most useful part of
22420 // this is that it lines up the results of two PSADBW instructions to be
22421 // two v2i64 vectors which concatenated are the 4 population counts. We can
22422 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
22423 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
22424 SDValue V32 = DAG.getBitcast(VT, V);
22425 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
22426 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
22428 // Do the horizontal sums into two v2i64s.
22429 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
22430 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
22431 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22432 DAG.getBitcast(ByteVecVT, Low), Zeros);
22433 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
22434 DAG.getBitcast(ByteVecVT, High), Zeros);
22436 // Merge them together.
22437 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
22438 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
22439 DAG.getBitcast(ShortVecVT, Low),
22440 DAG.getBitcast(ShortVecVT, High));
22442 return DAG.getBitcast(VT, V);
22445 // The only element type left is i16.
22446 assert(EltVT == MVT::i16 && "Unknown how to handle type");
22448 // To obtain pop count for each i16 element starting from the pop count for
22449 // i8 elements, shift the i16s left by 8, sum as i8s, and then shift as i16s
22450 // right by 8. It is important to shift as i16s as i8 vector shift isn't
22451 // directly supported.
22452 SDValue ShifterV = DAG.getConstant(8, DL, VT);
22453 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
22454 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
22455 DAG.getBitcast(ByteVecVT, V));
22456 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
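// A sketch for one i16 lane holding byte pop counts {lo = p0, hi = p1}: the
// i16 SHL by 8 produces {0, p0}, the byte-wise ADD gives {p0, p0 + p1}, and
// the i16 SRL by 8 leaves p0 + p1 in the lane - the desired i16 pop count.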
22459 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
22460 const X86Subtarget &Subtarget,
22461 SelectionDAG &DAG) {
22462 MVT VT = Op.getSimpleValueType();
22463 MVT EltVT = VT.getVectorElementType();
22464 unsigned VecSize = VT.getSizeInBits();
22466 // Implement a lookup table in register by using an algorithm based on:
22467 // http://wm.ite.pl/articles/sse-popcount.html
22469 // The general idea is that every lower byte nibble in the input vector is an
22470 // index into an in-register pre-computed pop count table. We then split up the
22471 // input vector into two new ones: (1) a vector with only the shifted-right
22472 // higher nibbles for each byte and (2) a vector with the lower nibbles (and
22473 // masked out higher ones) for each byte. PSHUFB is used separately with both
22474 // to index the in-register table. Next, both are added and the result is an
22475 // i8 vector where each element contains the pop count for its input byte.
22477 // To obtain the pop count for elements != i8, we follow up with the same
22478 // approach and use additional tricks as described below.
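// For a single byte, roughly: input 0xB7 has high nibble 0xB and low nibble
// 0x7; LUT[0xB] = 3 and LUT[0x7] = 3, and their sum 6 is popcount(0xB7).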
22480 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
22481 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
22482 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
22483 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
22485 int NumByteElts = VecSize / 8;
22486 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
22487 SDValue In = DAG.getBitcast(ByteVecVT, Op);
22488 SmallVector<SDValue, 64> LUTVec;
22489 for (int i = 0; i < NumByteElts; ++i)
22490 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
22491 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
22492 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
22495 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
22496 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
22499 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
22501 // The input vector is used as the shuffle mask that indexes elements into the
22502 // LUT. After counting low and high nibbles, add the vector to obtain the
22503 // final pop count per i8 element.
22504 SDValue HighPopCnt =
22505 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
22506 SDValue LowPopCnt =
22507 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
22508 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
22510 if (EltVT == MVT::i8)
22513 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
22516 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
22517 const X86Subtarget &Subtarget,
22518 SelectionDAG &DAG) {
22519 MVT VT = Op.getSimpleValueType();
22520 assert(VT.is128BitVector() &&
22521 "Only 128-bit vector bitmath lowering supported.");
22523 int VecSize = VT.getSizeInBits();
22524 MVT EltVT = VT.getVectorElementType();
22525 int Len = EltVT.getSizeInBits();
22527 // This is the vectorized version of the "best" algorithm from
22528 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
22529 // with a minor tweak to use a series of adds + shifts instead of vector
22530 // multiplications. Implemented for all integer vector types. We only use
22531 // this when we don't have SSSE3 which allows a LUT-based lowering that is
22532 // much faster, even faster than using native popcnt instructions.
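// For reference, the scalar form of the same bit-math per 8-bit lane:
//   v = v - ((v >> 1) & 0x55);
//   v = (v & 0x33) + ((v >> 2) & 0x33);
//   v = (v + (v >> 4)) & 0x0F;
// which is the add/shift/mask sequence emitted below, widened to the element
// type via the APInt::getSplat masks.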
22534 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
22535 MVT VT = V.getSimpleValueType();
22536 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
22537 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
22539 auto GetMask = [&](SDValue V, APInt Mask) {
22540 MVT VT = V.getSimpleValueType();
22541 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
22542 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
22545 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
22546 // x86, so set the SRL type to have elements at least i16 wide. This is
22547 // correct because all of our SRLs are followed immediately by a mask anyways
22548 // that handles any bits that sneak into the high bits of the byte elements.
22549 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
22553 // v = v - ((v >> 1) & 0x55555555...)
22555 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
22556 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
22557 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
22559 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
22560 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
22561 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
22562 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
22563 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
22565 // v = (v + (v >> 4)) & 0x0F0F0F0F...
22566 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
22567 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
22568 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
22570 // At this point, V contains the byte-wise population count, and we are
22571 // merely doing a horizontal sum if necessary to get the wider element
22572 // types.
22573 if (EltVT == MVT::i8)
22576 return LowerHorizontalByteSum(
22577 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
22581 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
22582 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
22583 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22584 SelectionDAG &DAG) {
22585 MVT VT = Op.getSimpleValueType();
22586 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
22587 "Unknown CTPOP type to handle");
22588 SDLoc DL(Op.getNode());
22589 SDValue Op0 = Op.getOperand(0);
22591 if (!Subtarget.hasSSSE3()) {
22592 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
22593 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
22594 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
22597 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22598 unsigned NumElems = VT.getVectorNumElements();
22600 // Extract each 128-bit vector, compute pop count and concat the result.
22601 SDValue LHS = extract128BitVector(Op0, 0, DAG, DL);
22602 SDValue RHS = extract128BitVector(Op0, NumElems / 2, DAG, DL);
22604 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22605 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22606 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22609 if (VT.is512BitVector() && !Subtarget.hasBWI()) {
22610 unsigned NumElems = VT.getVectorNumElements();
22612 // Extract each 256-bit vector, compute pop count and concat the result.
22613 SDValue LHS = extract256BitVector(Op0, 0, DAG, DL);
22614 SDValue RHS = extract256BitVector(Op0, NumElems / 2, DAG, DL);
22616 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22617 LowerVectorCTPOPInRegLUT(LHS, DL, Subtarget, DAG),
22618 LowerVectorCTPOPInRegLUT(RHS, DL, Subtarget, DAG));
22621 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
22624 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
22625 SelectionDAG &DAG) {
22626 assert(Op.getSimpleValueType().isVector() &&
22627 "We only do custom lowering for vector population count.");
22628 return LowerVectorCTPOP(Op, Subtarget, DAG);
22631 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
22632 MVT VT = Op.getSimpleValueType();
22633 SDValue In = Op.getOperand(0);
22636 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
22637 // perform the BITREVERSE.
22638 if (!VT.isVector()) {
22639 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
22640 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
22641 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
22642 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
22643 DAG.getIntPtrConstant(0, DL));
22646 MVT SVT = VT.getVectorElementType();
22647 int NumElts = VT.getVectorNumElements();
22648 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
22650 // Decompose 256-bit ops into smaller 128-bit ops.
22651 if (VT.is256BitVector()) {
22652 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22653 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22655 MVT HalfVT = MVT::getVectorVT(SVT, NumElts / 2);
22656 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT,
22657 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo),
22658 DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi));
22661 assert(VT.is128BitVector() &&
22662 "Only 128-bit vector bitreverse lowering supported.");
22664 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
22665 // perform the BSWAP in the shuffle.
22666 // It's best to shuffle using the second operand as this will implicitly allow
22667 // memory folding for multiple vectors.
22668 SmallVector<SDValue, 16> MaskElts;
22669 for (int i = 0; i != NumElts; ++i) {
22670 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
22671 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
22672 int PermuteByte = SourceByte | (2 << 5);
22673 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
22677 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
22678 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
22679 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
22681 return DAG.getBitcast(VT, Res);
22684 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
22685 SelectionDAG &DAG) {
22686 if (Subtarget.hasXOP())
22687 return LowerBITREVERSE_XOP(Op, DAG);
22689 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
22691 MVT VT = Op.getSimpleValueType();
22692 SDValue In = Op.getOperand(0);
22695 unsigned NumElts = VT.getVectorNumElements();
22696 assert(VT.getScalarType() == MVT::i8 &&
22697 "Only byte vector BITREVERSE supported");
22699 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
22700 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
22701 MVT HalfVT = MVT::getVectorVT(MVT::i8, NumElts / 2);
22702 SDValue Lo = extract128BitVector(In, 0, DAG, DL);
22703 SDValue Hi = extract128BitVector(In, NumElts / 2, DAG, DL);
22704 Lo = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Lo);
22705 Hi = DAG.getNode(ISD::BITREVERSE, DL, HalfVT, Hi);
22706 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
22709 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
22710 // two nibbles, and a PSHUFB lookup finds the bit-reverse of each
22711 // 0-15 value (moved to the other nibble).
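// A sketch for one byte: reversing 0x1E (binary 00011110). The low nibble 0xE
// indexes LoLUT[0xE] = 0x70 and the high nibble 0x1 indexes HiLUT[0x1] = 0x08;
// OR-ing them gives 0x78 (binary 01111000), the bit-reverse of 0x1E.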
22712 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
22713 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
22714 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
22716 const int LoLUT[16] = {
22717 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
22718 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
22719 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
22720 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
22721 const int HiLUT[16] = {
22722 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
22723 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
22724 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
22725 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
22727 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
22728 for (unsigned i = 0; i < NumElts; ++i) {
22729 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
22730 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
22733 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
22734 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
22735 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
22736 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
22737 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
22740 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
22741 unsigned NewOpc = 0;
22742 switch (N->getOpcode()) {
22743 case ISD::ATOMIC_LOAD_ADD:
22744 NewOpc = X86ISD::LADD;
22746 case ISD::ATOMIC_LOAD_SUB:
22747 NewOpc = X86ISD::LSUB;
22749 case ISD::ATOMIC_LOAD_OR:
22750 NewOpc = X86ISD::LOR;
22752 case ISD::ATOMIC_LOAD_XOR:
22753 NewOpc = X86ISD::LXOR;
22755 case ISD::ATOMIC_LOAD_AND:
22756 NewOpc = X86ISD::LAND;
22759 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
22762 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
22763 return DAG.getMemIntrinsicNode(
22764 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
22765 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
22766 /*MemVT=*/N->getSimpleValueType(0), MMO);
22769 /// Lower atomic_load_ops into LOCK-prefixed operations.
22770 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
22771 const X86Subtarget &Subtarget) {
22772 SDValue Chain = N->getOperand(0);
22773 SDValue LHS = N->getOperand(1);
22774 SDValue RHS = N->getOperand(2);
22775 unsigned Opc = N->getOpcode();
22776 MVT VT = N->getSimpleValueType(0);
22779 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
22780 // can only be lowered when the result is unused. They should have already
22781 // been transformed into a cmpxchg loop in AtomicExpand.
22782 if (N->hasAnyUseOfValue(0)) {
22783 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
22784 // select LXADD if LOCK_SUB can't be selected.
22785 if (Opc == ISD::ATOMIC_LOAD_SUB) {
22786 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
22787 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
22788 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
22789 RHS, AN->getMemOperand());
22791 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
22792 "Used AtomicRMW ops other than Add should have been expanded!");
22796 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
22797 // RAUW the chain, but don't worry about the result, as it's unused.
22798 assert(!N->hasAnyUseOfValue(0));
22799 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
22803 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
22804 SDNode *Node = Op.getNode();
22806 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
22808 // Convert seq_cst store -> xchg
22809 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
22810 // FIXME: On 32-bit, store -> fist or movq would be more efficient
22811 // (The only way to get a 16-byte store is cmpxchg16b)
22812 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
22813 if (cast<AtomicSDNode>(Node)->getOrdering() ==
22814 AtomicOrdering::SequentiallyConsistent ||
22815 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
22816 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
22817 cast<AtomicSDNode>(Node)->getMemoryVT(),
22818 Node->getOperand(0),
22819 Node->getOperand(1), Node->getOperand(2),
22820 cast<AtomicSDNode>(Node)->getMemOperand());
22821 return Swap.getValue(1);
22823 // Other atomic stores have a simple pattern.
22827 static SDValue LowerADDC_ADDE_SUBC_SUBE(SDValue Op, SelectionDAG &DAG) {
22828 MVT VT = Op.getNode()->getSimpleValueType(0);
22830 // Let legalize expand this if it isn't a legal type yet.
22831 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
22834 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
22837 bool ExtraOp = false;
22838 switch (Op.getOpcode()) {
22839 default: llvm_unreachable("Invalid code");
22840 case ISD::ADDC: Opc = X86ISD::ADD; break;
22841 case ISD::ADDE: Opc = X86ISD::ADC; ExtraOp = true; break;
22842 case ISD::SUBC: Opc = X86ISD::SUB; break;
22843 case ISD::SUBE: Opc = X86ISD::SBB; ExtraOp = true; break;
22847 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22849 return DAG.getNode(Opc, SDLoc(Op), VTs, Op.getOperand(0),
22850 Op.getOperand(1), Op.getOperand(2));
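/// Lower FSINCOS on 64-bit Darwin by calling the __sincos_stret or
/// __sincosf_stret entry point, which returns the sine and cosine together.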
22853 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
22854 SelectionDAG &DAG) {
22855 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
22857 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
22858 // which returns the values as { float, float } (in XMM0) or
22859 // { double, double } (which is returned in XMM0, XMM1).
22861 SDValue Arg = Op.getOperand(0);
22862 EVT ArgVT = Arg.getValueType();
22863 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
22865 TargetLowering::ArgListTy Args;
22866 TargetLowering::ArgListEntry Entry;
22870 Entry.isSExt = false;
22871 Entry.isZExt = false;
22872 Args.push_back(Entry);
22874 bool isF64 = ArgVT == MVT::f64;
22875 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
22876 // the small struct {f32, f32} is returned in (eax, edx). For f64,
22877 // the results are returned via SRet in memory.
22878 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
22879 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
22881 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
22883 Type *RetTy = isF64
22884 ? (Type*)StructType::get(ArgTy, ArgTy, nullptr)
22885 : (Type*)VectorType::get(ArgTy, 4);
22887 TargetLowering::CallLoweringInfo CLI(DAG);
22888 CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
22889 .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
22891 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
22894 // Returned in xmm0 and xmm1.
22895 return CallResult.first;
22897 // Returned in bits 0:31 and 32:63 of xmm0.
22898 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22899 CallResult.first, DAG.getIntPtrConstant(0, dl));
22900 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
22901 CallResult.first, DAG.getIntPtrConstant(1, dl));
22902 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
22903 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
22906 /// Widen a vector input to a vector of NVT. The
22907 /// input vector must have the same element type as NVT.
22908 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
22909 bool FillWithZeroes = false) {
22910 // Check if InOp already has the right width.
22911 MVT InVT = InOp.getSimpleValueType();
22915 if (InOp.isUndef())
22916 return DAG.getUNDEF(NVT);
22918 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
22919 "input and widen element type must match");
22921 unsigned InNumElts = InVT.getVectorNumElements();
22922 unsigned WidenNumElts = NVT.getVectorNumElements();
22923 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
22924 "Unexpected request for vector widening");
22926 EVT EltVT = NVT.getVectorElementType();
22929 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
22930 InOp.getNumOperands() == 2) {
22931 SDValue N1 = InOp.getOperand(1);
22932 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
22934 InOp = InOp.getOperand(0);
22935 InVT = InOp.getSimpleValueType();
22936 InNumElts = InVT.getVectorNumElements();
22939 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
22940 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
22941 SmallVector<SDValue, 16> Ops;
22942 for (unsigned i = 0; i < InNumElts; ++i)
22943 Ops.push_back(InOp.getOperand(i));
22945 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
22946 DAG.getUNDEF(EltVT);
22947 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
22948 Ops.push_back(FillVal);
22949 return DAG.getBuildVector(NVT, dl, Ops);
22951 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
22953 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
22954 InOp, DAG.getIntPtrConstant(0, dl));
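/// Lower a masked scatter for AVX-512 targets, widening the data, index and
/// mask operands to legal vector types where necessary.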
22957 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
22958 SelectionDAG &DAG) {
22959 assert(Subtarget.hasAVX512() &&
22960 "MGATHER/MSCATTER are supported on AVX-512 arch only");
22962 // X86 scatter instructions kill the mask register, so its type should be
22963 // added to the list of return values.
22964 // If the "scatter" has 2 return values, it is already handled.
22965 if (Op.getNode()->getNumValues() == 2)
22968 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
22969 SDValue Src = N->getValue();
22970 MVT VT = Src.getSimpleValueType();
22971 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
22974 SDValue NewScatter;
22975 SDValue Index = N->getIndex();
22976 SDValue Mask = N->getMask();
22977 SDValue Chain = N->getChain();
22978 SDValue BasePtr = N->getBasePtr();
22979 MVT MemVT = N->getMemoryVT().getSimpleVT();
22980 MVT IndexVT = Index.getSimpleValueType();
22981 MVT MaskVT = Mask.getSimpleValueType();
22983 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
22984 // The v2i32 value was promoted to v2i64.
22985 // Now we "redo" the type legalizer's work and widen the original
22986 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
22988 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
22989 "Unexpected memory type");
22990 int ShuffleMask[] = {0, 2, -1, -1};
22991 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
22992 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
22993 // Now we have 4 elements instead of 2.
22994 // Expand the index.
22995 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
22996 Index = ExtendToType(Index, NewIndexVT, DAG);
22998 // Expand the mask with zeroes
22999 // Mask may be <2 x i64> or <2 x i1> at this moment
23000 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23001 "Unexpected mask type");
23002 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23003 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23007 unsigned NumElts = VT.getVectorNumElements();
23008 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23009 !Index.getSimpleValueType().is512BitVector()) {
23010 // AVX512F supports only 512-bit vectors. Either the data or the index
23011 // must be 512 bits wide. If both the index and the data are 256-bit but
23012 // the vector contains 8 elements, we just sign-extend the index.
23013 if (IndexVT == MVT::v8i32)
23014 // Just extend index
23015 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23017 // The minimal number of elts in scatter is 8
23020 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23021 // Use original index here, do not modify the index twice
23022 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23023 if (IndexVT.getScalarType() == MVT::i32)
23024 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23027 // At this point we have a promoted mask operand
23028 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23029 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23030 // Use the original mask here, do not modify the mask twice
23031 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23033 // The value that should be stored
23034 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23035 Src = ExtendToType(Src, NewVT, DAG);
23038 // If the mask is "wide" at this point, truncate it to an i1 vector
23039 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23040 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23042 // The mask is killed by the scatter; add it to the list of return values
23043 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23044 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23045 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23046 N->getMemOperand());
23047 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23048 return SDValue(NewScatter.getNode(), 1);
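/// Lower a masked load for AVX-512 targets without VLX by widening the data
/// and mask operands to a 512-bit vector and extracting the original width
/// from the result.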
23051 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23052 SelectionDAG &DAG) {
23054 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23055 MVT VT = Op.getSimpleValueType();
23056 MVT ScalarVT = VT.getScalarType();
23057 SDValue Mask = N->getMask();
23060 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23061 "Expanding masked load is supported on AVX-512 target only!");
23063 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23064 "Expanding masked load is supported for 32 and 64-bit types only!");
23066 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23067 // VLX. Expanding loads of these types are still handled by the widening below.
23068 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23071 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23072 "Cannot lower masked load op.");
23074 assert((ScalarVT.getSizeInBits() >= 32 ||
23075 (Subtarget.hasBWI() &&
23076 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23077 "Unsupported masked load op.");
23079 // This operation is legal for targets with VLX, but without
23080 // VLX the vector should be widened to 512 bits
23081 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23082 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23083 SDValue Src0 = N->getSrc0();
23084 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23086 // Mask element has to be i1.
23087 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23088 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23089 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
23091 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23093 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23094 if (MaskEltTy != MVT::i1)
23095 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23096 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23097 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23098 N->getBasePtr(), Mask, Src0,
23099 N->getMemoryVT(), N->getMemOperand(),
23100 N->getExtensionType(),
23101 N->isExpandingLoad());
23103 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23104 NewLoad.getValue(0),
23105 DAG.getIntPtrConstant(0, dl));
23106 SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
23107 return DAG.getMergeValues(RetOps, dl);
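/// Lower a masked store for AVX-512 targets without VLX by widening the data
/// and mask operands to a 512-bit vector.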
23110 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23111 SelectionDAG &DAG) {
23112 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23113 SDValue DataToStore = N->getValue();
23114 MVT VT = DataToStore.getSimpleValueType();
23115 MVT ScalarVT = VT.getScalarType();
23116 SDValue Mask = N->getMask();
23119 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23120 "Expanding masked load is supported on AVX-512 target only!");
23122 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23123 "Expanding masked load is supported for 32 and 64-bit types only!");
23125 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23126 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23129 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23130 "Cannot lower masked store op.");
23132 assert((ScalarVT.getSizeInBits() >= 32 ||
23133 (Subtarget.hasBWI() &&
23134 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23135 "Unsupported masked store op.");
23137 // This operation is legal for targets with VLX, but without
23138 // VLX the vector should be widened to 512 bits
23139 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23140 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23142 // Mask element has to be i1.
23143 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23144 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23145 "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
23147 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23149 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23150 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23151 if (MaskEltTy != MVT::i1)
23152 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23153 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23154 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23155 Mask, N->getMemoryVT(), N->getMemOperand(),
23156 N->isTruncatingStore(), N->isCompressingStore());
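/// Lower a masked gather for AVX-512 targets, widening the index, mask and
/// pass-through operands where needed and extracting the original width from
/// the gathered result.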
23159 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23160 SelectionDAG &DAG) {
23161 assert(Subtarget.hasAVX512() &&
23162 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23164 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23166 MVT VT = Op.getSimpleValueType();
23167 SDValue Index = N->getIndex();
23168 SDValue Mask = N->getMask();
23169 SDValue Src0 = N->getValue();
23170 MVT IndexVT = Index.getSimpleValueType();
23171 MVT MaskVT = Mask.getSimpleValueType();
23173 unsigned NumElts = VT.getVectorNumElements();
23174 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23176 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23177 !Index.getSimpleValueType().is512BitVector()) {
23178 // AVX512F supports only 512-bit vectors. Either the data or the index
23179 // must be 512 bits wide. If both the index and the data are 256-bit but
23180 // the vector contains 8 elements, we just sign-extend the index.
23181 if (NumElts == 8) {
23182 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23183 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23184 N->getOperand(3), Index };
23185 DAG.UpdateNodeOperands(N, Ops);
23189 // Minimal number of elements in Gather
23192 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23193 Index = ExtendToType(Index, NewIndexVT, DAG);
23194 if (IndexVT.getScalarType() == MVT::i32)
23195 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23198 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23199 // At this point we have a promoted mask operand
23200 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23201 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23202 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23203 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23205 // The pass-thru value
23206 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23207 Src0 = ExtendToType(Src0, NewVT, DAG);
23209 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23210 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23211 N->getMemoryVT(), dl, Ops,
23212 N->getMemOperand());
23213 SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23214 NewGather.getValue(0),
23215 DAG.getIntPtrConstant(0, dl));
23216 SDValue RetOps[] = {Extract, NewGather.getValue(1)};
23217 return DAG.getMergeValues(RetOps, dl);
23222 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23223 SelectionDAG &DAG) const {
23224 // TODO: Eventually, the lowering of these nodes should be informed by or
23225 // deferred to the GC strategy for the function in which they appear. For
23226 // now, however, they must be lowered to something. Since they are logically
23227 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23228 // require special handling for these nodes), lower them as literal NOOPs for
23230 SmallVector<SDValue, 2> Ops;
23232 Ops.push_back(Op.getOperand(0));
23233 if (Op->getGluedNode())
23234 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23237 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23238 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23243 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
23244 SelectionDAG &DAG) const {
23245 // TODO: Eventually, the lowering of these nodes should be informed by or
23246 // deferred to the GC strategy for the function in which they appear. For
23247 // now, however, they must be lowered to something. Since they are logically
23248 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23249 // require special handling for these nodes), lower them as literal NOOPs for
23251 SmallVector<SDValue, 2> Ops;
23253 Ops.push_back(Op.getOperand(0));
23254 if (Op->getGluedNode())
23255 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
23258 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
23259 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
23264 /// Provide custom lowering hooks for some operations.
23265 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
23266 switch (Op.getOpcode()) {
23267 default: llvm_unreachable("Should not custom lower this!");
23268 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
23269 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
23270 return LowerCMP_SWAP(Op, Subtarget, DAG);
23271 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
23272 case ISD::ATOMIC_LOAD_ADD:
23273 case ISD::ATOMIC_LOAD_SUB:
23274 case ISD::ATOMIC_LOAD_OR:
23275 case ISD::ATOMIC_LOAD_XOR:
23276 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
23277 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
23278 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
23279 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
23280 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
23281 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
23282 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
23283 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
23284 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
23285 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
23286 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
23287 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, DAG);
23288 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
23289 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
23290 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
23291 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
23292 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
23293 case ISD::SHL_PARTS:
23294 case ISD::SRA_PARTS:
23295 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
23296 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
23297 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
23298 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
23299 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
23300 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
23301 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
23302 case ISD::ZERO_EXTEND_VECTOR_INREG:
23303 case ISD::SIGN_EXTEND_VECTOR_INREG:
23304 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
23305 case ISD::FP_TO_SINT:
23306 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, Subtarget, DAG);
23307 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
23308 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
23310 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
23311 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
23312 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
23313 case ISD::SETCC: return LowerSETCC(Op, DAG);
23314 case ISD::SETCCE: return LowerSETCCE(Op, DAG);
23315 case ISD::SELECT: return LowerSELECT(Op, DAG);
23316 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
23317 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
23318 case ISD::VASTART: return LowerVASTART(Op, DAG);
23319 case ISD::VAARG: return LowerVAARG(Op, DAG);
23320 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
23321 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
23322 case ISD::INTRINSIC_VOID:
23323 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
23324 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
23325 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
23326 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
23327 case ISD::FRAME_TO_ARGS_OFFSET:
23328 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
23329 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
23330 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
23331 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
23332 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
23333 case ISD::EH_SJLJ_SETUP_DISPATCH:
23334 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
23335 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
23336 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
23337 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
23339 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
23341 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
23342 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
23344 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
23345 case ISD::UMUL_LOHI:
23346 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
23347 case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG);
23350 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
23356 case ISD::UMULO: return LowerXALUO(Op, DAG);
23357 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
23358 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
23362 case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG);
23363 case ISD::ADD: return LowerADD(Op, DAG);
23364 case ISD::SUB: return LowerSUB(Op, DAG);
23368 case ISD::UMIN: return LowerMINMAX(Op, DAG);
23369 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
23370 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
23371 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
23372 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
23373 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
23374 case ISD::GC_TRANSITION_START:
23375 return LowerGC_TRANSITION_START(Op, DAG);
23376 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
23377 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
23381 /// Places new result values for the node in Results (their number
23382 /// and types must exactly match those of the original return values of
23383 /// the node), or leaves Results empty, which indicates that the node is not
23384 /// to be custom lowered after all.
23385 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
23386 SmallVectorImpl<SDValue> &Results,
23387 SelectionDAG &DAG) const {
23388 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
23390 if (!Res.getNode())
23393 assert((N->getNumValues() <= Res->getNumValues()) &&
23394 "Lowering returned the wrong number of results!");
23396 // Place the new result values based on N's result numbers.
23397 // In some cases (LowerSINT_TO_FP for example) Res has more result values
23398 // than the original node; the chain (the last value) should be dropped.
23399 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
23400 Results.push_back(Res.getValue(I));
23403 /// Replace a node with an illegal result type with a new node built out of
23405 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
23406 SmallVectorImpl<SDValue>&Results,
23407 SelectionDAG &DAG) const {
23409 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23410 switch (N->getOpcode()) {
23412 llvm_unreachable("Do not know how to custom type legalize this operation!");
23413 case X86ISD::AVG: {
23414 // Legalize types for X86ISD::AVG by expanding vectors.
23415 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23417 auto InVT = N->getValueType(0);
23418 auto InVTSize = InVT.getSizeInBits();
23419 const unsigned RegSize =
23420 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
23421 assert((Subtarget.hasBWI() || RegSize < 512) &&
23422 "512-bit vector requires AVX512BW");
23423 assert((Subtarget.hasAVX2() || RegSize < 256) &&
23424 "256-bit vector requires AVX2");
23426 auto ElemVT = InVT.getVectorElementType();
23427 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
23428 RegSize / ElemVT.getSizeInBits());
23429 assert(RegSize % InVT.getSizeInBits() == 0);
23430 unsigned NumConcat = RegSize / InVT.getSizeInBits();
23432 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
23433 Ops[0] = N->getOperand(0);
23434 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23435 Ops[0] = N->getOperand(1);
23436 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
23438 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
23439 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
23440 DAG.getIntPtrConstant(0, dl)));
23443 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
23444 case X86ISD::FMINC:
23446 case X86ISD::FMAXC:
23447 case X86ISD::FMAX: {
23448 EVT VT = N->getValueType(0);
23449 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
23450 SDValue UNDEF = DAG.getUNDEF(VT);
23451 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23452 N->getOperand(0), UNDEF);
23453 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
23454 N->getOperand(1), UNDEF);
23455 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
23463 case ISD::UDIVREM: {
23464 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
23465 Results.push_back(V);
23468 case ISD::FP_TO_SINT:
23469 case ISD::FP_TO_UINT: {
23470 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
23472 if (N->getValueType(0) == MVT::v2i32) {
23473 assert((IsSigned || Subtarget.hasAVX512()) &&
23474 "Can only handle signed conversion without AVX512");
23475 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23476 SDValue Src = N->getOperand(0);
23477 if (Src.getValueType() == MVT::v2f64) {
23478 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23479 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
23480 : X86ISD::CVTTP2UI,
23481 dl, MVT::v4i32, Src);
23482 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23483 Results.push_back(Res);
23486 if (Src.getValueType() == MVT::v2f32) {
23487 SDValue Idx = DAG.getIntPtrConstant(0, dl);
23488 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
23489 DAG.getUNDEF(MVT::v2f32));
23490 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
23491 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
23492 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
23493 Results.push_back(Res);
23497 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
23498 // so early out here.
23502 std::pair<SDValue,SDValue> Vals =
23503 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
23504 SDValue FIST = Vals.first, StackSlot = Vals.second;
23505 if (FIST.getNode()) {
23506 EVT VT = N->getValueType(0);
23507 // Return a load from the stack slot.
23508 if (StackSlot.getNode())
23510 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
23512 Results.push_back(FIST);
23516 case ISD::SINT_TO_FP: {
23517 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
23518 SDValue Src = N->getOperand(0);
23519 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
23521 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
23524 case ISD::UINT_TO_FP: {
23525 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23526 EVT VT = N->getValueType(0);
23527 if (VT != MVT::v2f32)
23529 SDValue Src = N->getOperand(0);
23530 EVT SrcVT = Src.getValueType();
23531 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
23532 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
23535 if (SrcVT != MVT::v2i32)
23537 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
23539 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
23540 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
23541 DAG.getBitcast(MVT::v2i64, VBias));
23542 Or = DAG.getBitcast(MVT::v2f64, Or);
23543 // TODO: Are there any fast-math-flags to propagate here?
23544 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
23545 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
23548 case ISD::FP_ROUND: {
23549 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
23551 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
23552 Results.push_back(V);
23555 case ISD::FP_EXTEND: {
23556 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
23557 // No other ValueType for FP_EXTEND should reach this point.
23558 assert(N->getValueType(0) == MVT::v2f32 &&
23559 "Do not know how to legalize this Node");
23562 case ISD::INTRINSIC_W_CHAIN: {
23563 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
23565 default : llvm_unreachable("Do not know how to custom type "
23566 "legalize this intrinsic operation!");
23567 case Intrinsic::x86_rdtsc:
23568 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23570 case Intrinsic::x86_rdtscp:
23571 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
23573 case Intrinsic::x86_rdpmc:
23574 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
23576 case Intrinsic::x86_xgetbv:
23577 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
23580 case ISD::INTRINSIC_WO_CHAIN: {
23581 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
23582 Results.push_back(V);
23585 case ISD::READCYCLECOUNTER: {
23586 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
23589 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
23590 EVT T = N->getValueType(0);
23591 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
23592 bool Regs64bit = T == MVT::i128;
23593 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
23594 SDValue cpInL, cpInH;
23595 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23596 DAG.getConstant(0, dl, HalfT));
23597 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
23598 DAG.getConstant(1, dl, HalfT));
23599 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
23600 Regs64bit ? X86::RAX : X86::EAX,
23602 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
23603 Regs64bit ? X86::RDX : X86::EDX,
23604 cpInH, cpInL.getValue(1));
23605 SDValue swapInL, swapInH;
23606 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23607 DAG.getConstant(0, dl, HalfT));
23608 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
23609 DAG.getConstant(1, dl, HalfT));
23611 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
23612 swapInH, cpInH.getValue(1));
23613 // If the current function needs the base pointer, RBX,
23614 // we shouldn't use cmpxchg directly. The lowering of that
23615 // instruction will clobber that register, and since RBX will be
23616 // a reserved register the register allocator will not make sure
23617 // its value is properly saved and restored around this
23618 // live range.
23619 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
23621 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23622 unsigned BasePtr = TRI->getBaseRegister();
23623 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
23624 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
23625 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
23626 // ISel prefers the LCMPXCHG64 variant.
23627 // If that assert breaks, it means this is no longer the case,
23628 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
23629 // not just EBX. This is a matter of accepting an i64 input for that
23630 // pseudo, and restoring into the register of the right width
23631 // when expanding the pseudo. Everything else should just work.
23632 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
23633 "Saving only half of the RBX");
23634 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
23635 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
23636 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
23637 Regs64bit ? X86::RBX : X86::EBX,
23638 HalfT, swapInH.getValue(1));
23639 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
23641 /*Glue*/ RBXSave.getValue(2)};
23642 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23645 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
23646 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
23647 Regs64bit ? X86::RBX : X86::EBX, swapInL,
23648 swapInH.getValue(1));
23649 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
23650 swapInL.getValue(1)};
23651 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
23653 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
23654 Regs64bit ? X86::RAX : X86::EAX,
23655 HalfT, Result.getValue(1));
23656 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
23657 Regs64bit ? X86::RDX : X86::EDX,
23658 HalfT, cpOutL.getValue(2));
23659 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
23661 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
23662 MVT::i32, cpOutH.getValue(2));
23663 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
23664 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
23666 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
23667 Results.push_back(Success);
23668 Results.push_back(EFLAGS.getValue(1));
23671 case ISD::ATOMIC_SWAP:
23672 case ISD::ATOMIC_LOAD_ADD:
23673 case ISD::ATOMIC_LOAD_SUB:
23674 case ISD::ATOMIC_LOAD_AND:
23675 case ISD::ATOMIC_LOAD_OR:
23676 case ISD::ATOMIC_LOAD_XOR:
23677 case ISD::ATOMIC_LOAD_NAND:
23678 case ISD::ATOMIC_LOAD_MIN:
23679 case ISD::ATOMIC_LOAD_MAX:
23680 case ISD::ATOMIC_LOAD_UMIN:
23681 case ISD::ATOMIC_LOAD_UMAX:
23682 case ISD::ATOMIC_LOAD: {
23683 // Delegate to generic TypeLegalization. Situations we can really handle
23684 // should have already been dealt with by AtomicExpandPass.cpp.
23687 case ISD::BITCAST: {
23688 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23689 EVT DstVT = N->getValueType(0);
23690 EVT SrcVT = N->getOperand(0)->getValueType(0);
23692 if (SrcVT != MVT::f64 ||
23693 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
23696 unsigned NumElts = DstVT.getVectorNumElements();
23697 EVT SVT = DstVT.getVectorElementType();
23698 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23699 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
23700 MVT::v2f64, N->getOperand(0));
23701 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
23703 if (ExperimentalVectorWideningLegalization) {
23704 // If we are legalizing vectors by widening, we already have the desired
23705 // legal vector type, just return it.
23706 Results.push_back(ToVecInt);
23710 SmallVector<SDValue, 8> Elts;
23711 for (unsigned i = 0, e = NumElts; i != e; ++i)
23712 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
23713 ToVecInt, DAG.getIntPtrConstant(i, dl)));
23715 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
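/// Return the printable name of the given target-specific DAG node opcode.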
23720 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
23721 switch ((X86ISD::NodeType)Opcode) {
23722 case X86ISD::FIRST_NUMBER: break;
23723 case X86ISD::BSF: return "X86ISD::BSF";
23724 case X86ISD::BSR: return "X86ISD::BSR";
23725 case X86ISD::SHLD: return "X86ISD::SHLD";
23726 case X86ISD::SHRD: return "X86ISD::SHRD";
23727 case X86ISD::FAND: return "X86ISD::FAND";
23728 case X86ISD::FANDN: return "X86ISD::FANDN";
23729 case X86ISD::FOR: return "X86ISD::FOR";
23730 case X86ISD::FXOR: return "X86ISD::FXOR";
23731 case X86ISD::FILD: return "X86ISD::FILD";
23732 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
23733 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
23734 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
23735 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
23736 case X86ISD::FLD: return "X86ISD::FLD";
23737 case X86ISD::FST: return "X86ISD::FST";
23738 case X86ISD::CALL: return "X86ISD::CALL";
23739 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
23740 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
23741 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
23742 case X86ISD::BT: return "X86ISD::BT";
23743 case X86ISD::CMP: return "X86ISD::CMP";
23744 case X86ISD::COMI: return "X86ISD::COMI";
23745 case X86ISD::UCOMI: return "X86ISD::UCOMI";
23746 case X86ISD::CMPM: return "X86ISD::CMPM";
23747 case X86ISD::CMPMU: return "X86ISD::CMPMU";
23748 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
23749 case X86ISD::SETCC: return "X86ISD::SETCC";
23750 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
23751 case X86ISD::FSETCC: return "X86ISD::FSETCC";
23752 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
23753 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
23754 case X86ISD::CMOV: return "X86ISD::CMOV";
23755 case X86ISD::BRCOND: return "X86ISD::BRCOND";
23756 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
23757 case X86ISD::IRET: return "X86ISD::IRET";
23758 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
23759 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
23760 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
23761 case X86ISD::Wrapper: return "X86ISD::Wrapper";
23762 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
23763 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
23764 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
23765 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
23766 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
23767 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
23768 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
23769 case X86ISD::PINSRB: return "X86ISD::PINSRB";
23770 case X86ISD::PINSRW: return "X86ISD::PINSRW";
23771 case X86ISD::MMX_PINSRW: return "X86ISD::MMX_PINSRW";
23772 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
23773 case X86ISD::ANDNP: return "X86ISD::ANDNP";
23774 case X86ISD::BLENDI: return "X86ISD::BLENDI";
23775 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
23776 case X86ISD::ADDUS: return "X86ISD::ADDUS";
23777 case X86ISD::SUBUS: return "X86ISD::SUBUS";
23778 case X86ISD::HADD: return "X86ISD::HADD";
23779 case X86ISD::HSUB: return "X86ISD::HSUB";
23780 case X86ISD::FHADD: return "X86ISD::FHADD";
23781 case X86ISD::FHSUB: return "X86ISD::FHSUB";
23782 case X86ISD::ABS: return "X86ISD::ABS";
23783 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
23784 case X86ISD::FMAX: return "X86ISD::FMAX";
23785 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
23786 case X86ISD::FMIN: return "X86ISD::FMIN";
23787 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
23788 case X86ISD::FMAXC: return "X86ISD::FMAXC";
23789 case X86ISD::FMINC: return "X86ISD::FMINC";
23790 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
23791 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
23792 case X86ISD::FRCP: return "X86ISD::FRCP";
23793 case X86ISD::FRCPS: return "X86ISD::FRCPS";
23794 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
23795 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
23796 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
23797 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
23798 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
23799 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
23800 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
23801 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
23802 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
23803 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
23804 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
23805 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
23806 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
23807 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
23808 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
23809 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
23810 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
23811 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
23812 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
23813 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
23814 case X86ISD::LADD: return "X86ISD::LADD";
23815 case X86ISD::LSUB: return "X86ISD::LSUB";
23816 case X86ISD::LOR: return "X86ISD::LOR";
23817 case X86ISD::LXOR: return "X86ISD::LXOR";
23818 case X86ISD::LAND: return "X86ISD::LAND";
23819 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
23820 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
23821 case X86ISD::VZEXT: return "X86ISD::VZEXT";
23822 case X86ISD::VSEXT: return "X86ISD::VSEXT";
23823 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
23824 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
23825 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
23826 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
23827 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
23828 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
23829 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
23830 case X86ISD::VINSERT: return "X86ISD::VINSERT";
23831 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
23832 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
23833 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
23834 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
23835 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
23836 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
23837 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
23838 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
23839 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
23840 case X86ISD::VSHL: return "X86ISD::VSHL";
23841 case X86ISD::VSRL: return "X86ISD::VSRL";
23842 case X86ISD::VSRA: return "X86ISD::VSRA";
23843 case X86ISD::VSHLI: return "X86ISD::VSHLI";
23844 case X86ISD::VSRLI: return "X86ISD::VSRLI";
23845 case X86ISD::VSRAI: return "X86ISD::VSRAI";
23846 case X86ISD::VSRAV: return "X86ISD::VSRAV";
23847 case X86ISD::VROTLI: return "X86ISD::VROTLI";
23848 case X86ISD::VROTRI: return "X86ISD::VROTRI";
23849 case X86ISD::VPPERM: return "X86ISD::VPPERM";
23850 case X86ISD::CMPP: return "X86ISD::CMPP";
23851 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
23852 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
23853 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
23854 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
23855 case X86ISD::ADD: return "X86ISD::ADD";
23856 case X86ISD::SUB: return "X86ISD::SUB";
23857 case X86ISD::ADC: return "X86ISD::ADC";
23858 case X86ISD::SBB: return "X86ISD::SBB";
23859 case X86ISD::SMUL: return "X86ISD::SMUL";
23860 case X86ISD::UMUL: return "X86ISD::UMUL";
23861 case X86ISD::SMUL8: return "X86ISD::SMUL8";
23862 case X86ISD::UMUL8: return "X86ISD::UMUL8";
23863 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
23864 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
23865 case X86ISD::INC: return "X86ISD::INC";
23866 case X86ISD::DEC: return "X86ISD::DEC";
23867 case X86ISD::OR: return "X86ISD::OR";
23868 case X86ISD::XOR: return "X86ISD::XOR";
23869 case X86ISD::AND: return "X86ISD::AND";
23870 case X86ISD::BEXTR: return "X86ISD::BEXTR";
23871 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
23872 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
23873 case X86ISD::PTEST: return "X86ISD::PTEST";
23874 case X86ISD::TESTP: return "X86ISD::TESTP";
23875 case X86ISD::TESTM: return "X86ISD::TESTM";
23876 case X86ISD::TESTNM: return "X86ISD::TESTNM";
23877 case X86ISD::KORTEST: return "X86ISD::KORTEST";
23878 case X86ISD::KTEST: return "X86ISD::KTEST";
23879 case X86ISD::PACKSS: return "X86ISD::PACKSS";
23880 case X86ISD::PACKUS: return "X86ISD::PACKUS";
23881 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
23882 case X86ISD::VALIGN: return "X86ISD::VALIGN";
23883 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
23884 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
23885 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
23886 case X86ISD::SHUFP: return "X86ISD::SHUFP";
23887 case X86ISD::SHUF128: return "X86ISD::SHUF128";
23888 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
23889 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
23890 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
23891 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
23892 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
23893 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
23894 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
23895 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
23896 case X86ISD::MOVSD: return "X86ISD::MOVSD";
23897 case X86ISD::MOVSS: return "X86ISD::MOVSS";
23898 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
23899 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
23900 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
23901 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
23902 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
23903 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
23904 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
23905 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
23906 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
23907 case X86ISD::VPERMV: return "X86ISD::VPERMV";
23908 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
23909 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
23910 case X86ISD::VPERMI: return "X86ISD::VPERMI";
23911 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
23912 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
23913 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
23914 case X86ISD::VRANGE: return "X86ISD::VRANGE";
23915 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
23916 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
23917 case X86ISD::PSADBW: return "X86ISD::PSADBW";
23918 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
23919 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
23920 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
23921 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
23922 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
23923 case X86ISD::MFENCE: return "X86ISD::MFENCE";
23924 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
23925 case X86ISD::SAHF: return "X86ISD::SAHF";
23926 case X86ISD::RDRAND: return "X86ISD::RDRAND";
23927 case X86ISD::RDSEED: return "X86ISD::RDSEED";
23928 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
23929 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
23930 case X86ISD::VPROT: return "X86ISD::VPROT";
23931 case X86ISD::VPROTI: return "X86ISD::VPROTI";
23932 case X86ISD::VPSHA: return "X86ISD::VPSHA";
23933 case X86ISD::VPSHL: return "X86ISD::VPSHL";
23934 case X86ISD::VPCOM: return "X86ISD::VPCOM";
23935 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
23936 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
23937 case X86ISD::FMADD: return "X86ISD::FMADD";
23938 case X86ISD::FMSUB: return "X86ISD::FMSUB";
23939 case X86ISD::FNMADD: return "X86ISD::FNMADD";
23940 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
23941 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
23942 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
23943 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
23944 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
23945 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
23946 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
23947 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
23948 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
23949 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
23950 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
23951 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
23952 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
23953 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
23954 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
23955 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
23956 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
23957 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
23958 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
23959 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
23960 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
23961 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
23962 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
23963 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
23964 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
23965 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
23966 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
23967 case X86ISD::XTEST: return "X86ISD::XTEST";
23968 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
23969 case X86ISD::EXPAND: return "X86ISD::EXPAND";
23970 case X86ISD::SELECT: return "X86ISD::SELECT";
23971 case X86ISD::SELECTS: return "X86ISD::SELECTS";
23972 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
23973 case X86ISD::RCP28: return "X86ISD::RCP28";
23974 case X86ISD::RCP28S: return "X86ISD::RCP28S";
23975 case X86ISD::EXP2: return "X86ISD::EXP2";
23976 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
23977 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
23978 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
23979 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
23980 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
23981 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
23982 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
23983 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
23984 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
23985 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
23986 case X86ISD::SCALEF: return "X86ISD::SCALEF";
23987 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
23988 case X86ISD::ADDS: return "X86ISD::ADDS";
23989 case X86ISD::SUBS: return "X86ISD::SUBS";
23990 case X86ISD::AVG: return "X86ISD::AVG";
23991 case X86ISD::MULHRS: return "X86ISD::MULHRS";
23992 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
23993 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
23994 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
23995 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
23996 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
23997 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
23998 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
23999 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24000 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24001 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24002 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24003 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24004 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24005 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24006 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24007 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24008 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24009 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24010 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24011 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24012 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24013 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24014 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24019 /// Return true if the addressing mode represented by AM is legal for this
24020 /// target, for a load/store of the specified type.
24021 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24022 const AddrMode &AM, Type *Ty,
24023 unsigned AS) const {
24024 // X86 supports extremely general addressing modes.
24025 CodeModel::Model M = getTargetMachine().getCodeModel();
24027 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24028 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24032 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24034 // If a reference to this global requires an extra load, we can't fold it.
24035 if (isGlobalStubReference(GVFlags))
24038 // If BaseGV requires a register for the PIC base, we cannot also have a
24039 // BaseReg specified.
24040 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24043 // If lower 4G is not available, then we must use rip-relative addressing.
24044 if ((M != CodeModel::Small || isPositionIndependent()) &&
24045 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24049 switch (AM.Scale) {
24055 // These scales always work.
24060 // These scales are formed with basereg+scalereg. Only accept if there is
24065 default: // Other stuff never works.
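/// Return true if shifting a vector by a single scalar amount is meaningfully
/// cheaper on this subtarget than shifting by a fully general vector of
/// amounts.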
24072 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24073 unsigned Bits = Ty->getScalarSizeInBits();
24075 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24076 // particularly cheaper than those without.
24080 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24081 // variable shifts just as cheap as scalar ones.
24082 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24085 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24086 // fully general vector.
24090 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24091 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24093 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24094 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24095 return NumBits1 > NumBits2;
24098 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24099 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24102 if (!isTypeLegal(EVT::getEVT(Ty1)))
24105 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24107 // Assuming the caller doesn't have a zeroext or signext return parameter,
24108 // truncation all the way down to i1 is valid.
24112 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24113 return isInt<32>(Imm);
24116 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24117 // Can also use sub to handle negated immediates.
24118 return isInt<32>(Imm);
24121 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24122 if (!VT1.isInteger() || !VT2.isInteger())
24124 unsigned NumBits1 = VT1.getSizeInBits();
24125 unsigned NumBits2 = VT2.getSizeInBits();
24126 return NumBits1 > NumBits2;
24129 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24130 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24131 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
24134 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24135 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24136 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24139 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24140 EVT VT1 = Val.getValueType();
24141 if (isZExtFree(VT1, VT2))
24144 if (Val.getOpcode() != ISD::LOAD)
24147 if (!VT1.isSimple() || !VT1.isInteger() ||
24148 !VT2.isSimple() || !VT2.isInteger())
24151 switch (VT1.getSimpleVT().SimpleTy) {
24156 // X86 has 8, 16, and 32-bit zero-extending loads.
24163 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24166 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24167 if (!Subtarget.hasAnyFMA())
24170 VT = VT.getScalarType();
24172 if (!VT.isSimple())
24175 switch (VT.getSimpleVT().SimpleTy) {
24186 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24187 // i16 instructions are longer (0x66 prefix) and potentially slower.
24188 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
24191 /// Targets can use this to indicate that they only support *some*
24192 /// VECTOR_SHUFFLE operations, those with specific masks.
24193 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24194 /// are assumed to be legal.
24196 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24198 if (!VT.isSimple())
24201 // Not for i1 vectors
24202 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24205 // Very little shuffling can be done for 64-bit vectors right now.
24206 if (VT.getSimpleVT().getSizeInBits() == 64)
24209 // We only care that the types being shuffled are legal. The lowering can
24210 // handle any possible shuffle mask that results.
24211 return isTypeLegal(VT.getSimpleVT());
24215 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24218 // Just delegate to the generic legality; clear masks aren't special.
24218 return isShuffleMaskLegal(Mask, VT);
24221 //===----------------------------------------------------------------------===//
24222 // X86 Scheduler Hooks
24223 //===----------------------------------------------------------------------===//
24225 /// Utility function to emit xbegin specifying the start of an RTM region.
24226 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
24227 const TargetInstrInfo *TII) {
24228 DebugLoc DL = MI.getDebugLoc();
24230 const BasicBlock *BB = MBB->getBasicBlock();
24231 MachineFunction::iterator I = ++MBB->getIterator();
24233 // For the v = xbegin(), we generate
24244 MachineBasicBlock *thisMBB = MBB;
24245 MachineFunction *MF = MBB->getParent();
24246 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
24247 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
24248 MF->insert(I, mainMBB);
24249 MF->insert(I, sinkMBB);
24251 // Transfer the remainder of BB and its successor edges to sinkMBB.
24252 sinkMBB->splice(sinkMBB->begin(), MBB,
24253 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24254 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
24256 // thisMBB:
24257 //  xbegin sinkMBB
24258 //  # fallthrough to mainMBB
24259 //  # abort path branches to sinkMBB
24260 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
24261 thisMBB->addSuccessor(mainMBB);
24262 thisMBB->addSuccessor(sinkMBB);
24266 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
24267 mainMBB->addSuccessor(sinkMBB);
24270 // EAX is live into the sinkMBB
24271 sinkMBB->addLiveIn(X86::EAX);
24272 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(TargetOpcode::COPY),
24273 MI.getOperand(0).getReg())
24274 .addReg(X86::EAX);
24276 MI.eraseFromParent();
24277 return sinkMBB;
24278 }
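// Editorial note on the expansion above (not in the original source): XBEGIN
// falls through into mainMBB when the transaction starts, where EAX is set to
// -1 (the "transaction started" value returned by _xbegin()); on an abort the
// processor resumes at the XBEGIN fallback target (sinkMBB) with the abort
// status already in EAX, and sinkMBB simply copies EAX into the result vreg.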
24280 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
24281 // or XMM0_V32I8 in AVX, all of this code can be replaced with that
24282 // in the .td file.
24283 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
24284 const TargetInstrInfo *TII) {
24285 unsigned Opc;
24286 switch (MI.getOpcode()) {
24287 default: llvm_unreachable("illegal opcode!");
24288 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
24289 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
24290 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
24291 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
24292 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
24293 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
24294 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
24295 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
24298 DebugLoc dl = MI.getDebugLoc();
24299 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24301 unsigned NumArgs = MI.getNumOperands();
24302 for (unsigned i = 1; i < NumArgs; ++i) {
24303 MachineOperand &Op = MI.getOperand(i);
24304 if (!(Op.isReg() && Op.isImplicit()))
24305 MIB.addOperand(Op);
24307 if (MI.hasOneMemOperand())
24308 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24310 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24311 .addReg(X86::XMM0);
24313 MI.eraseFromParent();
24317 // FIXME: Custom handling because TableGen doesn't support multiple implicit
24318 // defs in an instruction pattern
24319 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
24320 const TargetInstrInfo *TII) {
24321 unsigned Opc;
24322 switch (MI.getOpcode()) {
24323 default: llvm_unreachable("illegal opcode!");
24324 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
24325 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
24326 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
24327 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
24328 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
24329 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
24330 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
24331 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
24334 DebugLoc dl = MI.getDebugLoc();
24335 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
24337 unsigned NumArgs = MI.getNumOperands(); // remove the results
24338 for (unsigned i = 1; i < NumArgs; ++i) {
24339 MachineOperand &Op = MI.getOperand(i);
24340 if (!(Op.isReg() && Op.isImplicit()))
24341 MIB.addOperand(Op);
24343 if (MI.hasOneMemOperand())
24344 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
24346 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24349 MI.eraseFromParent();
24353 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24354 const X86Subtarget &Subtarget) {
24355 DebugLoc dl = MI.getDebugLoc();
24356 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24358 // insert input VAL into EAX
24359 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
24360 .addReg(MI.getOperand(0).getReg());
24361 // insert zero to ECX
24362 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24364 // insert zero to EDX
24365 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
24367 // insert WRPKRU instruction
24368 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
24370 MI.eraseFromParent(); // The pseudo is gone now.
24374 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
24375 const X86Subtarget &Subtarget) {
24376 DebugLoc dl = MI.getDebugLoc();
24377 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24379 // insert zero to ECX
24380 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
24382 // insert RDPKRU instruction
24383 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
24384 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
24387 MI.eraseFromParent(); // The pseudo is gone now.
24391 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
24392 const X86Subtarget &Subtarget,
24394 DebugLoc dl = MI.getDebugLoc();
24395 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24396 // Address into RAX/EAX, other two args into ECX, EDX.
24397 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
24398 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
24399 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
24400 for (int i = 0; i < X86::AddrNumOperands; ++i)
24401 MIB.addOperand(MI.getOperand(i));
24403 unsigned ValOps = X86::AddrNumOperands;
24404 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
24405 .addReg(MI.getOperand(ValOps).getReg());
24406 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
24407 .addReg(MI.getOperand(ValOps + 1).getReg());
24409 // The instruction doesn't actually take any operands though.
24410 BuildMI(*BB, MI, dl, TII->get(Opc));
24412 MI.eraseFromParent(); // The pseudo is gone now.
24416 MachineBasicBlock *
24417 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
24418 MachineBasicBlock *MBB) const {
24419 // Emit va_arg instruction on X86-64.
24421 // Operands to this pseudo-instruction:
24422 // 0 ) Output : destination address (reg)
24423 // 1-5) Input : va_list address (addr, i64mem)
24424 // 6 ) ArgSize : Size (in bytes) of vararg type
24425 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
24426 // 8 ) Align : Alignment of type
24427 // 9 ) EFLAGS (implicit-def)
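// Editorial example (not part of the original operand list):
//   int x = va_arg(ap, int);       -> ArgSize = 4, ArgMode = 1, Align = 4
//   double d = va_arg(ap, double); -> ArgSize = 8, ArgMode = 2, Align = 8
// while an x86 long double, which is never passed in registers, uses the
// overflow-only path (ArgMode = 0).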
24429 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
24430 static_assert(X86::AddrNumOperands == 5,
24431 "VAARG_64 assumes 5 address operands");
24433 unsigned DestReg = MI.getOperand(0).getReg();
24434 MachineOperand &Base = MI.getOperand(1);
24435 MachineOperand &Scale = MI.getOperand(2);
24436 MachineOperand &Index = MI.getOperand(3);
24437 MachineOperand &Disp = MI.getOperand(4);
24438 MachineOperand &Segment = MI.getOperand(5);
24439 unsigned ArgSize = MI.getOperand(6).getImm();
24440 unsigned ArgMode = MI.getOperand(7).getImm();
24441 unsigned Align = MI.getOperand(8).getImm();
24443 // Memory Reference
24444 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
24445 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
24446 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
24448 // Machine Information
24449 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24450 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
24451 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
24452 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
24453 DebugLoc DL = MI.getDebugLoc();
24455 // struct va_list {
24456 //   i32   gp_offset
24457 //   i32   fp_offset
24458 //   i64   overflow_area (address)
24459 //   i64   reg_save_area (address)
24460 // }
24461 // sizeof(va_list) = 24
24462 // alignment(va_list) = 8
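// Editorial note (not in the original comment): this is the SysV x86-64 ABI
// va_list record, i.e. roughly
//   struct { unsigned gp_offset, fp_offset; void *overflow_arg_area, *reg_save_area; };
// gp_offset/fp_offset index into reg_save_area, while the overflow pointer
// walks the caller's stack argument area.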
24464 unsigned TotalNumIntRegs = 6;
24465 unsigned TotalNumXMMRegs = 8;
24466 bool UseGPOffset = (ArgMode == 1);
24467 bool UseFPOffset = (ArgMode == 2);
24468 unsigned MaxOffset = TotalNumIntRegs * 8 +
24469 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
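// Editorial note: concretely MaxOffset is 6*8 = 48 when pulling from the GP
// portion of the register save area, and 6*8 + 8*16 = 176 when pulling from
// the XMM portion, matching the SysV gp_offset/fp_offset limits.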
24471 /* Align ArgSize to a multiple of 8 */
24472 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
24473 bool NeedsAlign = (Align > 8);
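// Editorial example (illustrative): ArgSize = 12 rounds up to ArgSizeA8 = 16;
// NeedsAlign is only set for over-aligned types (alignment > 8), which take
// the extra overflow-area alignment code emitted further down.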
24475 MachineBasicBlock *thisMBB = MBB;
24476 MachineBasicBlock *overflowMBB;
24477 MachineBasicBlock *offsetMBB;
24478 MachineBasicBlock *endMBB;
24480 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
24481 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
24482 unsigned OffsetReg = 0;
24484 if (!UseGPOffset && !UseFPOffset) {
24485 // If we only pull from the overflow region, we don't create a branch.
24486 // We don't need to alter control flow.
24487 OffsetDestReg = 0; // unused
24488 OverflowDestReg = DestReg;
24490 offsetMBB = nullptr;
24491 overflowMBB = thisMBB;
24494 // First emit code to check if gp_offset (or fp_offset) is below the bound.
24495 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
24496 // If not, pull from overflow_area. (branch to overflowMBB)
24501 // offsetMBB overflowMBB
24506 // Registers for the PHI in endMBB
24507 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
24508 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
24510 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24511 MachineFunction *MF = MBB->getParent();
24512 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24513 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24514 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
24516 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24518 // Insert the new basic blocks
24519 MF->insert(MBBIter, offsetMBB);
24520 MF->insert(MBBIter, overflowMBB);
24521 MF->insert(MBBIter, endMBB);
24523 // Transfer the remainder of MBB and its successor edges to endMBB.
24524 endMBB->splice(endMBB->begin(), thisMBB,
24525 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
24526 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
24528 // Make offsetMBB and overflowMBB successors of thisMBB
24529 thisMBB->addSuccessor(offsetMBB);
24530 thisMBB->addSuccessor(overflowMBB);
24532 // endMBB is a successor of both offsetMBB and overflowMBB
24533 offsetMBB->addSuccessor(endMBB);
24534 overflowMBB->addSuccessor(endMBB);
24536 // Load the offset value into a register
24537 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24538 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
24542 .addDisp(Disp, UseFPOffset ? 4 : 0)
24543 .addOperand(Segment)
24544 .setMemRefs(MMOBegin, MMOEnd);
24546 // Check if there is enough room left to pull this argument.
24547 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
24549 .addImm(MaxOffset + 8 - ArgSizeA8);
24551 // Branch to "overflowMBB" if offset >= max
24552 // Fall through to "offsetMBB" otherwise
24553 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
24554 .addMBB(overflowMBB);
24557 // In offsetMBB, emit code to use the reg_save_area.
24559 assert(OffsetReg != 0);
24561 // Read the reg_save_area address.
24562 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
24563 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
24568 .addOperand(Segment)
24569 .setMemRefs(MMOBegin, MMOEnd);
24571 // Zero-extend the offset
24572 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
24573 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
24576 .addImm(X86::sub_32bit);
24578 // Add the offset to the reg_save_area to get the final address.
24579 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
24580 .addReg(OffsetReg64)
24581 .addReg(RegSaveReg);
24583 // Compute the offset for the next argument
24584 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
24585 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
24587 .addImm(UseFPOffset ? 16 : 8);
24589 // Store it back into the va_list.
24590 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
24594 .addDisp(Disp, UseFPOffset ? 4 : 0)
24595 .addOperand(Segment)
24596 .addReg(NextOffsetReg)
24597 .setMemRefs(MMOBegin, MMOEnd);
24600 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
24605 // Emit code to use overflow area
24608 // Load the overflow_area address into a register.
24609 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
24610 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
24615 .addOperand(Segment)
24616 .setMemRefs(MMOBegin, MMOEnd);
24618 // If we need to align it, do so. Otherwise, just copy the address
24619 // to OverflowDestReg.
24620 if (NeedsAlign) {
24621 // Align the overflow address
24622 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
24623 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
24625 // aligned_addr = (addr + (align-1)) & ~(align-1)
24626 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
24627 .addReg(OverflowAddrReg)
24628 .addImm(Align-1);
24630 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
24631 .addReg(TmpReg)
24632 .addImm(~(uint64_t)(Align-1));
24633 } else {
24634 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
24635 .addReg(OverflowAddrReg);
24636 }
24638 // Compute the next overflow address after this argument.
24639 // (the overflow address should be kept 8-byte aligned)
24640 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
24641 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
24642 .addReg(OverflowDestReg)
24643 .addImm(ArgSizeA8);
24645 // Store the new overflow address.
24646 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
24651 .addOperand(Segment)
24652 .addReg(NextAddrReg)
24653 .setMemRefs(MMOBegin, MMOEnd);
24655 // If we branched, emit the PHI to the front of endMBB.
24657 BuildMI(*endMBB, endMBB->begin(), DL,
24658 TII->get(X86::PHI), DestReg)
24659 .addReg(OffsetDestReg).addMBB(offsetMBB)
24660 .addReg(OverflowDestReg).addMBB(overflowMBB);
24663 // Erase the pseudo instruction
24664 MI.eraseFromParent();
24669 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
24670 MachineInstr &MI, MachineBasicBlock *MBB) const {
24671 // Emit code to save XMM registers to the stack. The ABI says that the
24672 // number of registers to save is given in %al, so it's theoretically
24673 // possible to do an indirect jump trick to avoid saving all of them;
24674 // however, this code takes a simpler approach and just executes all
24675 // of the stores if %al is non-zero. It's less code, and it's probably
24676 // easier on the hardware branch predictor, and stores aren't all that
24677 // expensive anyway.
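// Editorial sketch (assumes the non-Win64 path below): the emitted code is
// roughly
//   testb %al, %al          ; %al = number of XMM registers actually used
//   je    <EndMBB>
//   movaps %xmm0, (reg save slot + VarArgsFPOffset + 0)
//   movaps %xmm1, (reg save slot + VarArgsFPOffset + 16)
//   ...                     ; one store per vararg XMM argument register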
24679 // Create the new basic blocks. One block contains all the XMM stores,
24680 // and one block is the final destination regardless of whether any
24681 // stores were performed.
24682 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
24683 MachineFunction *F = MBB->getParent();
24684 MachineFunction::iterator MBBIter = ++MBB->getIterator();
24685 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
24686 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
24687 F->insert(MBBIter, XMMSaveMBB);
24688 F->insert(MBBIter, EndMBB);
24690 // Transfer the remainder of MBB and its successor edges to EndMBB.
24691 EndMBB->splice(EndMBB->begin(), MBB,
24692 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
24693 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
24695 // The original block will now fall through to the XMM save block.
24696 MBB->addSuccessor(XMMSaveMBB);
24697 // The XMMSaveMBB will fall through to the end block.
24698 XMMSaveMBB->addSuccessor(EndMBB);
24700 // Now add the instructions.
24701 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24702 DebugLoc DL = MI.getDebugLoc();
24704 unsigned CountReg = MI.getOperand(0).getReg();
24705 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
24706 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
24708 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
24709 // If %al is 0, branch around the XMM save block.
24710 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
24711 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
24712 MBB->addSuccessor(EndMBB);
24715 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
24716 // that was just emitted, but clearly shouldn't be "saved".
24717 assert((MI.getNumOperands() <= 3 ||
24718 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
24719 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
24720 "Expected last argument to be EFLAGS");
24721 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
24722 // In the XMM save block, save all the XMM argument registers.
24723 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
24724 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
24725 MachineMemOperand *MMO = F->getMachineMemOperand(
24726 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
24727 MachineMemOperand::MOStore,
24728 /*Size=*/16, /*Align=*/16);
24729 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
24730 .addFrameIndex(RegSaveFrameIndex)
24731 .addImm(/*Scale=*/1)
24732 .addReg(/*IndexReg=*/0)
24733 .addImm(/*Disp=*/Offset)
24734 .addReg(/*Segment=*/0)
24735 .addReg(MI.getOperand(i).getReg())
24736 .addMemOperand(MMO);
24739 MI.eraseFromParent(); // The pseudo instruction is gone now.
24744 // The EFLAGS operand of SelectItr might be missing a kill marker
24745 // because there were multiple uses of EFLAGS, and ISel didn't know
24746 // which to mark. Figure out whether SelectItr should have had a
24747 // kill marker, and set it if it should. Returns the correct kill
24748 // marker value.
24749 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
24750 MachineBasicBlock* BB,
24751 const TargetRegisterInfo* TRI) {
24752 // Scan forward through BB for a use/def of EFLAGS.
24753 MachineBasicBlock::iterator miI(std::next(SelectItr));
24754 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
24755 const MachineInstr& mi = *miI;
24756 if (mi.readsRegister(X86::EFLAGS))
24757 return false;
24758 if (mi.definesRegister(X86::EFLAGS))
24759 break; // Should have kill-flag - update below.
24762 // If we hit the end of the block, check whether EFLAGS is live into a
24763 // successor.
24764 if (miI == BB->end()) {
24765 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
24766 sEnd = BB->succ_end();
24767 sItr != sEnd; ++sItr) {
24768 MachineBasicBlock* succ = *sItr;
24769 if (succ->isLiveIn(X86::EFLAGS))
24770 return false;
24771 }
24772 }
24774 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
24775 // out. SelectMI should have a kill flag on EFLAGS.
24776 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
24777 return true;
24778 }
24780 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
24781 // together with other CMOV pseudo-opcodes into a single basic-block with
24782 // conditional jump around it.
24783 static bool isCMOVPseudo(MachineInstr &MI) {
24784 switch (MI.getOpcode()) {
24785 case X86::CMOV_FR32:
24786 case X86::CMOV_FR64:
24787 case X86::CMOV_GR8:
24788 case X86::CMOV_GR16:
24789 case X86::CMOV_GR32:
24790 case X86::CMOV_RFP32:
24791 case X86::CMOV_RFP64:
24792 case X86::CMOV_RFP80:
24793 case X86::CMOV_V2F64:
24794 case X86::CMOV_V2I64:
24795 case X86::CMOV_V4F32:
24796 case X86::CMOV_V4F64:
24797 case X86::CMOV_V4I64:
24798 case X86::CMOV_V16F32:
24799 case X86::CMOV_V8F32:
24800 case X86::CMOV_V8F64:
24801 case X86::CMOV_V8I64:
24802 case X86::CMOV_V8I1:
24803 case X86::CMOV_V16I1:
24804 case X86::CMOV_V32I1:
24805 case X86::CMOV_V64I1:
24806 return true;
24808 default:
24809 return false;
24810 }
24811 }
24813 MachineBasicBlock *
24814 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
24815 MachineBasicBlock *BB) const {
24816 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
24817 DebugLoc DL = MI.getDebugLoc();
24819 // To "insert" a SELECT_CC instruction, we actually have to insert the
24820 // diamond control-flow pattern. The incoming instruction knows the
24821 // destination vreg to set, the condition code register to branch on, the
24822 // true/false values to select between, and a branch opcode to use.
24823 const BasicBlock *LLVM_BB = BB->getBasicBlock();
24824 MachineFunction::iterator It = ++BB->getIterator();
24829 // cmpTY ccX, r1, r2
24831 // fallthrough --> copy0MBB
24832 MachineBasicBlock *thisMBB = BB;
24833 MachineFunction *F = BB->getParent();
24835 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
24836 // as described above, by inserting a BB, and then making a PHI at the join
24837 // point to select the true and false operands of the CMOV in the PHI.
24839 // The code also handles two different cases of multiple CMOV opcodes
24843 // In this case, there are multiple CMOVs in a row, all of which are based on
24844 // the same condition setting (or the exact opposite condition setting).
24845 // In this case we can lower all the CMOVs using a single inserted BB, and
24846 // then make a number of PHIs at the join point to model the CMOVs. The only
24847 // trickiness here is that in a case like:
24849 // t2 = CMOV cond1 t1, f1
24850 // t3 = CMOV cond1 t2, f2
24852 // when rewriting this into PHIs, we have to perform some renaming on the
24853 // temps since you cannot have a PHI operand refer to a PHI result earlier
24854 // in the same block. The "simple" but wrong lowering would be:
24856 // t2 = PHI t1(BB1), f1(BB2)
24857 // t3 = PHI t2(BB1), f2(BB2)
24859 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
24860 // renaming is to note that on the path through BB1, t2 is really just a
24861 // copy of t1, and do that renaming, properly generating:
24863 // t2 = PHI t1(BB1), f1(BB2)
24864 // t3 = PHI t1(BB1), f2(BB2)
24866 // Case 2, we lower cascaded CMOVs such as
24868 // (CMOV (CMOV F, T, cc1), T, cc2)
24870 // to two successive branches. For that, we look for another CMOV as the
24871 // following instruction.
24873 // Without this, we would add a PHI between the two jumps, which ends up
24874 // creating a few copies all around. For instance, for
24876 // (sitofp (zext (fcmp une)))
24878 // we would generate:
24880 // ucomiss %xmm1, %xmm0
24881 // movss <1.0f>, %xmm0
24882 // movaps %xmm0, %xmm1
24884 // xorps %xmm1, %xmm1
24887 // movaps %xmm1, %xmm0
24891 // because this custom-inserter would have generated:
24903 // A: X = ...; Y = ...
24905 // C: Z = PHI [X, A], [Y, B]
24907 // E: PHI [X, C], [Z, D]
24909 // If we lower both CMOVs in a single step, we can instead generate:
24921 // A: X = ...; Y = ...
24923 // E: PHI [X, A], [X, C], [Y, D]
24925 // Which, in our sitofp/fcmp example, gives us something like:
24927 // ucomiss %xmm1, %xmm0
24928 // movss <1.0f>, %xmm0
24931 // xorps %xmm0, %xmm0
24935 MachineInstr *CascadedCMOV = nullptr;
24936 MachineInstr *LastCMOV = &MI;
24937 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
24938 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
24939 MachineBasicBlock::iterator NextMIIt =
24940 std::next(MachineBasicBlock::iterator(MI));
24942 // Check for case 1, where there are multiple CMOVs with the same condition
24943 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
24944 // number of jumps the most.
24946 if (isCMOVPseudo(MI)) {
24947 // See if we have a string of CMOVS with the same condition.
24948 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
24949 (NextMIIt->getOperand(3).getImm() == CC ||
24950 NextMIIt->getOperand(3).getImm() == OppCC)) {
24951 LastCMOV = &*NextMIIt;
24956 // This checks for case 2, but only do this if we didn't already find
24957 // case 1, as indicated by LastCMOV == MI.
24958 if (LastCMOV == &MI && NextMIIt != BB->end() &&
24959 NextMIIt->getOpcode() == MI.getOpcode() &&
24960 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
24961 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
24962 NextMIIt->getOperand(1).isKill()) {
24963 CascadedCMOV = &*NextMIIt;
24966 MachineBasicBlock *jcc1MBB = nullptr;
24968 // If we have a cascaded CMOV, we lower it to two successive branches to
24969 // the same block. EFLAGS is used by both, so mark it as live in the second.
24970 if (CascadedCMOV) {
24971 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
24972 F->insert(It, jcc1MBB);
24973 jcc1MBB->addLiveIn(X86::EFLAGS);
24976 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
24977 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
24978 F->insert(It, copy0MBB);
24979 F->insert(It, sinkMBB);
24981 // If the EFLAGS register isn't dead in the terminator, then claim that it's
24982 // live into the sink and copy blocks.
24983 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
24985 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
24986 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
24987 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
24988 copy0MBB->addLiveIn(X86::EFLAGS);
24989 sinkMBB->addLiveIn(X86::EFLAGS);
24992 // Transfer the remainder of BB and its successor edges to sinkMBB.
24993 sinkMBB->splice(sinkMBB->begin(), BB,
24994 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
24995 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
24997 // Add the true and fallthrough blocks as its successors.
24998 if (CascadedCMOV) {
24999 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25000 BB->addSuccessor(jcc1MBB);
25002 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25003 // jump to the sinkMBB.
25004 jcc1MBB->addSuccessor(copy0MBB);
25005 jcc1MBB->addSuccessor(sinkMBB);
25007 BB->addSuccessor(copy0MBB);
25010 // The true block target of the first (or only) branch is always sinkMBB.
25011 BB->addSuccessor(sinkMBB);
25013 // Create the conditional branch instruction.
25014 unsigned Opc = X86::GetCondBranchFromCond(CC);
25015 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25017 if (CascadedCMOV) {
25018 unsigned Opc2 = X86::GetCondBranchFromCond(
25019 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25020 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25024 // %FalseValue = ...
25025 // # fallthrough to sinkMBB
25026 copy0MBB->addSuccessor(sinkMBB);
25029 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25031 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25032 MachineBasicBlock::iterator MIItEnd =
25033 std::next(MachineBasicBlock::iterator(LastCMOV));
25034 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25035 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25036 MachineInstrBuilder MIB;
25038 // As we are creating the PHIs, we have to be careful if there is more than
25039 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25040 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25041 // That also means that PHI construction must work forward from earlier to
25042 // later, and that the code must maintain a mapping from earlier PHI's
25043 // destination registers, and the registers that went into the PHI.
25045 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25046 unsigned DestReg = MIIt->getOperand(0).getReg();
25047 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25048 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25050 // If this CMOV we are generating is the opposite condition from
25051 // the jump we generated, then we have to swap the operands for the
25052 // PHI that is going to be generated.
25053 if (MIIt->getOperand(3).getImm() == OppCC)
25054 std::swap(Op1Reg, Op2Reg);
25056 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25057 Op1Reg = RegRewriteTable[Op1Reg].first;
25059 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25060 Op2Reg = RegRewriteTable[Op2Reg].second;
25062 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25063 TII->get(X86::PHI), DestReg)
25064 .addReg(Op1Reg).addMBB(copy0MBB)
25065 .addReg(Op2Reg).addMBB(thisMBB);
25067 // Add this PHI to the rewrite table.
25068 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25071 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25072 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25073 if (CascadedCMOV) {
25074 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25075 // Copy the PHI result to the register defined by the second CMOV.
25076 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25077 DL, TII->get(TargetOpcode::COPY),
25078 CascadedCMOV->getOperand(0).getReg())
25079 .addReg(MI.getOperand(0).getReg());
25080 CascadedCMOV->eraseFromParent();
25083 // Now remove the CMOV(s).
25084 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25085 (MIIt++)->eraseFromParent();
25090 MachineBasicBlock *
25091 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25092 MachineBasicBlock *BB) const {
25093 // Combine the following atomic floating-point modification pattern:
25094 // a.store(reg OP a.load(acquire), release)
25095 // Transform them into:
25096 // OPss (%gpr), %xmm
25097 // movss %xmm, (%gpr)
25098 // Or sd equivalent for 64-bit operations.
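// Editorial note (not in the original comment): no lock prefix or fence is
// needed for this expansion; on x86 every ordinary load already has acquire
// semantics and every ordinary store has release semantics, so the SSE
// load-op/store pair preserves the required ordering as long as the store
// itself is a single naturally aligned access.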
25099 unsigned FOp, MOp;
25100 switch (MI.getOpcode()) {
25101 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25102 case X86::RELEASE_FADD32mr:
25103 FOp = X86::ADDSSrm;
25104 MOp = X86::MOVSSmr;
25106 case X86::RELEASE_FADD64mr:
25107 FOp = X86::ADDSDrm;
25108 MOp = X86::MOVSDmr;
25111 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25112 DebugLoc DL = MI.getDebugLoc();
25113 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25114 unsigned ValOpIdx = X86::AddrNumOperands;
25115 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25116 MachineInstrBuilder MIB =
25117 BuildMI(*BB, MI, DL, TII->get(FOp),
25118 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25120 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25121 MachineOperand &Operand = MI.getOperand(i);
25122 // Clear any kill flags on register operands as we'll create a second
25123 // instruction using the same address operands.
25124 if (Operand.isReg())
25125 Operand.setIsKill(false);
25126 MIB.addOperand(Operand);
25128 MachineInstr *FOpMI = MIB;
25129 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25130 for (int i = 0; i < X86::AddrNumOperands; ++i)
25131 MIB.addOperand(MI.getOperand(i));
25132 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25133 MI.eraseFromParent(); // The pseudo instruction is gone now.
25137 MachineBasicBlock *
25138 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25139 MachineBasicBlock *BB) const {
25140 MachineFunction *MF = BB->getParent();
25141 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25142 DebugLoc DL = MI.getDebugLoc();
25143 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25145 assert(MF->shouldSplitStack());
25147 const bool Is64Bit = Subtarget.is64Bit();
25148 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25150 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25151 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25154 // ... [Till the alloca]
25155 // If stacklet is not large enough, jump to mallocMBB
25158 // Allocate by subtracting from RSP
25159 // Jump to continueMBB
25162 // Allocate by call to runtime
25166 // [rest of original BB]
25169 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25170 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25171 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25173 MachineRegisterInfo &MRI = MF->getRegInfo();
25174 const TargetRegisterClass *AddrRegClass =
25175 getRegClassFor(getPointerTy(MF->getDataLayout()));
25177 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25178 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25179 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25180 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25181 sizeVReg = MI.getOperand(1).getReg(),
25182 physSPReg =
25183 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25185 MachineFunction::iterator MBBIter = ++BB->getIterator();
25187 MF->insert(MBBIter, bumpMBB);
25188 MF->insert(MBBIter, mallocMBB);
25189 MF->insert(MBBIter, continueMBB);
25191 continueMBB->splice(continueMBB->begin(), BB,
25192 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25193 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25195 // Add code to the main basic block to check if the stack limit has been hit,
25196 // and if so, jump to mallocMBB otherwise to bumpMBB.
25197 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25198 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25199 .addReg(tmpSPVReg).addReg(sizeVReg);
25200 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25201 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25202 .addReg(SPLimitVReg);
25203 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25205 // bumpMBB simply decreases the stack pointer, since we know the current
25206 // stacklet has enough space.
25207 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25208 .addReg(SPLimitVReg);
25209 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25210 .addReg(SPLimitVReg);
25211 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25213 // Calls into a routine in libgcc to allocate more space from the heap.
25214 const uint32_t *RegMask =
25215 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
25217 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
25219 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25220 .addExternalSymbol("__morestack_allocate_stack_space")
25221 .addRegMask(RegMask)
25222 .addReg(X86::RDI, RegState::Implicit)
25223 .addReg(X86::RAX, RegState::ImplicitDefine);
25224 } else if (Is64Bit) {
25225 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
25227 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
25228 .addExternalSymbol("__morestack_allocate_stack_space")
25229 .addRegMask(RegMask)
25230 .addReg(X86::EDI, RegState::Implicit)
25231 .addReg(X86::EAX, RegState::ImplicitDefine);
25233 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
25235 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
25236 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
25237 .addExternalSymbol("__morestack_allocate_stack_space")
25238 .addRegMask(RegMask)
25239 .addReg(X86::EAX, RegState::ImplicitDefine);
25243 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
25246 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
25247 .addReg(IsLP64 ? X86::RAX : X86::EAX);
25248 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25250 // Set up the CFG correctly.
25251 BB->addSuccessor(bumpMBB);
25252 BB->addSuccessor(mallocMBB);
25253 mallocMBB->addSuccessor(continueMBB);
25254 bumpMBB->addSuccessor(continueMBB);
25256 // Take care of the PHI nodes.
25257 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
25258 MI.getOperand(0).getReg())
25259 .addReg(mallocPtrVReg)
25261 .addReg(bumpSPPtrVReg)
25264 // Delete the original pseudo instruction.
25265 MI.eraseFromParent();
25268 return continueMBB;
25271 MachineBasicBlock *
25272 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
25273 MachineBasicBlock *BB) const {
25274 MachineFunction *MF = BB->getParent();
25275 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25276 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
25277 DebugLoc DL = MI.getDebugLoc();
25279 assert(!isAsynchronousEHPersonality(
25280 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
25281 "SEH does not use catchret!");
25283 // Only 32-bit EH needs to worry about manually restoring stack pointers.
25284 if (!Subtarget.is32Bit())
25285 return BB;
25287 // C++ EH creates a new target block to hold the restore code, and wires up
25288 // the new block to the return destination with a normal JMP_4.
25289 MachineBasicBlock *RestoreMBB =
25290 MF->CreateMachineBasicBlock(BB->getBasicBlock());
25291 assert(BB->succ_size() == 1);
25292 MF->insert(std::next(BB->getIterator()), RestoreMBB);
25293 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
25294 BB->addSuccessor(RestoreMBB);
25295 MI.getOperand(0).setMBB(RestoreMBB);
25297 auto RestoreMBBI = RestoreMBB->begin();
25298 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
25299 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
25303 MachineBasicBlock *
25304 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
25305 MachineBasicBlock *BB) const {
25306 MachineFunction *MF = BB->getParent();
25307 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
25308 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
25309 // Only 32-bit SEH requires special handling for catchpad.
25310 if (IsSEH && Subtarget.is32Bit()) {
25311 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25312 DebugLoc DL = MI.getDebugLoc();
25313 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
25315 MI.eraseFromParent();
25319 MachineBasicBlock *
25320 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
25321 MachineBasicBlock *BB) const {
25322 // So, here we replace TLSADDR with the sequence:
25323 // adjust_stackdown -> TLSADDR -> adjust_stackup.
25324 // We need this because TLSADDR is lowered into calls
25325 // inside MC, therefore without the two markers shrink-wrapping
25326 // may push the prologue/epilogue past them.
25327 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
25328 DebugLoc DL = MI.getDebugLoc();
25329 MachineFunction &MF = *BB->getParent();
25331 // Emit CALLSEQ_START right before the instruction.
25332 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
25333 MachineInstrBuilder CallseqStart =
25334 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0);
25335 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
25337 // Emit CALLSEQ_END right after the instruction.
25338 // We don't call erase from parent because we want to keep the
25339 // original instruction around.
25340 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
25341 MachineInstrBuilder CallseqEnd =
25342 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
25343 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
25348 MachineBasicBlock *
25349 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
25350 MachineBasicBlock *BB) const {
25351 // This is pretty easy. We're taking the value that we received from
25352 // our load from the relocation, sticking it in either RDI (x86-64)
25353 // or EAX and doing an indirect call. The return value will then
25354 // be in the normal return register.
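// Editorial sketch of the 64-bit Darwin case handled below (not in the
// original comment):
//   movq  _var@TLVP(%rip), %rdi
//   callq *(%rdi)            ; thread-local address comes back in %rax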
25355 MachineFunction *F = BB->getParent();
25356 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25357 DebugLoc DL = MI.getDebugLoc();
25359 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
25360 assert(MI.getOperand(3).isGlobal() && "This should be a global");
25362 // Get a register mask for the lowered call.
25363 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
25364 // proper register mask.
25365 const uint32_t *RegMask =
25366 Subtarget.is64Bit() ?
25367 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
25368 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
25369 if (Subtarget.is64Bit()) {
25370 MachineInstrBuilder MIB =
25371 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
25375 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25376 MI.getOperand(3).getTargetFlags())
25378 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
25379 addDirectMem(MIB, X86::RDI);
25380 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
25381 } else if (!isPositionIndependent()) {
25382 MachineInstrBuilder MIB =
25383 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25387 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25388 MI.getOperand(3).getTargetFlags())
25390 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25391 addDirectMem(MIB, X86::EAX);
25392 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25394 MachineInstrBuilder MIB =
25395 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
25396 .addReg(TII->getGlobalBaseReg(F))
25399 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
25400 MI.getOperand(3).getTargetFlags())
25402 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
25403 addDirectMem(MIB, X86::EAX);
25404 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
25407 MI.eraseFromParent(); // The pseudo instruction is gone now.
25411 MachineBasicBlock *
25412 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
25413 MachineBasicBlock *MBB) const {
25414 DebugLoc DL = MI.getDebugLoc();
25415 MachineFunction *MF = MBB->getParent();
25416 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25417 MachineRegisterInfo &MRI = MF->getRegInfo();
25419 const BasicBlock *BB = MBB->getBasicBlock();
25420 MachineFunction::iterator I = ++MBB->getIterator();
25422 // Memory Reference
25423 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25424 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25426 unsigned DstReg;
25427 unsigned MemOpndSlot = 0;
25429 unsigned CurOp = 0;
25431 DstReg = MI.getOperand(CurOp++).getReg();
25432 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25433 assert(RC->hasType(MVT::i32) && "Invalid destination!");
25434 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25435 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
25437 MemOpndSlot = CurOp;
25439 MVT PVT = getPointerTy(MF->getDataLayout());
25440 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25441 "Invalid Pointer Size!");
25443 // For v = setjmp(buf), we generate
25446 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
25447 // SjLjSetup restoreMBB
25453 // v = phi(main, restore)
25456 // if the base pointer is being used, load it from the frame
25459 MachineBasicBlock *thisMBB = MBB;
25460 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25461 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25462 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
25463 MF->insert(I, mainMBB);
25464 MF->insert(I, sinkMBB);
25465 MF->push_back(restoreMBB);
25466 restoreMBB->setHasAddressTaken();
25468 MachineInstrBuilder MIB;
25470 // Transfer the remainder of BB and its successor edges to sinkMBB.
25471 sinkMBB->splice(sinkMBB->begin(), MBB,
25472 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25473 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25476 unsigned PtrStoreOpc = 0;
25477 unsigned LabelReg = 0;
25478 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25479 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25480 !isPositionIndependent();
25482 // Prepare IP either in reg or imm.
25483 if (!UseImmLabel) {
25484 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25485 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
25486 LabelReg = MRI.createVirtualRegister(PtrRC);
25487 if (Subtarget.is64Bit()) {
25488 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
25492 .addMBB(restoreMBB)
25495 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
25496 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
25497 .addReg(XII->getGlobalBaseReg(MF))
25500 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
25504 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25506 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
25507 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25508 if (i == X86::AddrDisp)
25509 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
25511 MIB.addOperand(MI.getOperand(MemOpndSlot + i));
25514 MIB.addReg(LabelReg);
25516 MIB.addMBB(restoreMBB);
25517 MIB.setMemRefs(MMOBegin, MMOEnd);
25519 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
25520 .addMBB(restoreMBB);
25522 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25523 MIB.addRegMask(RegInfo->getNoPreservedMask());
25524 thisMBB->addSuccessor(mainMBB);
25525 thisMBB->addSuccessor(restoreMBB);
25529 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
25530 mainMBB->addSuccessor(sinkMBB);
25533 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
25534 TII->get(X86::PHI), DstReg)
25535 .addReg(mainDstReg).addMBB(mainMBB)
25536 .addReg(restoreDstReg).addMBB(restoreMBB);
25539 if (RegInfo->hasBasePointer(*MF)) {
25540 const bool Uses64BitFramePtr =
25541 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25542 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
25543 X86FI->setRestoreBasePointer(MF);
25544 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
25545 unsigned BasePtr = RegInfo->getBaseRegister();
25546 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
25547 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
25548 FramePtr, true, X86FI->getRestoreBasePointerOffset())
25549 .setMIFlag(MachineInstr::FrameSetup);
25551 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
25552 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25553 restoreMBB->addSuccessor(sinkMBB);
25555 MI.eraseFromParent();
25559 MachineBasicBlock *
25560 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
25561 MachineBasicBlock *MBB) const {
25562 DebugLoc DL = MI.getDebugLoc();
25563 MachineFunction *MF = MBB->getParent();
25564 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25565 MachineRegisterInfo &MRI = MF->getRegInfo();
25567 // Memory Reference
25568 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25569 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25571 MVT PVT = getPointerTy(MF->getDataLayout());
25572 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
25573 "Invalid Pointer Size!");
25575 const TargetRegisterClass *RC =
25576 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25577 unsigned Tmp = MRI.createVirtualRegister(RC);
25578 // Since FP is only updated here but NOT referenced, it's treated as GPR.
25579 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
25580 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
25581 unsigned SP = RegInfo->getStackRegister();
25583 MachineInstrBuilder MIB;
25585 const int64_t LabelOffset = 1 * PVT.getStoreSize();
25586 const int64_t SPOffset = 2 * PVT.getStoreSize();
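// Editorial note (readable from the code below and from emitEHSjLjSetJmp):
// the jump buffer layout assumed here is pointer-sized slots with
//   buf[0] = frame pointer, buf[1] = resume address (LabelOffset),
//   buf[2] = stack pointer (SPOffset).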
25588 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
25589 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
25592 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
25593 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
25594 MIB.addOperand(MI.getOperand(i));
25595 MIB.setMemRefs(MMOBegin, MMOEnd);
25597 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
25598 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25599 if (i == X86::AddrDisp)
25600 MIB.addDisp(MI.getOperand(i), LabelOffset);
25602 MIB.addOperand(MI.getOperand(i));
25604 MIB.setMemRefs(MMOBegin, MMOEnd);
25606 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
25607 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
25608 if (i == X86::AddrDisp)
25609 MIB.addDisp(MI.getOperand(i), SPOffset);
25611 MIB.addOperand(MI.getOperand(i));
25613 MIB.setMemRefs(MMOBegin, MMOEnd);
25615 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
25617 MI.eraseFromParent();
25621 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
25622 MachineBasicBlock *MBB,
25623 MachineBasicBlock *DispatchBB,
25625 DebugLoc DL = MI.getDebugLoc();
25626 MachineFunction *MF = MBB->getParent();
25627 MachineRegisterInfo *MRI = &MF->getRegInfo();
25628 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25630 MVT PVT = getPointerTy(MF->getDataLayout());
25631 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
25636 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
25637 !isPositionIndependent();
25640 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
25642 const TargetRegisterClass *TRC =
25643 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
25644 VR = MRI->createVirtualRegister(TRC);
25645 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
25647 /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
25649 if (Subtarget.is64Bit())
25650 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
25654 .addMBB(DispatchBB)
25657 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
25658 .addReg(0) /* XII->getGlobalBaseReg(MF) */
25661 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
25665 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
25666 addFrameReference(MIB, FI, 36);
25668 MIB.addMBB(DispatchBB);
25673 MachineBasicBlock *
25674 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
25675 MachineBasicBlock *BB) const {
25676 DebugLoc DL = MI.getDebugLoc();
25677 MachineFunction *MF = BB->getParent();
25678 MachineFrameInfo &MFI = MF->getFrameInfo();
25679 MachineRegisterInfo *MRI = &MF->getRegInfo();
25680 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25681 int FI = MFI.getFunctionContextIndex();
25683 // Get a mapping of the call site numbers to all of the landing pads they're
25684 // associated with.
25685 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
25686 unsigned MaxCSNum = 0;
25687 for (auto &MBB : *MF) {
25688 if (!MBB.isEHPad())
25691 MCSymbol *Sym = nullptr;
25692 for (const auto &MI : MBB) {
25693 if (MI.isDebugValue())
25696 assert(MI.isEHLabel() && "expected EH_LABEL");
25697 Sym = MI.getOperand(0).getMCSymbol();
25701 if (!MF->hasCallSiteLandingPad(Sym))
25704 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
25705 CallSiteNumToLPad[CSI].push_back(&MBB);
25706 MaxCSNum = std::max(MaxCSNum, CSI);
25710 // Get an ordered list of the machine basic blocks for the jump table.
25711 std::vector<MachineBasicBlock *> LPadList;
25712 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
25713 LPadList.reserve(CallSiteNumToLPad.size());
25715 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
25716 for (auto &LP : CallSiteNumToLPad[CSI]) {
25717 LPadList.push_back(LP);
25718 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
25722 assert(!LPadList.empty() &&
25723 "No landing pad destinations for the dispatch jump table!");
25725 // Create the MBBs for the dispatch code.
25727 // Shove the dispatch's address into the return slot in the function context.
25728 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
25729 DispatchBB->setIsEHPad(true);
25731 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
25732 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
25733 DispatchBB->addSuccessor(TrapBB);
25735 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
25736 DispatchBB->addSuccessor(DispContBB);
25739 MF->push_back(DispatchBB);
25740 MF->push_back(DispContBB);
25741 MF->push_back(TrapBB);
25743 // Insert code into the entry block that creates and registers the function
25745 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
25747 // Create the jump table and associated information
25748 MachineJumpTableInfo *JTI =
25749 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
25750 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
25752 const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
25753 const X86RegisterInfo &RI = XII->getRegisterInfo();
25755 // Add a register mask with no preserved registers. This results in all
25756 // registers being marked as clobbered.
25757 if (RI.hasBasePointer(*MF)) {
25758 const bool FPIs64Bit =
25759 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
25760 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
25761 MFI->setRestoreBasePointer(MF);
25763 unsigned FP = RI.getFrameRegister(*MF);
25764 unsigned BP = RI.getBaseRegister();
25765 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
25766 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
25767 MFI->getRestoreBasePointerOffset())
25768 .addRegMask(RI.getNoPreservedMask());
25770 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
25771 .addRegMask(RI.getNoPreservedMask());
25774 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25775 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
25777 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
25779 .addImm(LPadList.size());
25780 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
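// Editorial sketch (illustrative) of the dispatch code built here and in
// DispContBB below, for the 64-bit case:
//   movl  <call-site slot>(frame), %eax
//   cmpl  $NumLPads, %eax
//   ja    <TrapBB>             ; bogus call-site index
//   subl  $1, %eax             ; call-site indices are 1-based
//   jmpq  *<jump table>(,%rax,8)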
25782 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
25783 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
25786 BuildMI(DispContBB, DL,
25787 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
25789 .addImm(Subtarget.is64Bit() ? 8 : 4)
25791 .addJumpTableIndex(MJTI)
25794 // Add the jump table entries as successors to the MBB.
25795 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
25796 for (auto &LP : LPadList)
25797 if (SeenMBBs.insert(LP).second)
25798 DispContBB->addSuccessor(LP);
25800 // N.B. the order the invoke BBs are processed in doesn't matter here.
25801 SmallVector<MachineBasicBlock *, 64> MBBLPads;
25802 const MCPhysReg *SavedRegs =
25803 Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
25804 for (MachineBasicBlock *MBB : InvokeBBs) {
25805 // Remove the landing pad successor from the invoke block and replace it
25806 // with the new dispatch block.
25807 // Keep a copy of Successors since it's modified inside the loop.
25808 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
25810 // FIXME: Avoid quadratic complexity.
25811 for (auto MBBS : Successors) {
25812 if (MBBS->isEHPad()) {
25813 MBB->removeSuccessor(MBBS);
25814 MBBLPads.push_back(MBBS);
25818 MBB->addSuccessor(DispatchBB);
25820 // Find the invoke call and mark all of the callee-saved registers as
25821 // 'implicit defined' so that they're spilled. This prevents code from
25822 // moving instructions to before the EH block, where they will never be
25823 // executed.
25824 for (auto &II : reverse(*MBB)) {
25828 DenseMap<unsigned, bool> DefRegs;
25829 for (auto &MOp : II.operands())
25831 DefRegs[MOp.getReg()] = true;
25833 MachineInstrBuilder MIB(*MF, &II);
25834 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
25835 unsigned Reg = SavedRegs[RI];
25837 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
25844 // Mark all former landing pads as non-landing pads. The dispatch is the only
25845 // landing pad now.
25846 for (auto &LP : MBBLPads)
25847 LP->setIsEHPad(false);
25849 // The instruction is gone now.
25850 MI.eraseFromParent();
25854 MachineBasicBlock *
25855 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
25856 MachineBasicBlock *BB) const {
25857 MachineFunction *MF = BB->getParent();
25858 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25859 DebugLoc DL = MI.getDebugLoc();
25861 switch (MI.getOpcode()) {
25862 default: llvm_unreachable("Unexpected instr type to insert");
25863 case X86::TAILJMPd64:
25864 case X86::TAILJMPr64:
25865 case X86::TAILJMPm64:
25866 case X86::TAILJMPr64_REX:
25867 case X86::TAILJMPm64_REX:
25868 llvm_unreachable("TAILJMP64 would not be touched here.");
25869 case X86::TCRETURNdi64:
25870 case X86::TCRETURNri64:
25871 case X86::TCRETURNmi64:
25873 case X86::TLS_addr32:
25874 case X86::TLS_addr64:
25875 case X86::TLS_base_addr32:
25876 case X86::TLS_base_addr64:
25877 return EmitLoweredTLSAddr(MI, BB);
25878 case X86::CATCHRET:
25879 return EmitLoweredCatchRet(MI, BB);
25880 case X86::CATCHPAD:
25881 return EmitLoweredCatchPad(MI, BB);
25882 case X86::SEG_ALLOCA_32:
25883 case X86::SEG_ALLOCA_64:
25884 return EmitLoweredSegAlloca(MI, BB);
25885 case X86::TLSCall_32:
25886 case X86::TLSCall_64:
25887 return EmitLoweredTLSCall(MI, BB);
25888 case X86::CMOV_FR32:
25889 case X86::CMOV_FR64:
25890 case X86::CMOV_FR128:
25891 case X86::CMOV_GR8:
25892 case X86::CMOV_GR16:
25893 case X86::CMOV_GR32:
25894 case X86::CMOV_RFP32:
25895 case X86::CMOV_RFP64:
25896 case X86::CMOV_RFP80:
25897 case X86::CMOV_V2F64:
25898 case X86::CMOV_V2I64:
25899 case X86::CMOV_V4F32:
25900 case X86::CMOV_V4F64:
25901 case X86::CMOV_V4I64:
25902 case X86::CMOV_V16F32:
25903 case X86::CMOV_V8F32:
25904 case X86::CMOV_V8F64:
25905 case X86::CMOV_V8I64:
25906 case X86::CMOV_V8I1:
25907 case X86::CMOV_V16I1:
25908 case X86::CMOV_V32I1:
25909 case X86::CMOV_V64I1:
25910 return EmitLoweredSelect(MI, BB);
25912 case X86::RDFLAGS32:
25913 case X86::RDFLAGS64: {
unsigned PushF =
25915 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
25916 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
25917 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
25918 // Permit reads of the FLAGS register without it being defined.
25919 // This intrinsic exists to read external processor state in flags, such as
25920 // the trap flag, interrupt flag, and direction flag, none of which are
25921 // modeled by the backend.
25922 Push->getOperand(2).setIsUndef();
25923 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
25925 MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
25929 case X86::WRFLAGS32:
25930 case X86::WRFLAGS64: {
unsigned Push =
25932 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
unsigned PopF =
25934 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
25935 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
25936 BuildMI(*BB, MI, DL, TII->get(PopF));
25938 MI.eraseFromParent(); // The pseudo is gone now.
return BB;
}
25942 case X86::RELEASE_FADD32mr:
25943 case X86::RELEASE_FADD64mr:
25944 return EmitLoweredAtomicFP(MI, BB);
25946 case X86::FP32_TO_INT16_IN_MEM:
25947 case X86::FP32_TO_INT32_IN_MEM:
25948 case X86::FP32_TO_INT64_IN_MEM:
25949 case X86::FP64_TO_INT16_IN_MEM:
25950 case X86::FP64_TO_INT32_IN_MEM:
25951 case X86::FP64_TO_INT64_IN_MEM:
25952 case X86::FP80_TO_INT16_IN_MEM:
25953 case X86::FP80_TO_INT32_IN_MEM:
25954 case X86::FP80_TO_INT64_IN_MEM: {
25955 // Change the floating point control register to use "round towards zero"
25956 // mode when truncating to an integer value.
25957 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
25958 addFrameReference(BuildMI(*BB, MI, DL,
25959 TII->get(X86::FNSTCW16m)), CWFrameIdx);
25961 // Load the old value of the control word...
unsigned OldCW =
25963 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
25964 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
25967 // Set the high part to be round to zero...
25968 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
25971 // Reload the modified control word now...
25972 addFrameReference(BuildMI(*BB, MI, DL,
25973 TII->get(X86::FLDCW16m)), CWFrameIdx);
25975 // Restore the memory image of the control word to its original value.
25976 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
.addReg(OldCW);
25979 // Get the X86 opcode to use.
unsigned Opc;
25981 switch (MI.getOpcode()) {
25982 default: llvm_unreachable("illegal opcode!");
25983 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
25984 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
25985 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
25986 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
25987 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
25988 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
25989 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
25990 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
25991 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
25994 X86AddressMode AM = getAddressFromInstr(&MI, 0);
25995 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
25996 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
25998 // Reload the original control word now.
25999 addFrameReference(BuildMI(*BB, MI, DL,
26000 TII->get(X86::FLDCW16m)), CWFrameIdx);
26002 MI.eraseFromParent(); // The pseudo instruction is gone now.
return BB;
}
26005 // String/text processing lowering.
26006 case X86::PCMPISTRM128REG:
26007 case X86::VPCMPISTRM128REG:
26008 case X86::PCMPISTRM128MEM:
26009 case X86::VPCMPISTRM128MEM:
26010 case X86::PCMPESTRM128REG:
26011 case X86::VPCMPESTRM128REG:
26012 case X86::PCMPESTRM128MEM:
26013 case X86::VPCMPESTRM128MEM:
26014 assert(Subtarget.hasSSE42() &&
26015 "Target must have SSE4.2 or AVX features enabled");
26016 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26018 // String/text processing lowering.
26019 case X86::PCMPISTRIREG:
26020 case X86::VPCMPISTRIREG:
26021 case X86::PCMPISTRIMEM:
26022 case X86::VPCMPISTRIMEM:
26023 case X86::PCMPESTRIREG:
26024 case X86::VPCMPESTRIREG:
26025 case X86::PCMPESTRIMEM:
26026 case X86::VPCMPESTRIMEM:
26027 assert(Subtarget.hasSSE42() &&
26028 "Target must have SSE4.2 or AVX features enabled");
26029 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26031 // Thread synchronization.
case X86::MONITOR:
26033 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26034 case X86::MONITORX:
26035 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
case X86::WRPKRU:
26038 return emitWRPKRU(MI, BB, Subtarget);
case X86::RDPKRU:
26040 return emitRDPKRU(MI, BB, Subtarget);
case X86::XBEGIN:
26043 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26045 case X86::VASTART_SAVE_XMM_REGS:
26046 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26048 case X86::VAARG_64:
26049 return EmitVAARG64WithCustomInserter(MI, BB);
26051 case X86::EH_SjLj_SetJmp32:
26052 case X86::EH_SjLj_SetJmp64:
26053 return emitEHSjLjSetJmp(MI, BB);
26055 case X86::EH_SjLj_LongJmp32:
26056 case X86::EH_SjLj_LongJmp64:
26057 return emitEHSjLjLongJmp(MI, BB);
26059 case X86::Int_eh_sjlj_setup_dispatch:
26060 return EmitSjLjDispatchBlock(MI, BB);
26062 case TargetOpcode::STATEPOINT:
26063 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26064 // this point in the process. We diverge later.
26065 return emitPatchPoint(MI, BB);
26067 case TargetOpcode::STACKMAP:
26068 case TargetOpcode::PATCHPOINT:
26069 return emitPatchPoint(MI, BB);
26071 case X86::LCMPXCHG8B: {
26072 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26073 // In addition to the four E[ABCD] registers implied by its encoding, CMPXCHG8B
26074 // requires a memory operand. If the current architecture is i686 and the
26075 // current function needs a base pointer - which is ESI on i686 - the
26076 // register allocator would not be able to allocate registers for an address
26077 // of the form X(%reg, %reg, Y): there would never be enough unreserved
26078 // registers during regalloc (without the base pointer the only option would
26079 // be X(%edi, %esi, Y)).
26080 // We give the register allocator a hand by precomputing the address in a
26081 // new vreg using LEA.
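// For example, a minimal sketch of the rewrite (register names are only
// illustrative):
//   cmpxchg8b 24(%esi,%ecx,4)
// becomes
//   leal      24(%esi,%ecx,4), %vreg
//   cmpxchg8b (%vreg)
// leaving regalloc with a simple base-only address.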
26083 // If it is not i686 or there is no base pointer - nothing to do here.
26084 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26087 // Even though this code does not necessarily need the base pointer to
26088 // be ESI, we check for that. The reason: if this assert fails, something
26089 // has changed in the compiler's base-pointer handling, which most
26090 // probably has to be addressed here as well.
26091 assert(TRI->getBaseRegister() == X86::ESI &&
26092 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26093 "base pointer in mind");
26095 MachineRegisterInfo &MRI = MF->getRegInfo();
26096 MVT SPTy = getPointerTy(MF->getDataLayout());
26097 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26098 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26100 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26101 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26102 // does not use index register.
26103 if (AM.IndexReg == X86::NoRegister)
26106 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26107 // four operand definitions that are E[ABCD] registers. We skip them and
26108 // then insert the LEA.
26109 MachineBasicBlock::iterator MBBI(MI);
26110 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26111 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26114 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26116 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26120 case X86::LCMPXCHG16B:
26122 case X86::LCMPXCHG8B_SAVE_EBX:
26123 case X86::LCMPXCHG16B_SAVE_RBX: {
unsigned BasePtr =
26125 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26126 if (!BB->isLiveIn(BasePtr))
26127 BB->addLiveIn(BasePtr);
26133 //===----------------------------------------------------------------------===//
26134 // X86 Optimization Hooks
26135 //===----------------------------------------------------------------------===//
26137 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26140 const SelectionDAG &DAG,
26141 unsigned Depth) const {
26142 unsigned BitWidth = KnownZero.getBitWidth();
26143 unsigned Opc = Op.getOpcode();
26144 assert((Opc >= ISD::BUILTIN_OP_END ||
26145 Opc == ISD::INTRINSIC_WO_CHAIN ||
26146 Opc == ISD::INTRINSIC_W_CHAIN ||
26147 Opc == ISD::INTRINSIC_VOID) &&
26148 "Should use MaskedValueIsZero if you don't know whether Op"
26149 " is a target node!");
26151 KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
26165 // These nodes' second result is a boolean.
26166 if (Op.getResNo() == 0)
26169 case X86ISD::SETCC:
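// SETCC produces only 0 or 1, so every bit above bit 0 is known to be zero.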
26170 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
26172 case X86ISD::MOVMSK: {
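// MOVMSK writes one bit per input vector element into the low bits of the
// result, so all bits above NumLoBits are known to be zero.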
26173 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26174 KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
26177 case X86ISD::VZEXT: {
26178 SDValue N0 = Op.getOperand(0);
26179 unsigned NumElts = Op.getValueType().getVectorNumElements();
26180 unsigned InNumElts = N0.getValueType().getVectorNumElements();
26181 unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
26183 KnownZero = KnownOne = APInt(InBitWidth, 0);
26184 APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
26185 DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
26186 KnownOne = KnownOne.zext(BitWidth);
26187 KnownZero = KnownZero.zext(BitWidth);
26188 KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
26194 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
26195 SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
26196 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
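// Every result bit is therefore a copy of the same value, so all bits are
// sign bits and the full scalar width can be reported.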
26197 if (Op.getOpcode() == X86ISD::SETCC_CARRY)
26198 return Op.getScalarValueSizeInBits();
26200 if (Op.getOpcode() == X86ISD::VSEXT) {
26201 EVT VT = Op.getValueType();
26202 EVT SrcVT = Op.getOperand(0).getValueType();
26203 unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
26204 Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
26212 /// Returns true (and the GlobalValue and the offset) if the node is a
26213 /// GlobalAddress + offset.
26214 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
26215 const GlobalValue* &GA,
26216 int64_t &Offset) const {
26217 if (N->getOpcode() == X86ISD::Wrapper) {
26218 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
26219 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
26220 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
26224 return TargetLowering::isGAPlusOffset(N, GA, Offset);
26227 // Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
26229 // TODO: Investigate sharing more of this with shuffle lowering.
26230 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26232 const X86Subtarget &Subtarget,
26233 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
26234 unsigned NumMaskElts = Mask.size();
26235 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
26237 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
26238 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
26239 isUndefOrEqual(Mask[0], 0) &&
26240 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
26241 Shuffle = X86ISD::VZEXT_MOVL;
26242 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
26246 // Match against a VZEXT instruction.
26247 // TODO: Add 256/512-bit vector support.
26248 if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
26249 unsigned MaxScale = 64 / MaskEltSize;
26250 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
26252 unsigned NumDstElts = NumMaskElts / Scale;
26253 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
26254 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
26255 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
26259 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
26260 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
26261 Shuffle = X86ISD::VZEXT;
26267 // Check whether we have SSE3, which lets us use MOVDDUP etc. These
26268 // instructions are no slower than UNPCKLPD but have the option to fold
26269 // the input operand from memory, even from an unaligned load.
26270 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
26271 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
26272 Shuffle = X86ISD::MOVDDUP;
26273 SrcVT = DstVT = MVT::v2f64;
26276 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26277 Shuffle = X86ISD::MOVSLDUP;
26278 SrcVT = DstVT = MVT::v4f32;
26281 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
26282 Shuffle = X86ISD::MOVSHDUP;
26283 SrcVT = DstVT = MVT::v4f32;
26288 if (MaskVT.is256BitVector() && FloatDomain) {
26289 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
26290 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
26291 Shuffle = X86ISD::MOVDDUP;
26292 SrcVT = DstVT = MVT::v4f64;
26295 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26296 Shuffle = X86ISD::MOVSLDUP;
26297 SrcVT = DstVT = MVT::v8f32;
26300 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
26301 Shuffle = X86ISD::MOVSHDUP;
26302 SrcVT = DstVT = MVT::v8f32;
26307 if (MaskVT.is512BitVector() && FloatDomain) {
26308 assert(Subtarget.hasAVX512() &&
26309 "AVX512 required for 512-bit vector shuffles");
26310 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
26311 Shuffle = X86ISD::MOVDDUP;
26312 SrcVT = DstVT = MVT::v8f64;
26315 if (isTargetShuffleEquivalent(
26316 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
26317 Shuffle = X86ISD::MOVSLDUP;
26318 SrcVT = DstVT = MVT::v16f32;
26321 if (isTargetShuffleEquivalent(
26322 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
26323 Shuffle = X86ISD::MOVSHDUP;
26324 SrcVT = DstVT = MVT::v16f32;
26329 // Attempt to match against broadcast-from-vector.
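// An all-zero mask, e.g. <0, 0, 0, 0>, reads element 0 into every lane, which
// is exactly what VBROADCAST does.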
26330 if (Subtarget.hasAVX2()) {
26331 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
26332 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
26333 SrcVT = DstVT = MaskVT;
26334 Shuffle = X86ISD::VBROADCAST;
26342 // Attempt to match a combined shuffle mask against supported unary immediate
26343 // permute instructions.
26344 // TODO: Investigate sharing more of this with shuffle lowering.
26345 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26347 const X86Subtarget &Subtarget,
26348 unsigned &Shuffle, MVT &ShuffleVT,
26349 unsigned &PermuteImm) {
26350 unsigned NumMaskElts = Mask.size();
26352 bool ContainsZeros = false;
26353 SmallBitVector Zeroable(NumMaskElts, false);
26354 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
26356 Zeroable[i] = isUndefOrZero(M);
26357 ContainsZeros |= (M == SM_SentinelZero);
26360 // Attempt to match against byte/bit shifts.
26361 // FIXME: Add 512-bit support.
26362 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26363 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26364 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
26365 MaskVT.getScalarSizeInBits(), Mask,
26366 0, Zeroable, Subtarget);
26367 if (0 < ShiftAmt) {
26368 PermuteImm = (unsigned)ShiftAmt;
26373 // Ensure we don't contain any zero elements.
if (ContainsZeros)
return false;
26377 assert(llvm::all_of(Mask, [&](int M) {
26378 return SM_SentinelUndef <= M && M < (int)NumMaskElts;
26379 }) && "Expected unary shuffle");
26381 unsigned InputSizeInBits = MaskVT.getSizeInBits();
26382 unsigned MaskScalarSizeInBits = InputSizeInBits / Mask.size();
26383 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
26385 // Handle PSHUFLW/PSHUFHW repeated patterns.
26386 if (MaskScalarSizeInBits == 16) {
26387 SmallVector<int, 4> RepeatedMask;
26388 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
26389 ArrayRef<int> LoMask(Mask.data() + 0, 4);
26390 ArrayRef<int> HiMask(Mask.data() + 4, 4);
26392 // PSHUFLW: permute lower 4 elements only.
26393 if (isUndefOrInRange(LoMask, 0, 4) &&
26394 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
26395 Shuffle = X86ISD::PSHUFLW;
26396 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26397 PermuteImm = getV4X86ShuffleImm(LoMask);
26401 // PSHUFHW: permute upper 4 elements only.
26402 if (isUndefOrInRange(HiMask, 4, 8) &&
26403 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
26404 // Offset the HiMask so that we can create the shuffle immediate.
26405 int OffsetHiMask[4];
26406 for (int i = 0; i != 4; ++i)
26407 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
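// For example, a HiMask of <5, 4, 7, 6> is offset to <1, 0, 3, 2>, which
// getV4X86ShuffleImm encodes as 0xB1.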
26409 Shuffle = X86ISD::PSHUFHW;
26410 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
26411 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
26420 // We only support permutation of 32/64 bit elements after this.
26421 if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64)
26424 // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
26425 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
26426 if (FloatDomain && !Subtarget.hasAVX())
26429 // Pre-AVX2 we must use float shuffles on 256-bit vectors.
26430 if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
26431 FloatDomain = true;
26433 // Check for lane crossing permutes.
26434 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
26435 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
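// For example, a v4f64/v4i64 mask of <2, 3, 0, 1> (swap the two 128-bit
// halves) yields a PERMPD/PERMQ immediate of 0x4E.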
26436 if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
26437 Shuffle = X86ISD::VPERMI;
26438 ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26439 PermuteImm = getV4X86ShuffleImm(Mask);
26442 if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) {
26443 SmallVector<int, 4> RepeatedMask;
26444 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
26445 Shuffle = X86ISD::VPERMI;
26446 ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
26447 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
26454 // VPERMILPD can permute with a non-repeating shuffle.
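// Each mask element selects within its own 64-bit pair, so only bit (M & 1)
// matters per element; e.g. <1, 0, 3, 3> produces PermuteImm 0b1101.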
26455 if (FloatDomain && MaskScalarSizeInBits == 64) {
26456 Shuffle = X86ISD::VPERMILPI;
26457 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
PermuteImm = 0;
26459 for (int i = 0, e = Mask.size(); i != e; ++i) {
int M = Mask[i];
26461 if (M == SM_SentinelUndef)
26463 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
26464 PermuteImm |= (M & 1) << i;
26469 // We need a repeating shuffle mask for VPERMILPS/PSHUFD.
26470 SmallVector<int, 4> RepeatedMask;
26471 if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask))
26474 // Narrow the repeated mask for 32-bit element permutes.
26475 SmallVector<int, 4> WordMask = RepeatedMask;
26476 if (MaskScalarSizeInBits == 64)
26477 scaleShuffleMask(2, RepeatedMask, WordMask);
26479 Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
26480 ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
26481 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
26482 PermuteImm = getV4X86ShuffleImm(WordMask);
26486 // Attempt to match a combined unary shuffle mask against supported binary
26487 // shuffle instructions.
26488 // TODO: Investigate sharing more of this with shuffle lowering.
26489 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26490 bool FloatDomain, SDValue &V1, SDValue &V2,
26491 const X86Subtarget &Subtarget,
26492 unsigned &Shuffle, MVT &ShuffleVT,
26494 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
26496 if (MaskVT.is128BitVector()) {
26497 if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
26499 Shuffle = X86ISD::MOVLHPS;
26500 ShuffleVT = MVT::v4f32;
26503 if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
26505 Shuffle = X86ISD::MOVHLPS;
26506 ShuffleVT = MVT::v4f32;
26509 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
26510 (FloatDomain || !Subtarget.hasSSE41())) {
26512 Shuffle = X86ISD::MOVSD;
26513 ShuffleVT = MaskVT;
26516 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
26517 (FloatDomain || !Subtarget.hasSSE41())) {
26518 Shuffle = X86ISD::MOVSS;
26519 ShuffleVT = MaskVT;
26524 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
26525 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26526 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
26527 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
26528 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
26529 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
26530 MVT LegalVT = MaskVT;
26531 if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
26532 LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
26534 SmallVector<int, 64> Unpckl, Unpckh;
26536 createUnpackShuffleMask(MaskVT, Unpckl, true, true);
26537 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26539 Shuffle = X86ISD::UNPCKL;
26540 ShuffleVT = LegalVT;
26544 createUnpackShuffleMask(MaskVT, Unpckh, false, true);
26545 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26547 Shuffle = X86ISD::UNPCKH;
26548 ShuffleVT = LegalVT;
26552 createUnpackShuffleMask(MaskVT, Unpckl, true, false);
26553 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26554 Shuffle = X86ISD::UNPCKL;
26555 ShuffleVT = LegalVT;
26559 createUnpackShuffleMask(MaskVT, Unpckh, false, false);
26560 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26561 Shuffle = X86ISD::UNPCKH;
26562 ShuffleVT = LegalVT;
26566 ShuffleVectorSDNode::commuteMask(Unpckl);
26567 if (isTargetShuffleEquivalent(Mask, Unpckl)) {
26569 Shuffle = X86ISD::UNPCKL;
26570 ShuffleVT = LegalVT;
26574 ShuffleVectorSDNode::commuteMask(Unpckh);
26575 if (isTargetShuffleEquivalent(Mask, Unpckh)) {
26577 Shuffle = X86ISD::UNPCKH;
26578 ShuffleVT = LegalVT;
26587 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
26589 SDValue &V1, SDValue &V2,
26590 SDLoc &DL, SelectionDAG &DAG,
26591 const X86Subtarget &Subtarget,
26592 unsigned &Shuffle, MVT &ShuffleVT,
26593 unsigned &PermuteImm) {
26594 unsigned NumMaskElts = Mask.size();
26596 // Attempt to match against PALIGNR byte rotate.
26597 if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
26598 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
26599 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
26600 if (0 < ByteRotation) {
26601 Shuffle = X86ISD::PALIGNR;
26602 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
26603 PermuteImm = ByteRotation;
26608 // Attempt to combine to X86ISD::BLENDI.
26609 if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
26610 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
26611 // Determine a type compatible with X86ISD::BLENDI.
26612 // TODO - add 16i16 support (requires lane duplication).
26613 MVT BlendVT = MaskVT;
26614 if (Subtarget.hasAVX2()) {
26615 if (BlendVT == MVT::v4i64)
26616 BlendVT = MVT::v8i32;
26617 else if (BlendVT == MVT::v2i64)
26618 BlendVT = MVT::v4i32;
} else {
26620 if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
26621 BlendVT = MVT::v8i16;
26622 else if (BlendVT == MVT::v4i64)
26623 BlendVT = MVT::v4f64;
26624 else if (BlendVT == MVT::v8i32)
26625 BlendVT = MVT::v8f32;
26628 unsigned BlendSize = BlendVT.getVectorNumElements();
26629 unsigned MaskRatio = BlendSize / NumMaskElts;
26631 // Can we blend with zero?
26632 if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
26634 NumMaskElts <= BlendVT.getVectorNumElements()) {
26636 for (unsigned i = 0; i != BlendSize; ++i)
26637 if (Mask[i / MaskRatio] < 0)
26638 PermuteImm |= 1u << i;
26640 V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
26641 Shuffle = X86ISD::BLENDI;
26642 ShuffleVT = BlendVT;
26646 // Attempt to match as a binary blend.
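// For example, with a v4i32 mask <0, 5, 2, 7>, elements 1 and 3 come from V2.
// On AVX2 (BlendVT stays v4i32) the immediate is 0b1010; the pre-AVX2 v8i16
// form doubles each bit, giving 0b11001100.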
26647 if (NumMaskElts <= BlendVT.getVectorNumElements()) {
26648 bool MatchBlend = true;
26649 for (int i = 0; i != (int)NumMaskElts; ++i) {
int M = Mask[i];
26651 if (M == SM_SentinelUndef)
26653 else if (M == SM_SentinelZero)
26654 MatchBlend = false;
26655 else if ((M != i) && (M != (i + (int)NumMaskElts)))
26656 MatchBlend = false;
26661 for (unsigned i = 0; i != BlendSize; ++i)
26662 if ((int)NumMaskElts <= Mask[i / MaskRatio])
26663 PermuteImm |= 1u << i;
26665 Shuffle = X86ISD::BLENDI;
26666 ShuffleVT = BlendVT;
26672 // Attempt to combine to INSERTPS.
26673 if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
26674 SmallBitVector Zeroable(4, false);
26675 for (unsigned i = 0; i != NumMaskElts; ++i)
26677 Zeroable[i] = true;
26679 if (Zeroable.any() &&
26680 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
26681 Shuffle = X86ISD::INSERTPS;
26682 ShuffleVT = MVT::v4f32;
26687 // Attempt to combine to SHUFPD.
26688 if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
26689 (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
26690 (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
26691 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
26692 Shuffle = X86ISD::SHUFP;
26693 ShuffleVT = MaskVT;
26698 // Attempt to combine to SHUFPS.
26699 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
26700 (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
26701 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
26702 SmallVector<int, 4> RepeatedMask;
26703 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
26704 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
26705 int M0 = RepeatedMask[Offset];
26706 int M1 = RepeatedMask[Offset + 1];
26708 if (isUndefInRange(RepeatedMask, Offset, 2)) {
26709 return DAG.getUNDEF(MaskVT);
26710 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
26711 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
26712 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
26713 return getZeroVector(MaskVT, Subtarget, DAG, DL);
26714 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
26715 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26716 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26718 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
26719 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
26720 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
26727 int ShufMask[4] = {-1, -1, -1, -1};
26728 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
26729 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
26734 Shuffle = X86ISD::SHUFP;
26735 ShuffleVT = MaskVT;
26736 PermuteImm = getV4X86ShuffleImm(ShufMask);
26745 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
/// possible.
26748 /// This is the leaf of the recursive combine below. When we have found some
26749 /// chain of single-use x86 shuffle instructions and accumulated the combined
26750 /// shuffle mask represented by them, this will try to pattern match that mask
26751 /// into either a single instruction if there is a special purpose instruction
26752 /// for this operation, or into a PSHUFB instruction which is a fully general
26753 /// instruction but should only be used to replace chains over a certain depth.
26754 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
26755 ArrayRef<int> BaseMask, int Depth,
26756 bool HasVariableMask, SelectionDAG &DAG,
26757 TargetLowering::DAGCombinerInfo &DCI,
26758 const X86Subtarget &Subtarget) {
26759 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
26760 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
26761 "Unexpected number of shuffle inputs!");
26763 // Find the inputs that enter the chain. Note that multiple uses are OK
26764 // here, we're not going to remove the operands we find.
26765 bool UnaryShuffle = (Inputs.size() == 1);
26766 SDValue V1 = peekThroughBitcasts(Inputs[0]);
26767 SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
26769 MVT VT1 = V1.getSimpleValueType();
26770 MVT VT2 = V2.getSimpleValueType();
26771 MVT RootVT = Root.getSimpleValueType();
26772 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
26773 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
26774 "Vector size mismatch");
26779 unsigned NumBaseMaskElts = BaseMask.size();
26780 if (NumBaseMaskElts == 1) {
26781 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
26782 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26787 unsigned RootSizeInBits = RootVT.getSizeInBits();
26788 unsigned NumRootElts = RootVT.getVectorNumElements();
26789 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
26790 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
26791 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
26793 // Don't combine if we are a AVX512/EVEX target and the mask element size
26794 // is different from the root element size - this would prevent writemasks
26795 // from being reused.
26796 // TODO - this currently prevents all lane shuffles from occurring.
26797 // TODO - check for writemasks usage instead of always preventing combining.
26798 // TODO - attempt to narrow Mask back to writemask size.
26799 bool IsEVEXShuffle =
26800 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
26801 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
26804 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
26806 // Handle 128-bit lane shuffles of 256-bit vectors.
26807 // TODO - this should support binary shuffles.
26808 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
26809 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
26810 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
26811 return false; // Nothing to do!
26812 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
26813 unsigned PermMask = 0;
26814 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
26815 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
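// For example, a widened BaseMask of <1, 0> (swap the 128-bit halves of V1)
// gives PermMask 0x01; a negative (zeroable) half sets the 0x8 zeroing bit
// for that half instead.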
26817 Res = DAG.getBitcast(ShuffleVT, V1);
26818 DCI.AddToWorklist(Res.getNode());
26819 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
26820 DAG.getUNDEF(ShuffleVT),
26821 DAG.getConstant(PermMask, DL, MVT::i8));
26822 DCI.AddToWorklist(Res.getNode());
26823 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26828 // For masks that have been widened to 128-bit elements or more,
26829 // narrow back down to 64-bit elements.
26830 SmallVector<int, 64> Mask;
26831 if (BaseMaskEltSizeInBits > 64) {
26832 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
26833 int MaskScale = BaseMaskEltSizeInBits / 64;
26834 scaleShuffleMask(MaskScale, BaseMask, Mask);
} else {
26836 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
26839 unsigned NumMaskElts = Mask.size();
26840 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
26842 // Determine the effective mask value type.
26843 FloatDomain &= (32 <= MaskEltSizeInBits);
26844 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
26845 : MVT::getIntegerVT(MaskEltSizeInBits);
26846 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
26848 // Only allow legal mask types.
26849 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
26852 // Attempt to match the mask against known shuffle patterns.
26853 MVT ShuffleSrcVT, ShuffleVT;
26854 unsigned Shuffle, PermuteImm;
26856 if (UnaryShuffle) {
26857 // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
26858 // directly if we don't shuffle the lower element and we shuffle the upper
26859 // (zero) elements within themselves.
26860 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
26861 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
26862 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
26863 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
26864 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
26865 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
26866 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
26872 if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
26873 ShuffleSrcVT, ShuffleVT)) {
26874 if (Depth == 1 && Root.getOpcode() == Shuffle)
26875 return false; // Nothing to do!
26876 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26877 return false; // AVX512 Writemask clash.
26878 Res = DAG.getBitcast(ShuffleSrcVT, V1);
26879 DCI.AddToWorklist(Res.getNode());
26880 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
26881 DCI.AddToWorklist(Res.getNode());
26882 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26887 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
26888 Shuffle, ShuffleVT, PermuteImm)) {
26889 if (Depth == 1 && Root.getOpcode() == Shuffle)
26890 return false; // Nothing to do!
26891 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26892 return false; // AVX512 Writemask clash.
26893 Res = DAG.getBitcast(ShuffleVT, V1);
26894 DCI.AddToWorklist(Res.getNode());
26895 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
26896 DAG.getConstant(PermuteImm, DL, MVT::i8));
26897 DCI.AddToWorklist(Res.getNode());
26898 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26904 if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
26905 Shuffle, ShuffleVT, UnaryShuffle)) {
26906 if (Depth == 1 && Root.getOpcode() == Shuffle)
26907 return false; // Nothing to do!
26908 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26909 return false; // AVX512 Writemask clash.
26910 V1 = DAG.getBitcast(ShuffleVT, V1);
26911 DCI.AddToWorklist(V1.getNode());
26912 V2 = DAG.getBitcast(ShuffleVT, V2);
26913 DCI.AddToWorklist(V2.getNode());
26914 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
26915 DCI.AddToWorklist(Res.getNode());
26916 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26921 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
26922 DAG, Subtarget, Shuffle, ShuffleVT,
26924 if (Depth == 1 && Root.getOpcode() == Shuffle)
26925 return false; // Nothing to do!
26926 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
26927 return false; // AVX512 Writemask clash.
26928 V1 = DAG.getBitcast(ShuffleVT, V1);
26929 DCI.AddToWorklist(V1.getNode());
26930 V2 = DAG.getBitcast(ShuffleVT, V2);
26931 DCI.AddToWorklist(V2.getNode());
26932 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
26933 DAG.getConstant(PermuteImm, DL, MVT::i8));
26934 DCI.AddToWorklist(Res.getNode());
26935 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26940 // Don't try to re-form single instruction chains under any circumstances now
26941 // that we've done encoding canonicalization for them.
26945 bool MaskContainsZeros =
26946 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
26948 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
26949 // If we have a single input lane-crossing shuffle then lower to VPERMV.
26950 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
26951 ((Subtarget.hasAVX2() &&
26952 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26953 (Subtarget.hasAVX512() &&
26954 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26955 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26956 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26957 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26958 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26959 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26960 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26961 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26962 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26963 DCI.AddToWorklist(VPermMask.getNode());
26964 Res = DAG.getBitcast(MaskVT, V1);
26965 DCI.AddToWorklist(Res.getNode());
26966 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
26967 DCI.AddToWorklist(Res.getNode());
26968 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
26973 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
26974 // vector as the second source.
26975 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
26976 ((Subtarget.hasAVX512() &&
26977 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
26978 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
26979 (Subtarget.hasVLX() &&
26980 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
26981 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
26982 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
26983 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
26984 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
26985 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
26986 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
26987 for (unsigned i = 0; i != NumMaskElts; ++i)
26988 if (Mask[i] == SM_SentinelZero)
26989 Mask[i] = NumMaskElts + i;
26991 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
26992 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
26993 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
26994 DCI.AddToWorklist(VPermMask.getNode());
26995 Res = DAG.getBitcast(MaskVT, V1);
26996 DCI.AddToWorklist(Res.getNode());
26997 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
26998 DCI.AddToWorklist(Zero.getNode());
26999 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27000 DCI.AddToWorklist(Res.getNode());
27001 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27006 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27007 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27008 ((Subtarget.hasAVX512() &&
27009 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27010 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27011 (Subtarget.hasVLX() &&
27012 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27013 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27014 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27015 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27016 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27017 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27018 MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits);
27019 MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts);
27020 SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true);
27021 DCI.AddToWorklist(VPermMask.getNode());
27022 V1 = DAG.getBitcast(MaskVT, V1);
27023 DCI.AddToWorklist(V1.getNode());
27024 V2 = DAG.getBitcast(MaskVT, V2);
27025 DCI.AddToWorklist(V2.getNode());
27026 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27027 DCI.AddToWorklist(Res.getNode());
27028 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27035 // See if we can combine a single input shuffle with zeros to a bit-mask,
27036 // which is much simpler than any shuffle.
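// For example, a v4i32 mask of <0, SM_SentinelZero, 2, SM_SentinelZero> is
// just an AND with the constant <-1, 0, -1, 0>.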
27037 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27038 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27039 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27040 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27041 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27042 SmallBitVector UndefElts(NumMaskElts, false);
27043 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27044 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
27046 if (M == SM_SentinelUndef) {
27047 UndefElts[i] = true;
27050 if (M == SM_SentinelZero)
27052 EltBits[i] = AllOnes;
27054 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27055 DCI.AddToWorklist(BitMask.getNode());
27056 Res = DAG.getBitcast(MaskVT, V1);
27057 DCI.AddToWorklist(Res.getNode());
27058 unsigned AndOpcode =
27059 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27060 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27061 DCI.AddToWorklist(Res.getNode());
27062 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27067 // If we have a single input shuffle with different shuffle patterns in the
27068 // 128-bit lanes, use a variable-mask VPERMILPS.
27069 // TODO Combine other mask types at higher depths.
27070 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27071 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27072 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27073 SmallVector<SDValue, 16> VPermIdx;
27074 for (int M : Mask) {
SDValue Idx =
27076 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27077 VPermIdx.push_back(Idx);
27079 MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts);
27080 SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx);
27081 DCI.AddToWorklist(VPermMask.getNode());
27082 Res = DAG.getBitcast(MaskVT, V1);
27083 DCI.AddToWorklist(Res.getNode());
27084 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27085 DCI.AddToWorklist(Res.getNode());
27086 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27091 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27092 // to VPERMIL2PD/VPERMIL2PS.
27093 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27094 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27095 MaskVT == MVT::v8f32)) {
27096 // VPERMIL2 Operation.
27097 // Bits[3] - Match Bit.
27098 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27099 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
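// For example, with v4f32 the element at lane position 2 of the second source
// is encoded as selector 6 (2 + 1 * NumEltsPerLane); selectors for 64-bit
// elements are shifted left by one.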
27100 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27101 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27102 SmallVector<int, 8> VPerm2Idx;
27103 MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits());
27104 MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts);
27105 unsigned M2ZImm = 0;
27106 for (int M : Mask) {
27107 if (M == SM_SentinelUndef) {
27108 VPerm2Idx.push_back(-1);
27111 if (M == SM_SentinelZero) {
27113 VPerm2Idx.push_back(8);
27116 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27117 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27118 VPerm2Idx.push_back(Index);
27120 V1 = DAG.getBitcast(MaskVT, V1);
27121 DCI.AddToWorklist(V1.getNode());
27122 V2 = DAG.getBitcast(MaskVT, V2);
27123 DCI.AddToWorklist(V2.getNode());
27124 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true);
27125 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27126 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27127 DAG.getConstant(M2ZImm, DL, MVT::i8));
27128 DCI.AddToWorklist(Res.getNode());
27129 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27134 // If we have 3 or more shuffle instructions or a chain involving a variable
27135 // mask, we can replace them with a single PSHUFB instruction profitably.
27136 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27137 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27138 // more aggressive.
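// PSHUFB picks one source byte per destination byte; an index with the top
// bit set (255 here) writes a zero byte instead.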
27139 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27140 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27141 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27142 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27143 SmallVector<SDValue, 16> PSHUFBMask;
27144 int NumBytes = RootVT.getSizeInBits() / 8;
27145 int Ratio = NumBytes / NumMaskElts;
27146 for (int i = 0; i < NumBytes; ++i) {
27147 int M = Mask[i / Ratio];
27148 if (M == SM_SentinelUndef) {
27149 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27152 if (M == SM_SentinelZero) {
27153 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27156 M = Ratio * M + i % Ratio;
27157 assert ((M / 16) == (i / 16) && "Lane crossing detected");
27158 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27160 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27161 Res = DAG.getBitcast(ByteVT, V1);
27162 DCI.AddToWorklist(Res.getNode());
27163 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
27164 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
27165 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
27166 DCI.AddToWorklist(Res.getNode());
27167 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27172 // With XOP, if we have a 128-bit binary input shuffle we can always combine
27173 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
27174 // slower than PSHUFB on targets that support both.
27175 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
27176 Subtarget.hasXOP()) {
27177 // VPPERM Mask Operation
27178 // Bits[4:0] - Byte Index (0 - 31)
27179 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
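// For example, byte 3 of the second source is selected with index 19
// (16 + 3), while the constant 128 (operation 4 in bits[7:5]) produces a
// zero byte.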
27180 SmallVector<SDValue, 16> VPPERMMask;
27182 int Ratio = NumBytes / NumMaskElts;
27183 for (int i = 0; i < NumBytes; ++i) {
27184 int M = Mask[i / Ratio];
27185 if (M == SM_SentinelUndef) {
27186 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
27189 if (M == SM_SentinelZero) {
27190 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
27193 M = Ratio * M + i % Ratio;
27194 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27196 MVT ByteVT = MVT::v16i8;
27197 V1 = DAG.getBitcast(ByteVT, V1);
27198 DCI.AddToWorklist(V1.getNode());
27199 V2 = DAG.getBitcast(ByteVT, V2);
27200 DCI.AddToWorklist(V2.getNode());
27201 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
27202 DCI.AddToWorklist(VPPERMMaskOp.getNode());
27203 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
27204 DCI.AddToWorklist(Res.getNode());
27205 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27210 // Failed to find any combines.
27214 // Attempt to constant fold all of the constant source ops.
27215 // Returns true if the entire shuffle is folded to a constant.
27216 // TODO: Extend this to merge multiple constant Ops and update the mask.
27217 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
27218 ArrayRef<int> Mask, SDValue Root,
27219 bool HasVariableMask, SelectionDAG &DAG,
27220 TargetLowering::DAGCombinerInfo &DCI,
27221 const X86Subtarget &Subtarget) {
27222 MVT VT = Root.getSimpleValueType();
27224 unsigned SizeInBits = VT.getSizeInBits();
27225 unsigned NumMaskElts = Mask.size();
27226 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
27227 unsigned NumOps = Ops.size();
27229 // Extract constant bits from each source op.
27230 bool OneUseConstantOp = false;
27231 SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
27232 SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
27233 for (unsigned i = 0; i != NumOps; ++i) {
27234 SDValue SrcOp = Ops[i];
27235 OneUseConstantOp |= SrcOp.hasOneUse();
27236 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
27241 // Only fold if at least one of the constants is only used once or
27242 // the combined shuffle has included a variable mask shuffle, this
27243 // is to avoid constant pool bloat.
27244 if (!OneUseConstantOp && !HasVariableMask)
27247 // Shuffle the constant bits according to the mask.
27248 SmallBitVector UndefElts(NumMaskElts, false);
27249 SmallBitVector ZeroElts(NumMaskElts, false);
27250 SmallBitVector ConstantElts(NumMaskElts, false);
27251 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
27252 APInt::getNullValue(MaskSizeInBits));
27253 for (unsigned i = 0; i != NumMaskElts; ++i) {
int M = Mask[i];
27255 if (M == SM_SentinelUndef) {
27256 UndefElts[i] = true;
27258 } else if (M == SM_SentinelZero) {
27259 ZeroElts[i] = true;
27262 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
27264 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
27265 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
27267 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
27268 if (SrcUndefElts[SrcMaskIdx]) {
27269 UndefElts[i] = true;
27273 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
27274 APInt &Bits = SrcEltBits[SrcMaskIdx];
27276 ZeroElts[i] = true;
27280 ConstantElts[i] = true;
27281 ConstantBitData[i] = Bits;
27283 assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
27285 // Create the constant data.
27287 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
27288 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
27290 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
27292 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
27295 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
27296 DCI.AddToWorklist(CstOp.getNode());
27297 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
27301 /// \brief Fully generic combining of x86 shuffle instructions.
27303 /// This should be the last combine run over the x86 shuffle instructions. Once
27304 /// they have been fully optimized, this will recursively consider all chains
27305 /// of single-use shuffle instructions, build a generic model of the cumulative
27306 /// shuffle operation, and check for simpler instructions which implement this
27307 /// operation. We use this primarily for two purposes:
27309 /// 1) Collapse generic shuffles to specialized single instructions when
27310 /// equivalent. In most cases, this is just an encoding size win, but
27311 /// sometimes we will collapse multiple generic shuffles into a single
27312 /// special-purpose shuffle.
27313 /// 2) Look for sequences of shuffle instructions with 3 or more total
27314 /// instructions, and replace them with the slightly more expensive SSSE3
27315 /// PSHUFB instruction if available. We do this as the last combining step
27316 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
27317 /// a suitable short sequence of other instructions. The PSHUFB will either
27318 /// use a register or have to read from memory and so is slightly (but only
27319 /// slightly) more expensive than the other shuffle instructions.
27321 /// Because this is inherently a quadratic operation (for each shuffle in
27322 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
27323 /// This should never be an issue in practice as the shuffle lowering doesn't
27324 /// produce sequences of more than 8 instructions.
27326 /// FIXME: We will currently miss some cases where the redundant shuffling
27327 /// would simplify under the threshold for PSHUFB formation because of
27328 /// combine-ordering. To fix this, we should do the redundant instruction
27329 /// combining in this recursive walk.
27330 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
27331 int SrcOpIndex, SDValue Root,
27332 ArrayRef<int> RootMask,
27333 int Depth, bool HasVariableMask,
27335 TargetLowering::DAGCombinerInfo &DCI,
27336 const X86Subtarget &Subtarget) {
27337 // Bound the depth of our recursive combine because this is ultimately
27338 // quadratic in nature.
27342 // Directly rip through bitcasts to find the underlying operand.
27343 SDValue Op = SrcOps[SrcOpIndex];
27344 Op = peekThroughOneUseBitcasts(Op);
27346 MVT VT = Op.getSimpleValueType();
27347 if (!VT.isVector())
27348 return false; // Bail if we hit a non-vector.
27350 assert(Root.getSimpleValueType().isVector() &&
27351 "Shuffles operate on vector types!");
27352 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
27353 "Can only combine shuffles of the same vector register size.");
27355 // Extract target shuffle mask and resolve sentinels and inputs.
27356 SDValue Input0, Input1;
27357 SmallVector<int, 16> OpMask;
27358 if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
27361 // Add the inputs to the Ops list, avoiding duplicates.
27362 SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
27364 int InputIdx0 = -1, InputIdx1 = -1;
27365 for (int i = 0, e = Ops.size(); i < e; ++i) {
27366 SDValue BC = peekThroughBitcasts(Ops[i]);
27367 if (Input0 && BC == peekThroughBitcasts(Input0))
27369 if (Input1 && BC == peekThroughBitcasts(Input1))
27373 if (Input0 && InputIdx0 < 0) {
27374 InputIdx0 = SrcOpIndex;
27375 Ops[SrcOpIndex] = Input0;
27377 if (Input1 && InputIdx1 < 0) {
27378 InputIdx1 = Ops.size();
27379 Ops.push_back(Input1);
27382 assert(((RootMask.size() > OpMask.size() &&
27383 RootMask.size() % OpMask.size() == 0) ||
27384 (OpMask.size() > RootMask.size() &&
27385 OpMask.size() % RootMask.size() == 0) ||
27386 OpMask.size() == RootMask.size()) &&
27387 "The smaller number of elements must divide the larger.");
27388 int MaskWidth = std::max<int>(OpMask.size(), RootMask.size());
27389 int RootRatio = std::max<int>(1, OpMask.size() / RootMask.size());
27390 int OpRatio = std::max<int>(1, RootMask.size() / OpMask.size());
27391 assert(((RootRatio == 1 && OpRatio == 1) ||
27392 (RootRatio == 1) != (OpRatio == 1)) &&
27393 "Must not have a ratio for both incoming and op masks!");
27395 SmallVector<int, 16> Mask;
27396 Mask.reserve(MaskWidth);
27398 // Merge this shuffle operation's mask into our accumulated mask. Note that
27399 // this shuffle's mask will be the first applied to the input, followed by the
27400 // root mask to get us all the way to the root value arrangement. The reason
27401 // for this order is that we are recursing up the operation chain.
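// For example, if RootMask is <0, 2, 1, 3> and this op's mask is <2, 3, 0, 1>
// (both v4, ratio 1), the merged mask is <2, 0, 3, 1>: the op's shuffle is
// applied first and the root mask then indexes into its result.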
27402 for (int i = 0; i < MaskWidth; ++i) {
27403 int RootIdx = i / RootRatio;
27404 if (RootMask[RootIdx] < 0) {
27405 // This is a zero or undef lane, we're done.
27406 Mask.push_back(RootMask[RootIdx]);
27410 int RootMaskedIdx = RootMask[RootIdx] * RootRatio + i % RootRatio;
27412 // Just insert the scaled root mask value if it references an input other
27413 // than the SrcOp we're currently inserting.
27414 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
27415 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
27416 Mask.push_back(RootMaskedIdx);
27420 RootMaskedIdx %= MaskWidth;
27422 int OpIdx = RootMaskedIdx / OpRatio;
27423 if (OpMask[OpIdx] < 0) {
27424 // The incoming lanes are zero or undef, it doesn't matter which ones we
// are using.
27426 Mask.push_back(OpMask[OpIdx]);
27430 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
27431 int OpMaskedIdx = OpMask[OpIdx] * OpRatio + RootMaskedIdx % OpRatio;
27432 OpMaskedIdx %= MaskWidth;
27434 if (OpMask[OpIdx] < (int)OpMask.size()) {
27435 assert(0 <= InputIdx0 && "Unknown target shuffle input");
27436 OpMaskedIdx += InputIdx0 * MaskWidth;
27438 assert(0 <= InputIdx1 && "Unknown target shuffle input");
27439 OpMaskedIdx += InputIdx1 * MaskWidth;
27442 Mask.push_back(OpMaskedIdx);
27445 // Handle the all undef/zero cases early.
27446 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
27447 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
27450 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
27451 // TODO - should we handle the mixed zero/undef case as well? Just returning
27452 // a zero mask will lose information on undef elements possibly reducing
27453 // future combine possibilities.
27454 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
27455 Subtarget, DAG, SDLoc(Root)));
27459 // Remove unused shuffle source ops.
27460 SmallVector<SDValue, 8> UsedOps;
27461 for (int i = 0, e = Ops.size(); i < e; ++i) {
27462 int lo = UsedOps.size() * MaskWidth;
27463 int hi = lo + MaskWidth;
27464 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
27465 UsedOps.push_back(Ops[i]);
27468 for (int &M : Mask)
27472 assert(!UsedOps.empty() && "Shuffle with no inputs detected");
27475 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
27477 // See if we can recurse into each shuffle source op (if it's a target shuffle).
27478 for (int i = 0, e = Ops.size(); i < e; ++i)
27479 if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
27480 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
27481 HasVariableMask, DAG, DCI, Subtarget))
27484 // Attempt to constant fold all of the constant source ops.
27485 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
27489 // We can only combine unary and binary shuffle mask cases.
27490 if (Ops.size() > 2)
27493 // Minor canonicalization of the accumulated shuffle mask to make it easier
27494 // to match below. All this does is detect masks with sequential pairs of
27495 // elements, and shrink them to the half-width mask. It does this in a loop
27496 // so it will reduce the size of the mask to the minimal width mask which
27497 // performs an equivalent shuffle.
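// For example (illustrative): an accumulated v4i32 mask of <0,1,6,7> has
// sequential pairs (0,1) and (6,7), so it widens to the v2i64-style mask
// <0,3>; the loop then stops because <0,3> contains no such pairs.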
27498 SmallVector<int, 16> WidenedMask;
27499 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
27500 Mask = std::move(WidenedMask);
27503 // Canonicalization of binary shuffle masks to improve pattern matching by
27504 // commuting the inputs.
27505 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
27506 ShuffleVectorSDNode::commuteMask(Mask);
27507 std::swap(Ops[0], Ops[1]);
27510 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
27514 /// \brief Get the PSHUF-style mask from PSHUF node.
27516 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
27517 /// PSHUF-style masks that can be reused with such instructions.
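/// Illustrative example (added annotation, assuming the standard PSHUFD
/// immediate encoding): a PSHUFD node with immediate 0x1B yields the mask
/// <3,2,1,0>; for PSHUFHW only the four shuffled high words are kept and
/// rebased to 0..3, so the result always has exactly four entries.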
27518 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
27519 MVT VT = N.getSimpleValueType();
27520 SmallVector<int, 4> Mask;
27521 SmallVector<SDValue, 2> Ops;
27524 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
27528 // If we have more than 128-bits, only the low 128-bits of shuffle mask
27529 // matter. Check that the upper masks are repeats and remove them.
27530 if (VT.getSizeInBits() > 128) {
27531 int LaneElts = 128 / VT.getScalarSizeInBits();
27533 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
27534 for (int j = 0; j < LaneElts; ++j)
27535 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
27536 "Mask doesn't repeat in high 128-bit lanes!");
27538 Mask.resize(LaneElts);
27541 switch (N.getOpcode()) {
27542 case X86ISD::PSHUFD:
27543 return Mask;
27544 case X86ISD::PSHUFLW:
27545 Mask.resize(4);
27546 return Mask;
27547 case X86ISD::PSHUFHW:
27548 Mask.erase(Mask.begin(), Mask.begin() + 4);
27549 for (int &M : Mask)
27550 M -= 4;
27551 return Mask;
27552 default:
27553 llvm_unreachable("No valid shuffle instruction found!");
27557 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
27559 /// We walk up the chain and look for a combinable shuffle, skipping over
27560 /// shuffles that we could hoist this shuffle's transformation past without
27561 /// altering anything.
27563 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
27565 TargetLowering::DAGCombinerInfo &DCI) {
27566 assert(N.getOpcode() == X86ISD::PSHUFD &&
27567 "Called with something other than an x86 128-bit half shuffle!");
27570 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
27571 // of the shuffles in the chain so that we can form a fresh chain to replace
27573 SmallVector<SDValue, 8> Chain;
27574 SDValue V = N.getOperand(0);
27575 for (; V.hasOneUse(); V = V.getOperand(0)) {
27576 switch (V.getOpcode()) {
27578 return SDValue(); // Nothing combined!
27581 // Skip bitcasts as we always know the type for the target specific shuffles.
27585 case X86ISD::PSHUFD:
27586 // Found another dword shuffle.
27589 case X86ISD::PSHUFLW:
27590 // Check that the low words (being shuffled) are the identity in the
27591 // dword shuffle, and the high words are self-contained.
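// E.g. (illustrative): a dword mask of <0,1,3,2> qualifies because the low
// dwords are untouched and the high dwords only permute among themselves, so
// the PSHUFLW can be hoisted past it; a mask such as <0,2,1,3> moves data
// across the 64-bit halves and is rejected below.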
27592 if (Mask[0] != 0 || Mask[1] != 1 ||
27593 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
27596 Chain.push_back(V);
27599 case X86ISD::PSHUFHW:
27600 // Check that the high words (being shuffled) are the identity in the
27601 // dword shuffle, and the low words are self-contained.
27602 if (Mask[2] != 2 || Mask[3] != 3 ||
27603 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
27606 Chain.push_back(V);
27609 case X86ISD::UNPCKL:
27610 case X86ISD::UNPCKH:
27611 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
27612 // shuffle into a preceding word shuffle.
27613 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
27614 V.getSimpleValueType().getVectorElementType() != MVT::i16)
27617 // Search for a half-shuffle which we can combine with.
27618 unsigned CombineOp =
27619 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
27620 if (V.getOperand(0) != V.getOperand(1) ||
27621 !V->isOnlyUserOf(V.getOperand(0).getNode()))
27623 Chain.push_back(V);
27624 V = V.getOperand(0);
27626 switch (V.getOpcode()) {
27628 return SDValue(); // Nothing to combine.
27630 case X86ISD::PSHUFLW:
27631 case X86ISD::PSHUFHW:
27632 if (V.getOpcode() == CombineOp)
27635 Chain.push_back(V);
27639 V = V.getOperand(0);
27643 } while (V.hasOneUse());
27646 // Break out of the loop if we break out of the switch.
27650 if (!V.hasOneUse())
27651 // We fell out of the loop without finding a viable combining instruction.
27652 return SDValue();
27654 // Merge this node's mask and our incoming mask.
27655 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27656 for (int &M : Mask)
27657 M = VMask[M];
27658 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
27659 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27661 // Rebuild the chain around this new shuffle.
27662 while (!Chain.empty()) {
27663 SDValue W = Chain.pop_back_val();
27665 if (V.getValueType() != W.getOperand(0).getValueType())
27666 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
27668 switch (W.getOpcode()) {
27670 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
27672 case X86ISD::UNPCKL:
27673 case X86ISD::UNPCKH:
27674 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
27677 case X86ISD::PSHUFD:
27678 case X86ISD::PSHUFLW:
27679 case X86ISD::PSHUFHW:
27680 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
27684 if (V.getValueType() != N.getValueType())
27685 V = DAG.getBitcast(N.getValueType(), V);
27687 // Return the new chain to replace N.
27691 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
27694 /// We walk up the chain, skipping shuffles of the other half and looking
27695 /// through shuffles which switch halves trying to find a shuffle of the same
27696 /// pair of dwords.
27697 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
27699 TargetLowering::DAGCombinerInfo &DCI) {
27701 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
27702 "Called with something other than an x86 128-bit half shuffle!");
27704 unsigned CombineOpcode = N.getOpcode();
27706 // Walk up a single-use chain looking for a combinable shuffle.
27707 SDValue V = N.getOperand(0);
27708 for (; V.hasOneUse(); V = V.getOperand(0)) {
27709 switch (V.getOpcode()) {
27711 return false; // Nothing combined!
27714 // Skip bitcasts as we always know the type for the target specific shuffles.
27718 case X86ISD::PSHUFLW:
27719 case X86ISD::PSHUFHW:
27720 if (V.getOpcode() == CombineOpcode)
27723 // Other-half shuffles are no-ops.
27726 // Break out of the loop if we break out of the switch.
27730 if (!V.hasOneUse())
27731 // We fell out of the loop without finding a viable combining instruction.
27732 return false;
27734 // Combine away the bottom node as its shuffle will be accumulated into
27735 // a preceding shuffle.
27736 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27738 // Record the old value.
27739 SDValue Old = V;
27741 // Merge this node's mask and our incoming mask (adjusted to account for all
27742 // the pshufd instructions encountered).
27743 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
27744 for (int &M : Mask)
27745 M = VMask[M];
27746 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
27747 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
27749 // Check that the shuffles didn't cancel each other out. If not, we need to
27750 // combine to the new one.
27751 if (Old != V)
27752 // Replace the combinable shuffle with the combined one, updating all users
27753 // so that we re-evaluate the chain here.
27754 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
27759 /// \brief Try to combine x86 target specific shuffles.
27760 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
27761 TargetLowering::DAGCombinerInfo &DCI,
27762 const X86Subtarget &Subtarget) {
27764 MVT VT = N.getSimpleValueType();
27765 SmallVector<int, 4> Mask;
27767 unsigned Opcode = N.getOpcode();
27769 case X86ISD::PSHUFD:
27770 case X86ISD::PSHUFLW:
27771 case X86ISD::PSHUFHW:
27772 Mask = getPSHUFShuffleMask(N);
27773 assert(Mask.size() == 4);
27775 case X86ISD::UNPCKL: {
27776 auto Op0 = N.getOperand(0);
27777 auto Op1 = N.getOperand(1);
27778 unsigned Opcode0 = Op0.getOpcode();
27779 unsigned Opcode1 = Op1.getOpcode();
27781 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
27782 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
27783 // TODO: Add other horizontal operations as required.
27784 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
27785 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
27787 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
27788 // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
27789 // moves upper half elements into the lower half part. For example:
27791 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
27793 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
27795 // will be combined to:
27797 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
27799 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
27800 // happen due to advanced instructions.
27801 if (!VT.is128BitVector())
27804 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
27805 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
27807 unsigned NumElts = VT.getVectorNumElements();
27808 SmallVector<int, 8> ExpectedMask(NumElts, -1);
27809 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
27810 NumElts / 2);
27812 auto ShufOp = Op1.getOperand(0);
27813 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
27814 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
27818 case X86ISD::BLENDI: {
27819 SDValue V0 = N->getOperand(0);
27820 SDValue V1 = N->getOperand(1);
27821 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
27822 "Unexpected input vector types");
27824 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
27825 // operands and changing the mask to 1. This saves us a bunch of
27826 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
27827 // x86InstrInfo knows how to commute this back after instruction selection
27828 // if it would help register allocation.
27830 // TODO: If optimizing for size or a processor that doesn't suffer from
27831 // partial register update stalls, this should be transformed into a MOVSD
27832 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
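// Worked example (added annotation, using the BLENDPD convention that a set
// immediate bit selects from the second operand): BLENDI(V0, V1, 2) yields
// { V0[0], V1[1] }, and after the swap BLENDI(V1, V0, 1) yields the same
// { V0[0], V1[1] }, so the transform below is value-preserving.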
27834 if (VT == MVT::v2f64)
27835 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
27836 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
27837 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
27838 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
27843 case X86ISD::MOVSD:
27844 case X86ISD::MOVSS: {
27845 bool isFloat = VT.isFloatingPoint();
27846 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
27847 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
27848 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
27849 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
27850 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
27851 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
27852 assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
27854 // We often lower to MOVSD/MOVSS from integer as well as native float
27855 // types; remove unnecessary domain-crossing bitcasts if we can to make it
27856 // easier to combine shuffles later on. We've already accounted for the
27857 // domain switching cost when we decided to lower with it.
27858 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
27859 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
27860 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
27861 V0 = DAG.getBitcast(NewVT, V0);
27862 V1 = DAG.getBitcast(NewVT, V1);
27863 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
27868 case X86ISD::INSERTPS: {
27869 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
27870 SDValue Op0 = N.getOperand(0);
27871 SDValue Op1 = N.getOperand(1);
27872 SDValue Op2 = N.getOperand(2);
27873 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
27874 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
27875 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
27876 unsigned ZeroMask = InsertPSMask & 0xF;
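// Immediate layout reminder (added annotation): bits[7:6] select the source
// lane of Op1, bits[5:4] select the destination lane in Op0, and bits[3:0]
// zero the corresponding result lanes. E.g. 0x9C inserts Op1[2] into lane 1
// and zeroes lanes 2 and 3, leaving lane 0 taken from Op0.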
27878 // If we zero out all elements from Op0 then we don't need to reference it.
27879 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
27880 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
27881 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27883 // If we zero out the element from Op1 then we don't need to reference it.
27884 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
27885 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27886 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27888 // Attempt to merge insertps Op1 with an inner target shuffle node.
27889 SmallVector<int, 8> TargetMask1;
27890 SmallVector<SDValue, 2> Ops1;
27891 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
27892 int M = TargetMask1[SrcIdx];
27893 if (isUndefOrZero(M)) {
27894 // Zero/UNDEF insertion - zero out element and remove dependency.
27895 InsertPSMask |= (1u << DstIdx);
27896 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
27897 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27899 // Update insertps mask srcidx and reference the source input directly.
27900 assert(0 <= M && M < 8 && "Shuffle index out of range");
27901 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
27902 Op1 = Ops1[M < 4 ? 0 : 1];
27903 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27904 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27907 // Attempt to merge insertps Op0 with an inner target shuffle node.
27908 SmallVector<int, 8> TargetMask0;
27909 SmallVector<SDValue, 2> Ops0;
27910 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
27913 bool Updated = false;
27914 bool UseInput00 = false;
27915 bool UseInput01 = false;
27916 for (int i = 0; i != 4; ++i) {
27917 int M = TargetMask0[i];
27918 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
27919 // No change if element is already zero or the inserted element.
27920 continue;
27921 } else if (isUndefOrZero(M)) {
27922 // If the target mask is undef/zero then we must zero the element.
27923 InsertPSMask |= (1u << i);
27924 Updated = true;
27925 continue;
27926 }
27928 // The input vector element must be inline.
27929 if (M != i && M != (i + 4))
27930 return SDValue();
27932 // Determine which inputs of the target shuffle we're using.
27933 UseInput00 |= (0 <= M && M < 4);
27934 UseInput01 |= (4 <= M);
27937 // If we're not using both inputs of the target shuffle then use the
27938 // referenced input directly.
27939 if (UseInput00 && !UseInput01) {
27940 Updated = true;
27941 Op0 = Ops0[0];
27942 } else if (!UseInput00 && UseInput01) {
27943 Updated = true;
27944 Op0 = Ops0[1];
27945 }
27947 if (Updated)
27948 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
27949 DAG.getConstant(InsertPSMask, DL, MVT::i8));
27957 // Nuke no-op shuffles that show up after combining.
27958 if (isNoopShuffleMask(Mask))
27959 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
27961 // Look for simplifications involving one or two shuffle instructions.
27962 SDValue V = N.getOperand(0);
27963 switch (N.getOpcode()) {
27966 case X86ISD::PSHUFLW:
27967 case X86ISD::PSHUFHW:
27968 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
27970 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
27971 return SDValue(); // We combined away this shuffle, so we're done.
27973 // See if this reduces to a PSHUFD which is no more expensive and can
27974 // combine with more operations. Note that it has to at least flip the
27975 // dwords as otherwise it would have been removed as a no-op.
27976 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
27977 int DMask[] = {0, 1, 2, 3};
27978 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
27979 DMask[DOffset + 0] = DOffset + 1;
27980 DMask[DOffset + 1] = DOffset + 0;
27981 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
27982 V = DAG.getBitcast(DVT, V);
27983 DCI.AddToWorklist(V.getNode());
27984 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
27985 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
27986 DCI.AddToWorklist(V.getNode());
27987 return DAG.getBitcast(VT, V);
27990 // Look for shuffle patterns which can be implemented as a single unpack.
27991 // FIXME: This doesn't handle the location of the PSHUFD generically, and
27992 // only works when we have a PSHUFD followed by two half-shuffles.
27993 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
27994 (V.getOpcode() == X86ISD::PSHUFLW ||
27995 V.getOpcode() == X86ISD::PSHUFHW) &&
27996 V.getOpcode() != N.getOpcode() &&
27998 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
27999 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28000 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28001 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28002 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28003 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28004 int WordMask[8];
28005 for (int i = 0; i < 4; ++i) {
28006 WordMask[i + NOffset] = Mask[i] + NOffset;
28007 WordMask[i + VOffset] = VMask[i] + VOffset;
28009 // Map the word mask through the DWord mask.
28010 int MappedMask[8];
28011 for (int i = 0; i < 8; ++i)
28012 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28013 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28014 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28015 // We can replace all three shuffles with an unpack.
28016 V = DAG.getBitcast(VT, D.getOperand(0));
28017 DCI.AddToWorklist(V.getNode());
28018 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28027 case X86ISD::PSHUFD:
28028 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
28037 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28038 /// operation. If true is returned then the operands of ADDSUB operation
28039 /// are written to the parameters \p Opnd0 and \p Opnd1.
28041 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28042 /// so it is easier to generically match. We also insert dummy vector shuffle
28043 /// nodes for the operands which explicitly discard the lanes which are unused
28044 /// by this operation to try to flow through the rest of the combiner
28045 /// the fact that they're unused.
28046 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28047 SDValue &Opnd0, SDValue &Opnd1) {
28049 EVT VT = N->getValueType(0);
28050 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28051 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28052 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28055 // We only handle target-independent shuffles.
28056 // FIXME: It would be easy and harmless to use the target shuffle mask
28057 // extraction tool to support more.
28058 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28061 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28062 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28064 SDValue V1 = N->getOperand(0);
28065 SDValue V2 = N->getOperand(1);
28067 // We require the first shuffle operand to be the FSUB node, and the second to
28068 // be the FADD node.
28069 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28070 ShuffleVectorSDNode::commuteMask(Mask);
28072 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28075 // If there are other uses of these operations we can't fold them.
28076 if (!V1->hasOneUse() || !V2->hasOneUse())
28079 // Ensure that both operations have the same operands. Note that we can
28080 // commute the FADD operands.
28081 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28082 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28083 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28086 // We're looking for blends between FADD and FSUB nodes. We insist on these
28087 // nodes being lined up in a specific expected pattern.
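// Illustrative case: for v4f32 with V1 = FSUB(A,B) and V2 = FADD(A,B), the
// mask <0,5,2,7> picks the subtract result in the even lanes and the add
// result in the odd lanes, producing {A0-B0, A1+B1, A2-B2, A3+B3}, which is
// exactly what ADDSUBPS computes.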
28088 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28089 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28090 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28091 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28092 8, 25, 10, 27, 12, 29, 14, 31})))
28100 /// \brief Try to combine a shuffle into a target-specific add-sub or
28101 /// mul-add-sub node.
28102 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28103 const X86Subtarget &Subtarget,
28104 SelectionDAG &DAG) {
28105 SDValue Opnd0, Opnd1;
28106 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28109 EVT VT = N->getValueType(0);
28112 // Try to generate X86ISD::FMADDSUB node here.
28113 SDValue Opnd2;
28114 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28115 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28117 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28118 // the ADDSUB idiom has been successfully recognized. There are no known
28119 // X86 targets with 512-bit ADDSUB instructions!
28120 if (VT.is512BitVector())
28123 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28126 // We are looking for a shuffle where both sources are concatenated with undef
28127 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28128 // if we can express this as a single-source shuffle, that's preferable.
28129 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28130 const X86Subtarget &Subtarget) {
28131 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28134 EVT VT = N->getValueType(0);
28136 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28137 if (!VT.is128BitVector() && !VT.is256BitVector())
28140 if (VT.getVectorElementType() != MVT::i32 &&
28141 VT.getVectorElementType() != MVT::i64 &&
28142 VT.getVectorElementType() != MVT::f32 &&
28143 VT.getVectorElementType() != MVT::f64)
28146 SDValue N0 = N->getOperand(0);
28147 SDValue N1 = N->getOperand(1);
28149 // Check that both sources are concats with undef.
28150 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
28151 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
28152 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
28153 !N1.getOperand(1).isUndef())
28156 // Construct the new shuffle mask. Elements from the first source retain their
28157 // index, but elements from the second source no longer need to skip an undef.
28158 SmallVector<int, 8> Mask;
28159 int NumElts = VT.getVectorNumElements();
28161 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28162 for (int Elt : SVOp->getMask())
28163 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
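// Example (added annotation): with a v4i32 output, N0 = concat(t1, undef) and
// N1 = concat(t2, undef), old indices 4 and 5 refer to t2's lanes; rebased by
// NumElts/2 they become 2 and 3, which address t2 inside the single
// concat(t1, t2) source built below.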
28166 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
28168 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
28171 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
28172 TargetLowering::DAGCombinerInfo &DCI,
28173 const X86Subtarget &Subtarget) {
28174 SDLoc dl(N);
28175 EVT VT = N->getValueType(0);
28177 // Don't create instructions with illegal types after legalize types has run.
28178 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28179 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
28182 // If we have legalized the vector types, look for blends of FADD and FSUB
28183 // nodes that we can fuse into an ADDSUB node.
28184 if (TLI.isTypeLegal(VT))
28185 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
28188 // During Type Legalization, when promoting illegal vector types,
28189 // the backend might introduce new shuffle dag nodes and bitcasts.
28191 // This code performs the following transformation:
28192 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
28193 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
28195 // We do this only if both the bitcast and the BINOP dag nodes have
28196 // one use. Also, perform this transformation only if the new binary
28197 // operation is legal. This is to avoid introducing dag nodes that
28198 // potentially need to be further expanded (or custom lowered) into a
28199 // less optimal sequence of dag nodes.
28200 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
28201 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
28202 N->getOperand(0).getOpcode() == ISD::BITCAST &&
28203 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
28204 SDValue N0 = N->getOperand(0);
28205 SDValue N1 = N->getOperand(1);
28207 SDValue BC0 = N0.getOperand(0);
28208 EVT SVT = BC0.getValueType();
28209 unsigned Opcode = BC0.getOpcode();
28210 unsigned NumElts = VT.getVectorNumElements();
28212 if (BC0.hasOneUse() && SVT.isVector() &&
28213 SVT.getVectorNumElements() * 2 == NumElts &&
28214 TLI.isOperationLegal(Opcode, VT)) {
28215 bool CanFold = false;
28221 // isOperationLegal lies for integer ops on floating point types.
28222 CanFold = VT.isInteger();
28227 // isOperationLegal lies for floating point ops on integer types.
28228 CanFold = VT.isFloatingPoint();
28232 unsigned SVTNumElts = SVT.getVectorNumElements();
28233 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
28234 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
28235 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
28236 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
28237 CanFold = SVOp->getMaskElt(i) < 0;
28240 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
28241 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
28242 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
28243 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
28248 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
28249 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
28250 // consecutive, non-overlapping, and in the right order.
28251 SmallVector<SDValue, 16> Elts;
28252 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
28253 Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
28255 if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
28258 // For AVX2, we sometimes want to combine
28259 // (vector_shuffle <mask> (concat_vectors t1, undef)
28260 // (concat_vectors t2, undef))
28262 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
28263 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
28264 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
28267 if (isTargetShuffle(N->getOpcode())) {
28269 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
28272 // Try recursively combining arbitrary sequences of x86 shuffle
28273 // instructions into higher-order shuffles. We do this after combining
28274 // specific PSHUF instruction sequences into their minimal form so that we
28275 // can evaluate how many specialized shuffle instructions are involved in
28276 // a particular chain.
28277 SmallVector<int, 1> NonceMask; // Just a placeholder.
28278 NonceMask.push_back(0);
28279 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
28280 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
28282 return SDValue(); // This routine will use CombineTo to replace N.
28288 /// Check if a vector extract from a target-specific shuffle of a load can be
28289 /// folded into a single element load.
28290 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
28291 /// shuffles have been custom lowered so we need to handle those here.
28292 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
28293 TargetLowering::DAGCombinerInfo &DCI) {
28294 if (DCI.isBeforeLegalizeOps())
28297 SDValue InVec = N->getOperand(0);
28298 SDValue EltNo = N->getOperand(1);
28299 EVT EltVT = N->getValueType(0);
28301 if (!isa<ConstantSDNode>(EltNo))
28304 EVT OriginalVT = InVec.getValueType();
28306 if (InVec.getOpcode() == ISD::BITCAST) {
28307 // Don't duplicate a load with other uses.
28308 if (!InVec.hasOneUse())
28310 EVT BCVT = InVec.getOperand(0).getValueType();
28311 if (!BCVT.isVector() ||
28312 BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
28314 InVec = InVec.getOperand(0);
28317 EVT CurrentVT = InVec.getValueType();
28319 if (!isTargetShuffle(InVec.getOpcode()))
28322 // Don't duplicate a load with other uses.
28323 if (!InVec.hasOneUse())
28326 SmallVector<int, 16> ShuffleMask;
28327 SmallVector<SDValue, 2> ShuffleOps;
28329 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
28330 ShuffleOps, ShuffleMask, UnaryShuffle))
28333 // Select the input vector, guarding against out of range extract vector.
28334 unsigned NumElems = CurrentVT.getVectorNumElements();
28335 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
28336 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
28338 if (Idx == SM_SentinelZero)
28339 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
28340 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
28341 if (Idx == SM_SentinelUndef)
28342 return DAG.getUNDEF(EltVT);
28344 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
28345 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
28348 // If inputs to shuffle are the same for both ops, then allow 2 uses
28349 unsigned AllowedUses =
28350 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
28352 if (LdNode.getOpcode() == ISD::BITCAST) {
28353 // Don't duplicate a load with other uses.
28354 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
28357 AllowedUses = 1; // only allow 1 load use if we have a bitcast
28358 LdNode = LdNode.getOperand(0);
28361 if (!ISD::isNormalLoad(LdNode.getNode()))
28364 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
28366 if (!LN0 ||!LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
28369 // If there's a bitcast before the shuffle, check if the load type and
28370 // alignment is valid.
28371 unsigned Align = LN0->getAlignment();
28372 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28373 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
28374 EltVT.getTypeForEVT(*DAG.getContext()));
28376 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
28379 // All checks match so transform back to vector_shuffle so that DAG combiner
28380 // can finish the job
28383 // Create shuffle node taking into account the case that its a unary shuffle
28384 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
28385 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
28387 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
28388 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
28392 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
28393 const X86Subtarget &Subtarget) {
28394 SDValue N0 = N->getOperand(0);
28395 EVT VT = N->getValueType(0);
28397 // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
28398 // special and don't usually play with other vector types, it's better to
28399 // handle them early to be sure we emit efficient code by avoiding
28400 // store-load conversions.
28401 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
28402 N0.getValueType() == MVT::v2i32 &&
28403 isNullConstant(N0.getOperand(1))) {
28404 SDValue N00 = N0->getOperand(0);
28405 if (N00.getValueType() == MVT::i32)
28406 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
28409 // Convert a bitcasted integer logic operation that has one bitcasted
28410 // floating-point operand into a floating-point logic operation. This may
28411 // create a load of a constant, but that is cheaper than materializing the
28412 // constant in an integer register and transferring it to an SSE register or
28413 // transferring the SSE operand to integer register and back.
28415 switch (N0.getOpcode()) {
28416 case ISD::AND: FPOpcode = X86ISD::FAND; break;
28417 case ISD::OR: FPOpcode = X86ISD::FOR; break;
28418 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
28419 default: return SDValue();
28422 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
28423 (Subtarget.hasSSE2() && VT == MVT::f64)))
28426 SDValue LogicOp0 = N0.getOperand(0);
28427 SDValue LogicOp1 = N0.getOperand(1);
28430 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
28431 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
28432 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
28433 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
28434 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
28435 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
28437 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
28438 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
28439 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
28440 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
28441 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
28442 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
28448 // Match a binop + shuffle pyramid that represents a horizontal reduction over
28449 // the elements of a vector.
28450 // Returns the vector that is being reduced on, or SDValue() if a reduction
28451 // was not matched.
28452 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
28453 // The pattern must end in an extract from index 0.
28454 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
28455 !isNullConstant(Extract->getOperand(1)))
28459 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
28461 SDValue Op = Extract->getOperand(0);
28462 // At each stage, we're looking for something that looks like:
28463 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
28464 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
28465 // i32 undef, i32 undef, i32 undef, i32 undef>
28466 // %a = binop <8 x i32> %op, %s
28467 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
28468 // we expect something like:
28469 // <4,5,6,7,u,u,u,u>
28470 // <2,3,u,u,u,u,u,u>
28471 // <1,u,u,u,u,u,u,u>
28472 for (unsigned i = 0; i < Stages; ++i) {
28473 if (Op.getOpcode() != BinOp)
28476 ShuffleVectorSDNode *Shuffle =
28477 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
28479 Op = Op.getOperand(1);
28481 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
28482 Op = Op.getOperand(0);
28485 // The first operand of the shuffle should be the same as the other operand
28486 // of the binop.
28487 if (!Shuffle || (Shuffle->getOperand(0) != Op))
28490 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
28491 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
28492 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
28499 // Given a select, detect the following pattern:
28500 // 1: %2 = zext <N x i8> %0 to <N x i32>
28501 // 2: %3 = zext <N x i8> %1 to <N x i32>
28502 // 3: %4 = sub nsw <N x i32> %2, %3
28503 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
28504 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
28505 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
28506 // This is useful as it is the input into a SAD pattern.
28507 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
28509 // Check the condition of the select instruction is greater-than.
28510 SDValue SetCC = Select->getOperand(0);
28511 if (SetCC.getOpcode() != ISD::SETCC)
28513 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
28514 if (CC != ISD::SETGT)
28517 SDValue SelectOp1 = Select->getOperand(1);
28518 SDValue SelectOp2 = Select->getOperand(2);
28520 // The second operand of the select should be the negation of the first
28521 // operand, which is implemented as 0 - SelectOp1.
28522 if (!(SelectOp2.getOpcode() == ISD::SUB &&
28523 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
28524 SelectOp2.getOperand(1) == SelectOp1))
28527 // The first operand of SetCC is the first operand of the select, which is the
28528 // difference between the two input vectors.
28529 if (SetCC.getOperand(0) != SelectOp1)
28532 // The second operand of the comparison can be either -1 or 0.
28533 if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
28534 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
28537 // The first operand of the select is the difference between the two input
28539 if (SelectOp1.getOpcode() != ISD::SUB)
28542 Op0 = SelectOp1.getOperand(0);
28543 Op1 = SelectOp1.getOperand(1);
28545 // Check if the operands of the sub are zero-extended from vectors of i8.
28546 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
28547 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
28548 Op1.getOpcode() != ISD::ZERO_EXTEND ||
28549 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
28555 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
28557 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
28558 const SDValue &Zext1, const SDLoc &DL) {
28560 // Find the appropriate width for the PSADBW.
28561 EVT InVT = Zext0.getOperand(0).getValueType();
28562 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
28564 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
28565 // fill in the missing vector elements with 0.
28566 unsigned NumConcat = RegSize / InVT.getSizeInBits();
28567 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
28568 Ops[0] = Zext0.getOperand(0);
28569 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
28570 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28571 Ops[0] = Zext1.getOperand(0);
28572 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
28574 // Actually build the SAD
28575 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
28576 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
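// Note (added annotation): PSADBW sums |a[k] - b[k]| over each group of eight
// bytes and writes the 16-bit result into the low bits of the corresponding
// 64-bit lane, so a 128-bit input yields a v2i64 holding two partial sums
// that the caller still has to reduce.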
28579 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
28580 const X86Subtarget &Subtarget) {
28581 // PSADBW is only supported on SSE2 and up.
28582 if (!Subtarget.hasSSE2())
28585 // Verify the type we're extracting from is appropriate
28586 // TODO: There's nothing special about i32, any integer type above i16 should
28587 // work just as well.
28588 EVT VT = Extract->getOperand(0).getValueType();
28589 if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
28592 unsigned RegSize = 128;
28593 if (Subtarget.hasBWI())
28595 else if (Subtarget.hasAVX2())
28598 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
28599 // TODO: We should be able to handle larger vectors by splitting them before
28600 // feeding them into several SADs, and then reducing over those.
28601 if (VT.getSizeInBits() / 4 > RegSize)
28604 // Match shuffle + add pyramid.
28605 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
28607 // If there was a match, we want Root to be a select that is the root of an
28608 // abs-diff pattern.
28609 if (!Root || (Root.getOpcode() != ISD::VSELECT))
28612 // Check whether we have an abs-diff pattern feeding into the select.
28613 SDValue Zext0, Zext1;
28614 if (!detectZextAbsDiff(Root, Zext0, Zext1))
28617 // Create the SAD instruction
28619 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
28621 // If the original vector was wider than 8 elements, sum over the results
28622 // in the SAD vector.
28623 unsigned Stages = Log2_32(VT.getVectorNumElements());
28624 MVT SadVT = SAD.getSimpleValueType();
28626 unsigned SadElems = SadVT.getVectorNumElements();
28628 for(unsigned i = Stages - 3; i > 0; --i) {
28629 SmallVector<int, 16> Mask(SadElems, -1);
28630 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
28631 Mask[j] = MaskEnd + j;
28634 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
28635 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
28639 // Return the lowest i32.
28640 MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
28641 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
28642 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
28643 Extract->getOperand(1));
28646 /// Detect vector gather/scatter index generation and convert it from being a
28647 /// bunch of shuffles and extracts into a somewhat faster sequence.
28648 /// For i686, the best sequence is apparently storing the value and loading
28649 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
28650 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
28651 TargetLowering::DAGCombinerInfo &DCI,
28652 const X86Subtarget &Subtarget) {
28653 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
28656 SDValue InputVector = N->getOperand(0);
28657 SDLoc dl(InputVector);
28658 // Detect mmx to i32 conversion through a v2i32 elt extract.
28659 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
28660 N->getValueType(0) == MVT::i32 &&
28661 InputVector.getValueType() == MVT::v2i32 &&
28662 isa<ConstantSDNode>(N->getOperand(1)) &&
28663 N->getConstantOperandVal(1) == 0) {
28664 SDValue MMXSrc = InputVector.getOperand(0);
28666 // The bitcast source is a direct mmx result.
28667 if (MMXSrc.getValueType() == MVT::x86mmx)
28668 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
28671 EVT VT = N->getValueType(0);
28673 if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
28674 InputVector.getOpcode() == ISD::BITCAST &&
28675 isa<ConstantSDNode>(InputVector.getOperand(0))) {
28676 uint64_t ExtractedElt =
28677 cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
28678 uint64_t InputValue =
28679 cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
28680 uint64_t Res = (InputValue >> ExtractedElt) & 1;
28681 return DAG.getConstant(Res, dl, MVT::i1);
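// Tiny example (added annotation): for InputValue 0b1010 and ExtractedElt 3
// this folds to (0b1010 >> 3) & 1 = 1 at compile time.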
28684 // Check whether this extract is the root of a sum of absolute differences
28685 // pattern. This has to be done here because we really want it to happen
28686 // pre-legalization.
28687 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
28690 // Only operate on vectors of 4 elements, where the alternative shuffling
28691 // gets to be more expensive.
28692 if (InputVector.getValueType() != MVT::v4i32)
28695 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
28696 // single use which is a sign-extend or zero-extend, and all elements are
28697 // used.
28698 SmallVector<SDNode *, 4> Uses;
28699 unsigned ExtractedElements = 0;
28700 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
28701 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
28702 if (UI.getUse().getResNo() != InputVector.getResNo())
28705 SDNode *Extract = *UI;
28706 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
28709 if (Extract->getValueType(0) != MVT::i32)
28711 if (!Extract->hasOneUse())
28713 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
28714 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
28716 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
28719 // Record which element was extracted.
28720 ExtractedElements |=
28721 1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
28723 Uses.push_back(Extract);
28726 // If not all the elements were used, this may not be worthwhile.
28727 if (ExtractedElements != 15)
28730 // Ok, we've now decided to do the transformation.
28731 // If 64-bit shifts are legal, use the extract-shift sequence,
28732 // otherwise bounce the vector off the cache.
28733 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28734 SDValue Vals[4];
28736 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
28737 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
28738 auto &DL = DAG.getDataLayout();
28739 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
28740 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28741 DAG.getConstant(0, dl, VecIdxTy));
28742 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
28743 DAG.getConstant(1, dl, VecIdxTy));
28745 SDValue ShAmt = DAG.getConstant(
28746 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
28747 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
28748 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28749 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
28750 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
28751 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
28752 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
28754 // Store the value to a temporary stack slot.
28755 SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
28756 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
28757 MachinePointerInfo());
28759 EVT ElementType = InputVector.getValueType().getVectorElementType();
28760 unsigned EltSize = ElementType.getSizeInBits() / 8;
28762 // Replace each use (extract) with a load of the appropriate element.
28763 for (unsigned i = 0; i < 4; ++i) {
28764 uint64_t Offset = EltSize * i;
28765 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
28766 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
28768 SDValue ScalarAddr =
28769 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
28771 // Load the scalar.
28772 Vals[i] =
28773 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
28777 // Replace the extracts
28778 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
28779 UE = Uses.end(); UI != UE; ++UI) {
28780 SDNode *Extract = *UI;
28782 SDValue Idx = Extract->getOperand(1);
28783 uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
28784 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
28787 // The replacement was made in place; don't return anything.
28791 /// If a vector select has an operand that is -1 or 0, simplify the select to a
28792 /// bitwise logic operation.
28793 static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
28794 const X86Subtarget &Subtarget) {
28795 SDValue Cond = N->getOperand(0);
28796 SDValue LHS = N->getOperand(1);
28797 SDValue RHS = N->getOperand(2);
28798 EVT VT = LHS.getValueType();
28799 EVT CondVT = Cond.getValueType();
28801 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
28803 if (N->getOpcode() != ISD::VSELECT)
28806 assert(CondVT.isVector() && "Vector select expects a vector selector!");
28808 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28809 // Check if the first operand is all zeros and Cond type is vXi1.
28810 // This situation only applies to avx512.
28811 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
28812 CondVT.getVectorElementType() == MVT::i1) {
28813 //Invert the cond to not(cond) : xor(op,allones)=not(op)
28814 SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28815 DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
28817 //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
28818 return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
28821 // To use the condition operand as a bitwise mask, it must have elements that
28822 // are the same size as the select elements. Ie, the condition operand must
28823 // have already been promoted from the IR select condition type <N x i1>.
28824 // Don't check if the types themselves are equal because that excludes
28825 // vector floating-point selects.
28826 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
28829 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
28830 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
28832 // Try to invert the condition if true value is not all 1s and false value is
28834 if (!TValIsAllOnes && !FValIsAllZeros &&
28835 // Check if the selector will be produced by CMPP*/PCMP*.
28836 Cond.getOpcode() == ISD::SETCC &&
28837 // Check if SETCC has already been promoted.
28838 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
28840 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
28841 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
28843 if (TValIsAllZeros || FValIsAllOnes) {
28844 SDValue CC = Cond.getOperand(2);
28845 ISD::CondCode NewCC =
28846 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
28847 Cond.getOperand(0).getValueType().isInteger());
28848 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
28850 std::swap(LHS, RHS);
28851 TValIsAllOnes = FValIsAllOnes;
28852 FValIsAllZeros = TValIsAllZeros;
28856 if (!TValIsAllOnes && !FValIsAllZeros)
28860 if (TValIsAllOnes && FValIsAllZeros)
28862 else if (TValIsAllOnes)
28863 Ret = DAG.getNode(ISD::OR, DL, CondVT, Cond, DAG.getBitcast(CondVT, RHS));
28864 else if (FValIsAllZeros)
28865 Ret = DAG.getNode(ISD::AND, DL, CondVT, Cond, DAG.getBitcast(CondVT, LHS));
28867 return DAG.getBitcast(VT, Ret);
28870 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
28871 SDValue Cond = N->getOperand(0);
28872 SDValue LHS = N->getOperand(1);
28873 SDValue RHS = N->getOperand(2);
28876 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
28877 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
28878 if (!TrueC || !FalseC)
28881 // Don't do this for crazy integer types.
28882 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
28885 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
28886 // so that TrueC (the true value) is larger than FalseC.
28887 bool NeedsCondInvert = false;
28888 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
28889 // Efficiently invertible.
28890 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
28891 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
28892 isa<ConstantSDNode>(Cond.getOperand(1))))) {
28893 NeedsCondInvert = true;
28894 std::swap(TrueC, FalseC);
28897 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
28898 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
28899 if (NeedsCondInvert) // Invert the condition if needed.
28900 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28901 DAG.getConstant(1, DL, Cond.getValueType()));
28903 // Zero extend the condition if needed.
28904 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
28906 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
28907 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
28908 DAG.getConstant(ShAmt, DL, MVT::i8));
28911 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst.
28912 if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
28913 if (NeedsCondInvert) // Invert the condition if needed.
28914 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28915 DAG.getConstant(1, DL, Cond.getValueType()));
28917 // Zero extend the condition if needed.
28918 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28919 return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28920 SDValue(FalseC, 0));
28923 // Optimize cases that will turn into an LEA instruction. This requires
28924 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
28925 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
28926 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
28927 if (N->getValueType(0) == MVT::i32)
28928 Diff = (unsigned)Diff;
28930 bool isFastMultiplier = false;
28932 switch ((unsigned char)Diff) {
28935 case 1: // result = add base, cond
28936 case 2: // result = lea base( , cond*2)
28937 case 3: // result = lea base(cond, cond*2)
28938 case 4: // result = lea base( , cond*4)
28939 case 5: // result = lea base(cond, cond*4)
28940 case 8: // result = lea base( , cond*8)
28941 case 9: // result = lea base(cond, cond*8)
28942 isFastMultiplier = true;
28947 if (isFastMultiplier) {
28948 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
28949 if (NeedsCondInvert) // Invert the condition if needed.
28950 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
28951 DAG.getConstant(1, DL, Cond.getValueType()));
28953 // Zero extend the condition if needed.
28954 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
28955 // Scale the condition by the difference.
28957 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
28958 DAG.getConstant(Diff, DL, Cond.getValueType()));
28960 // Add the base if non-zero.
28961 if (FalseC->getAPIntValue() != 0)
28962 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
28963 SDValue(FalseC, 0));
28971 // If this is a bitcasted op that can be represented as another type, push
28972 // the bitcast to the inputs. This allows more opportunities for pattern
28973 // matching masked instructions. This is called when we know that the operation
28974 // is used as one of the inputs of a vselect.
28975 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
28976 TargetLowering::DAGCombinerInfo &DCI) {
28977 // Make sure we have a bitcast.
28978 if (OrigOp.getOpcode() != ISD::BITCAST)
28981 SDValue Op = OrigOp.getOperand(0);
28983 // If the operation is used by anything other than the bitcast, we shouldn't
28984 // do this combine as that would replicate the operation.
28985 if (!Op.hasOneUse())
28988 MVT VT = OrigOp.getSimpleValueType();
28989 MVT EltVT = VT.getVectorElementType();
28990 SDLoc DL(Op.getNode());
28992 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
28994 Op0 = DAG.getBitcast(VT, Op0);
28995 DCI.AddToWorklist(Op0.getNode());
28996 Op1 = DAG.getBitcast(VT, Op1);
28997 DCI.AddToWorklist(Op1.getNode());
28998 DCI.CombineTo(OrigOp.getNode(),
28999 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
29003 unsigned Opcode = Op.getOpcode();
29005 case X86ISD::PALIGNR:
29006 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
29007 if (!VT.is128BitVector())
29009 Opcode = X86ISD::VALIGN;
29011 case X86ISD::VALIGN: {
29012 if (EltVT != MVT::i32 && EltVT != MVT::i64)
29014 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29015 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29016 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
29017 unsigned EltSize = EltVT.getSizeInBits();
29018 // Make sure we can represent the same shift with the new VT.
29019 if ((ShiftAmt % EltSize) != 0)
29021 Imm = ShiftAmt / EltSize;
29022 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29023 DAG.getConstant(Imm, DL, MVT::i8));
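// E.g. (added annotation): PALIGNR with immediate 8 on a 128-bit vector
// shifts by 64 bits, so with i64 elements it becomes VALIGNQ with immediate 1
// (or VALIGND with immediate 2 for i32 elements); a byte shift that is not a
// multiple of the new element size bails out above.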
29025 case X86ISD::SHUF128: {
29026 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
29028 // Only change element size, not type.
29029 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
29031 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
29034 case ISD::INSERT_SUBVECTOR: {
29035 unsigned EltSize = EltVT.getSizeInBits();
29036 if (EltSize != 32 && EltSize != 64)
29038 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29039 // Only change element size, not type.
29040 if (VT.isInteger() != OpEltVT.isInteger())
29042 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
29043 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29044 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
29045 DCI.AddToWorklist(Op0.getNode());
29046 // Op1 needs to be bitcasted to a smaller vector with the same element type.
29047 SDValue Op1 = Op.getOperand(1);
29048 MVT Op1VT = MVT::getVectorVT(EltVT,
29049 Op1.getSimpleValueType().getSizeInBits() / EltSize);
29050 Op1 = DAG.getBitcast(Op1VT, Op1);
29051 DCI.AddToWorklist(Op1.getNode());
29052 DCI.CombineTo(OrigOp.getNode(),
29053 DAG.getNode(Opcode, DL, VT, Op0, Op1,
29054 DAG.getConstant(Imm, DL, MVT::i8)));
29057 case ISD::EXTRACT_SUBVECTOR: {
29058 unsigned EltSize = EltVT.getSizeInBits();
29059 if (EltSize != 32 && EltSize != 64)
29061 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
29062 // Only change element size, not type.
29063 if (VT.isInteger() != OpEltVT.isInteger())
29065 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
29066 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
29067 // Op0 needs to be bitcasted to a larger vector with the same element type.
29068 SDValue Op0 = Op.getOperand(0);
29069 MVT Op0VT = MVT::getVectorVT(EltVT,
29070 Op0.getSimpleValueType().getSizeInBits() / EltSize);
29071 Op0 = DAG.getBitcast(Op0VT, Op0);
29072 DCI.AddToWorklist(Op0.getNode());
29073 DCI.CombineTo(OrigOp.getNode(),
29074 DAG.getNode(Opcode, DL, VT, Op0,
29075 DAG.getConstant(Imm, DL, MVT::i8)));
29083 /// Do target-specific dag combines on SELECT and VSELECT nodes.
29084 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
29085 TargetLowering::DAGCombinerInfo &DCI,
29086 const X86Subtarget &Subtarget) {
29088 SDValue Cond = N->getOperand(0);
29089 // Get the LHS/RHS of the select.
29090 SDValue LHS = N->getOperand(1);
29091 SDValue RHS = N->getOperand(2);
29092 EVT VT = LHS.getValueType();
29093 EVT CondVT = Cond.getValueType();
29094 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29096 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
29097 // instructions match the semantics of the common C idiom x<y?x:y but not
29098 // x<=y?x:y, because of how they handle negative zero (which can be
29099 // ignored in unsafe-math mode).
29100 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
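// For example (sketch): (select (setcc x, y, setolt), x, y) can become
// (FMIN x, y) once the NaN and signed-zero caveats checked below are
// satisfied.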
29101 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
29102 VT != MVT::f80 && VT != MVT::f128 &&
29103 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
29104 (Subtarget.hasSSE2() ||
29105 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
29106 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29108 unsigned Opcode = 0;
29109 // Check for x CC y ? x : y.
29110 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29111 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29115 // Converting this to a min would handle NaNs incorrectly, and swapping
29116 // the operands would cause it to handle comparisons between positive
29117 // and negative zero incorrectly.
29118 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29119 if (!DAG.getTarget().Options.UnsafeFPMath &&
29120 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29122 std::swap(LHS, RHS);
29124 Opcode = X86ISD::FMIN;
29127 // Converting this to a min would handle comparisons between positive
29128 // and negative zero incorrectly.
29129 if (!DAG.getTarget().Options.UnsafeFPMath &&
29130 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29132 Opcode = X86ISD::FMIN;
29135 // Converting this to a min would handle both negative zeros and NaNs
29136 // incorrectly, but we can swap the operands to fix both.
29137 std::swap(LHS, RHS);
29141 Opcode = X86ISD::FMIN;
29145 // Converting this to a max would handle comparisons between positive
29146 // and negative zero incorrectly.
29147 if (!DAG.getTarget().Options.UnsafeFPMath &&
29148 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
29150 Opcode = X86ISD::FMAX;
29153 // Converting this to a max would handle NaNs incorrectly, and swapping
29154 // the operands would cause it to handle comparisons between positive
29155 // and negative zero incorrectly.
29156 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
29157 if (!DAG.getTarget().Options.UnsafeFPMath &&
29158 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
29160 std::swap(LHS, RHS);
29162 Opcode = X86ISD::FMAX;
29165 // Converting this to a max would handle both negative zeros and NaNs
29166 // incorrectly, but we can swap the operands to fix both.
29167 std::swap(LHS, RHS);
29171 Opcode = X86ISD::FMAX;
29174 // Check for x CC y ? y : x -- a min/max with reversed arms.
29175 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
29176 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
29180 // Converting this to a min would handle comparisons between positive
29181 // and negative zero incorrectly, and swapping the operands would
29182 // cause it to handle NaNs incorrectly.
29183 if (!DAG.getTarget().Options.UnsafeFPMath &&
29184 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
29185 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29187 std::swap(LHS, RHS);
29189 Opcode = X86ISD::FMIN;
29192 // Converting this to a min would handle NaNs incorrectly.
29193 if (!DAG.getTarget().Options.UnsafeFPMath &&
29194 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
29196 Opcode = X86ISD::FMIN;
29199 // Converting this to a min would handle both negative zeros and NaNs
29200 // incorrectly, but we can swap the operands to fix both.
29201 std::swap(LHS, RHS);
29205 Opcode = X86ISD::FMIN;
29209 // Converting this to a max would handle NaNs incorrectly.
29210 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29212 Opcode = X86ISD::FMAX;
29215 // Converting this to a max would handle comparisons between positive
29216 // and negative zero incorrectly, and swapping the operands would
29217 // cause it to handle NaNs incorrectly.
29218 if (!DAG.getTarget().Options.UnsafeFPMath &&
29219 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
29220 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
29222 std::swap(LHS, RHS);
29224 Opcode = X86ISD::FMAX;
29227 // Converting this to a max would handle both negative zeros and NaNs
29228 // incorrectly, but we can swap the operands to fix both.
29229 std::swap(LHS, RHS);
29233 Opcode = X86ISD::FMAX;
29239 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
29242 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
29243 // lowering on KNL. In this case we convert it to
29244 // v16i8 (select v16i8, v16i8, v16i8) and use AVX instruction.
29245 // The same situation for all 128 and 256-bit vectors of i8 and i16.
29246 // Starting with SKX, these selects have a proper lowering.
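// For example (sketch): on KNL, (vselect v32i1 M, v32i8 A, v32i8 B) is
// rewritten as (vselect (sign_extend M to v32i8), A, B), which can then use
// the AVX2 byte blend.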
29247 if (Subtarget.hasAVX512() && CondVT.isVector() &&
29248 CondVT.getVectorElementType() == MVT::i1 &&
29249 (VT.is128BitVector() || VT.is256BitVector()) &&
29250 (VT.getVectorElementType() == MVT::i8 ||
29251 VT.getVectorElementType() == MVT::i16) &&
29252 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
29253 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
29254 DCI.AddToWorklist(Cond.getNode());
29255 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
29258 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
29261 // Canonicalize max and min:
29262 // (x > y) ? x : y -> (x >= y) ? x : y
29263 // (x < y) ? x : y -> (x <= y) ? x : y
29264 // This allows use of COND_S / COND_NS (see TranslateX86CC), which eliminates
29265 // the need for an extra compare against zero. e.g.:
29267 // (x - y) > 0 ? (x - y) : 0  ->  (x - y) >= 0 ? (x - y) : 0
29269 // testl %edi, %edi
29271 // cmovgl %edi, %eax
29275 // cmovsl %eax, %edi
29276 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
29277 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
29278 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
29279 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29284 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
29285 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
29286 Cond.getOperand(0), Cond.getOperand(1), NewCC);
29287 return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
29292 // Early exit check
29293 if (!TLI.isTypeLegal(VT))
29296 // Match VSELECTs into subs with unsigned saturation.
29297 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
29298 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
29299 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
29300 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
29301 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
29303 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
29304 // left side, invert the predicate to simplify the logic below.
29306 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
29308 CC = ISD::getSetCCInverse(CC, true);
29309 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
29313 if (Other.getNode() && Other->getNumOperands() == 2 &&
29314 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
29315 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
29316 SDValue CondRHS = Cond->getOperand(1);
29318 // Look for a general sub with unsigned saturation first.
29319 // x >= y ? x-y : 0 --> subus x, y
29320 // x > y ? x-y : 0 --> subus x, y
29321 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
29322 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
29323 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
29325 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
29326 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
29327 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
29328 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
29329 // If the RHS is a constant we have to reverse the const
29330 // canonicalization.
29331 // x > C-1 ? x+-C : 0 --> subus x, C
29332 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
29333 CondRHSConst->getAPIntValue() ==
29334 (-OpRHSConst->getAPIntValue() - 1))
29335 return DAG.getNode(
29336 X86ISD::SUBUS, DL, VT, OpLHS,
29337 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
29339 // Another special case: If C was a sign bit, the sub has been
29340 // canonicalized into a xor.
29341 // FIXME: Would it be better to use computeKnownBits to determine
29342 // whether it's safe to decanonicalize the xor?
29343 // x s< 0 ? x^C : 0 --> subus x, C
29344 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
29345 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
29346 OpRHSConst->getAPIntValue().isSignBit())
29347 // Note that we have to rebuild the RHS constant here to ensure we
29348 // don't rely on particular values of undef lanes.
29349 return DAG.getNode(
29350 X86ISD::SUBUS, DL, VT, OpLHS,
29351 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
29356 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, Subtarget))
29359 // If this is a *dynamic* select (non-constant condition) and we can match
29360 // this node with one of the variable blend instructions, restructure the
29361 // condition so that the blends can use the high bit of each element and use
29362 // SimplifyDemandedBits to simplify the condition operand.
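// For example (sketch): once a v4i32 VSELECT is known to lower to BLENDVPS,
// only the sign bit of each condition element is consumed, so the rest of the
// computation feeding the condition can often be simplified away.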
29363 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
29364 !DCI.isBeforeLegalize() &&
29365 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
29366 unsigned BitWidth = Cond.getScalarValueSizeInBits();
29368 // Don't optimize vector selects that map to mask-registers.
29372 // We can only handle the cases where VSELECT is directly legal on the
29373 // subtarget. We custom lower VSELECT nodes with constant conditions and
29374 // this makes it hard to see whether a dynamic VSELECT will correctly
29375 // lower, so we both check the operation's status and explicitly handle the
29376 // cases where a *dynamic* blend will fail even though a constant-condition
29377 // blend could be custom lowered.
29378 // FIXME: We should find a better way to handle this class of problems.
29379 // Potentially, we should combine constant-condition vselect nodes
29380 // pre-legalization into shuffles and not mark as many types as custom
29382 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
29384 // FIXME: We don't support i16-element blends currently. We could and
29385 // should support them by making *all* the bits in the condition be set
29386 // rather than just the high bit and using an i8-element blend.
29387 if (VT.getVectorElementType() == MVT::i16)
29389 // Dynamic blending was only available from SSE4.1 onward.
29390 if (VT.is128BitVector() && !Subtarget.hasSSE41())
29392 // Byte blends are only available in AVX2
29393 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
29396 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
29397 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
29399 APInt KnownZero, KnownOne;
29400 TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
29401 DCI.isBeforeLegalizeOps());
29402 if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
29403 TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
29405 // If we changed the computation somewhere in the DAG, this change
29406 // will affect all users of Cond.
29407 // Make sure it is fine and update all the nodes so that we do not
29408 // use the generic VSELECT anymore. Otherwise, we may perform
29409 // wrong optimizations as we messed up with the actual expectation
29410 // for the vector boolean values.
29411 if (Cond != TLO.Old) {
29412 // Check all uses of that condition operand to check whether it will be
29413 // consumed by non-BLEND instructions, which may depend on all bits being set.
29415 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29417 if (I->getOpcode() != ISD::VSELECT)
29418 // TODO: Add other opcodes eventually lowered into BLEND.
29421 // Update all the users of the condition, before committing the change,
29422 // so that the VSELECT optimizations that expect the correct vector
29423 // boolean value will not be triggered.
29424 for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
29426 DAG.ReplaceAllUsesOfValueWith(
29428 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
29429 Cond, I->getOperand(1), I->getOperand(2)));
29430 DCI.CommitTargetLoweringOpt(TLO);
29433 // At this point, only Cond is changed. Change the condition
29434 // just for N to keep the opportunity to optimize all other
29435 // users their own way.
29436 DAG.ReplaceAllUsesOfValueWith(
29438 DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
29439 TLO.New, N->getOperand(1), N->getOperand(2)));
29444 // Look for vselects with LHS/RHS being bitcasted from an operation that
29445 // can be executed on another type. Push the bitcast to the inputs of
29446 // the operation. This exposes opportunities for using masking instructions.
29447 if (N->getOpcode() == ISD::VSELECT && !DCI.isBeforeLegalizeOps() &&
29448 CondVT.getVectorElementType() == MVT::i1) {
29449 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
29450 return SDValue(N, 0);
29451 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
29452 return SDValue(N, 0);
29458 /// Combine brcond/cmov/setcc/.. based on comparing the result of
29459 /// atomic_load_add to use EFLAGS produced by the addition
29460 /// directly if possible. For example:
29462 /// (setcc (cmp (atomic_load_add x, -C) C), COND_E)
29464 /// becomes (setcc (LADD x, -C), COND_E), and
29467 /// (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
29469 /// becomes (brcond/cmov/setcc .., (LADD x, 1), COND_LE).
29471 /// Note that this is only legal for some op/cc combinations.
29472 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
29473 SelectionDAG &DAG) {
29474 // This combine only operates on CMP-like nodes.
29475 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29476 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29479 // Can't replace the cmp if it has more uses than the one we're looking at.
29480 // FIXME: We would like to be able to handle this, but would need to make sure
29481 // all uses were updated.
29482 if (!Cmp.hasOneUse())
29485 // This applies to variations of the common case:
29486 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
29487 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
29488 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
29489 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
29490 // Using the proper condcodes (see below), overflow is checked for.
29492 // FIXME: We can generalize both constraints:
29493 // - XOR/OR/AND (if they were made to survive AtomicExpand)
29495 // if the result is compared.
29497 SDValue CmpLHS = Cmp.getOperand(0);
29498 SDValue CmpRHS = Cmp.getOperand(1);
29500 if (!CmpLHS.hasOneUse())
29503 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
29506 APInt Comparand = CmpRHSC->getAPIntValue();
29508 const unsigned Opc = CmpLHS.getOpcode();
29510 if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
29513 SDValue OpRHS = CmpLHS.getOperand(2);
29514 auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
29518 APInt Addend = OpRHSC->getAPIntValue();
29519 if (Opc == ISD::ATOMIC_LOAD_SUB)
29522 if (Comparand == -Addend) {
29523 // No change to CC.
29524 } else if (CC == X86::COND_S && Comparand == 0 && Addend == 1) {
29526 } else if (CC == X86::COND_NS && Comparand == 0 && Addend == 1) {
29528 } else if (CC == X86::COND_G && Comparand == 0 && Addend == -1) {
29530 } else if (CC == X86::COND_LE && Comparand == 0 && Addend == -1) {
29536 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
29537 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
29538 DAG.getUNDEF(CmpLHS.getValueType()));
29539 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
29543 // Check whether a boolean test is testing a boolean value generated by
29544 // X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
29547 // Simplify the following patterns:
29548 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
29549 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
29550 // to (Op EFLAGS Cond)
29552 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
29553 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
29554 // to (Op EFLAGS !Cond)
29556 // where Op could be BRCOND or CMOV.
29558 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
29559 // This combine only operates on CMP-like nodes.
29560 if (!(Cmp.getOpcode() == X86ISD::CMP ||
29561 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
29564 // Quit if not used as a boolean value.
29565 if (CC != X86::COND_E && CC != X86::COND_NE)
29568 // Check CMP operands. One of them should be 0 or 1 and the other should be
29569 // an SetCC or extended from it.
29570 SDValue Op1 = Cmp.getOperand(0);
29571 SDValue Op2 = Cmp.getOperand(1);
29574 const ConstantSDNode* C = nullptr;
29575 bool needOppositeCond = (CC == X86::COND_E);
29576 bool checkAgainstTrue = false; // Is it a comparison against 1?
29578 if ((C = dyn_cast<ConstantSDNode>(Op1)))
29580 else if ((C = dyn_cast<ConstantSDNode>(Op2)))
29582 else // Quit if neither operand is a constant.
29585 if (C->getZExtValue() == 1) {
29586 needOppositeCond = !needOppositeCond;
29587 checkAgainstTrue = true;
29588 } else if (C->getZExtValue() != 0)
29589 // Quit if the constant is neither 0 nor 1.
29592 bool truncatedToBoolWithAnd = false;
29593 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
29594 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
29595 SetCC.getOpcode() == ISD::TRUNCATE ||
29596 SetCC.getOpcode() == ISD::AND) {
29597 if (SetCC.getOpcode() == ISD::AND) {
29599 if (isOneConstant(SetCC.getOperand(0)))
29601 if (isOneConstant(SetCC.getOperand(1)))
29605 SetCC = SetCC.getOperand(OpIdx);
29606 truncatedToBoolWithAnd = true;
29608 SetCC = SetCC.getOperand(0);
29611 switch (SetCC.getOpcode()) {
29612 case X86ISD::SETCC_CARRY:
29613 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
29614 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
29615 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
29616 // truncated to i1 using 'and'.
29617 if (checkAgainstTrue && !truncatedToBoolWithAnd)
29619 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
29620 "Invalid use of SETCC_CARRY!");
29622 case X86ISD::SETCC:
29623 // Set the condition code or opposite one if necessary.
29624 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
29625 if (needOppositeCond)
29626 CC = X86::GetOppositeBranchCondition(CC);
29627 return SetCC.getOperand(1);
29628 case X86ISD::CMOV: {
29629 // Check whether false/true value has canonical one, i.e. 0 or 1.
29630 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
29631 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
29632 // Quit if true value is not a constant.
29635 // Quit if false value is not a constant.
29637 SDValue Op = SetCC.getOperand(0);
29638 // Skip 'zext' or 'trunc' node.
29639 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
29640 Op.getOpcode() == ISD::TRUNCATE)
29641 Op = Op.getOperand(0);
29642 // A special case for rdrand/rdseed, where 0 is set if the false cond is needed.
29644 if ((Op.getOpcode() != X86ISD::RDRAND &&
29645 Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
29648 // Quit if false value is not the constant 0 or 1.
29649 bool FValIsFalse = true;
29650 if (FVal && FVal->getZExtValue() != 0) {
29651 if (FVal->getZExtValue() != 1)
29653 // If FVal is 1, opposite cond is needed.
29654 needOppositeCond = !needOppositeCond;
29655 FValIsFalse = false;
29657 // Quit if TVal is not the constant opposite of FVal.
29658 if (FValIsFalse && TVal->getZExtValue() != 1)
29660 if (!FValIsFalse && TVal->getZExtValue() != 0)
29662 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
29663 if (needOppositeCond)
29664 CC = X86::GetOppositeBranchCondition(CC);
29665 return SetCC.getOperand(3);
29672 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
29674 /// (X86or (X86setcc) (X86setcc))
29675 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
29676 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
29677 X86::CondCode &CC1, SDValue &Flags,
29679 if (Cond->getOpcode() == X86ISD::CMP) {
29680 if (!isNullConstant(Cond->getOperand(1)))
29683 Cond = Cond->getOperand(0);
29688 SDValue SetCC0, SetCC1;
29689 switch (Cond->getOpcode()) {
29690 default: return false;
29697 SetCC0 = Cond->getOperand(0);
29698 SetCC1 = Cond->getOperand(1);
29702 // Make sure we have SETCC nodes, using the same flags value.
29703 if (SetCC0.getOpcode() != X86ISD::SETCC ||
29704 SetCC1.getOpcode() != X86ISD::SETCC ||
29705 SetCC0->getOperand(1) != SetCC1->getOperand(1))
29708 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
29709 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
29710 Flags = SetCC0->getOperand(1);
29714 /// Optimize an EFLAGS definition used according to the condition code \p CC
29715 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
29716 /// uses of chain values.
29717 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
29718 SelectionDAG &DAG) {
29719 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
29721 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
29724 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
29725 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
29726 TargetLowering::DAGCombinerInfo &DCI,
29727 const X86Subtarget &Subtarget) {
29730 // If the flag operand isn't dead, don't touch this CMOV.
29731 if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
29734 SDValue FalseOp = N->getOperand(0);
29735 SDValue TrueOp = N->getOperand(1);
29736 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
29737 SDValue Cond = N->getOperand(3);
29739 if (CC == X86::COND_E || CC == X86::COND_NE) {
29740 switch (Cond.getOpcode()) {
29744 // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
29745 if (DAG.isKnownNeverZero(Cond.getOperand(0)))
29746 return (CC == X86::COND_E) ? FalseOp : TrueOp;
29750 // Try to simplify the EFLAGS and condition code operands.
29751 // We can't always do this as FCMOV only supports a subset of X86 cond.
29752 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
29753 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
29754 SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
29756 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29760 // If this is a select between two integer constants, try to do some
29761 // optimizations. Note that the operands are ordered the opposite of SELECT
29763 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
29764 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
29765 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
29766 // larger than FalseC (the false value).
29767 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
29768 CC = X86::GetOppositeBranchCondition(CC);
29769 std::swap(TrueC, FalseC);
29770 std::swap(TrueOp, FalseOp);
29773 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
29774 // This is efficient for any integer data type (including i8/i16) and
29776 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
29777 Cond = getSETCC(CC, Cond, DL, DAG);
29779 // Zero extend the condition if needed.
29780 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
29782 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
29783 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
29784 DAG.getConstant(ShAmt, DL, MVT::i8));
29785 if (N->getNumValues() == 2) // Dead flag value?
29786 return DCI.CombineTo(N, Cond, SDValue());
29790 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
29791 // for any integer data type, including i8/i16.
29792 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
29793 Cond = getSETCC(CC, Cond, DL, DAG);
29795 // Zero extend the condition if needed.
29796 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
29797 FalseC->getValueType(0), Cond);
29798 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29799 SDValue(FalseC, 0));
29801 if (N->getNumValues() == 2) // Dead flag value?
29802 return DCI.CombineTo(N, Cond, SDValue());
29806 // Optimize cases that will turn into an LEA instruction. This requires
29807 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
29808 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
29809 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
29810 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
29812 bool isFastMultiplier = false;
29814 switch ((unsigned char)Diff) {
29816 case 1: // result = add base, cond
29817 case 2: // result = lea base( , cond*2)
29818 case 3: // result = lea base(cond, cond*2)
29819 case 4: // result = lea base( , cond*4)
29820 case 5: // result = lea base(cond, cond*4)
29821 case 8: // result = lea base( , cond*8)
29822 case 9: // result = lea base(cond, cond*8)
29823 isFastMultiplier = true;
29828 if (isFastMultiplier) {
29829 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
29830 Cond = getSETCC(CC, Cond, DL ,DAG);
29831 // Zero extend the condition if needed.
29832 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
29834 // Scale the condition by the difference.
29836 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
29837 DAG.getConstant(Diff, DL, Cond.getValueType()));
29839 // Add the base if non-zero.
29840 if (FalseC->getAPIntValue() != 0)
29841 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
29842 SDValue(FalseC, 0));
29843 if (N->getNumValues() == 2) // Dead flag value?
29844 return DCI.CombineTo(N, Cond, SDValue());
29851 // Handle these cases:
29852 // (select (x != c), e, c) -> select (x != c), e, x),
29853 // (select (x == c), c, e) -> select (x == c), x, e)
29854 // where the c is an integer constant, and the "select" is the combination
29855 // of CMOV and CMP.
29857 // The rationale for this change is that the conditional-move from a constant
29858 // needs two instructions; a conditional-move from a register needs only
29859 // one instruction.
29861 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
29862 // some instruction-combining opportunities. This opt needs to be
29863 // postponed as late as possible.
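// For example (sketch): with c = 42, (select (x == 42), 42, e) becomes
// (select (x == 42), x, e), so the constant no longer has to be materialized
// into a register for the conditional move.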
29865 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
29866 // the DCI.xxxx conditions are provided to postpone the optimization as
29867 // late as possible.
29869 ConstantSDNode *CmpAgainst = nullptr;
29870 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
29871 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
29872 !isa<ConstantSDNode>(Cond.getOperand(0))) {
29874 if (CC == X86::COND_NE &&
29875 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
29876 CC = X86::GetOppositeBranchCondition(CC);
29877 std::swap(TrueOp, FalseOp);
29880 if (CC == X86::COND_E &&
29881 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
29882 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
29883 DAG.getConstant(CC, DL, MVT::i8), Cond };
29884 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
29889 // Fold and/or of setcc's to double CMOV:
29890 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
29891 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
29893 // This combine lets us generate:
29894 // cmovcc1 (jcc1 if we don't have CMOV)
29900 // cmovne (jne if we don't have CMOV)
29901 // When we can't use the CMOV instruction, it might increase branch mispredicts.
29903 // When we can use CMOV, or when there is no mispredict, this improves
29904 // throughput and reduces register pressure.
29906 if (CC == X86::COND_NE) {
29908 X86::CondCode CC0, CC1;
29910 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
29912 std::swap(FalseOp, TrueOp);
29913 CC0 = X86::GetOppositeBranchCondition(CC0);
29914 CC1 = X86::GetOppositeBranchCondition(CC1);
29917 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
29919 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
29920 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
29921 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
29922 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
29930 /// Different mul shrinking modes.
29931 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
29933 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
29934 EVT VT = N->getOperand(0).getValueType();
29935 if (VT.getScalarSizeInBits() != 32)
29938 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
29939 unsigned SignBits[2] = {1, 1};
29940 bool IsPositive[2] = {false, false};
29941 for (unsigned i = 0; i < 2; i++) {
29942 SDValue Opd = N->getOperand(i);
29944 // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
29945 // compute signbits for it separately.
29946 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
29947 // For anyextend, it is safe to assume an appropriate number of leading sign/zero bits.
29949 if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
29951 else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
29956 IsPositive[i] = true;
29957 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
29958 // All the operands of BUILD_VECTOR need to be int constant.
29959 // Find the smallest value range which all the operands belong to.
29961 IsPositive[i] = true;
29962 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
29963 if (SubOp.isUndef())
29965 auto *CN = dyn_cast<ConstantSDNode>(SubOp);
29968 APInt IntVal = CN->getAPIntValue();
29969 if (IntVal.isNegative())
29970 IsPositive[i] = false;
29971 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
29974 SignBits[i] = DAG.ComputeNumSignBits(Opd);
29975 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
29976 IsPositive[i] = true;
29980 bool AllPositive = IsPositive[0] && IsPositive[1];
29981 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
29982 // When ranges are from -128 ~ 127, use MULS8 mode.
29983 if (MinSignBits >= 25)
29985 // When ranges are from 0 ~ 255, use MULU8 mode.
29986 else if (AllPositive && MinSignBits >= 24)
29988 // When ranges are from -32768 ~ 32767, use MULS16 mode.
29989 else if (MinSignBits >= 17)
29991 // When ranges are from 0 ~ 65535, use MULU16 mode.
29992 else if (AllPositive && MinSignBits >= 16)
29999 /// When the operands of vector mul are extended from smaller size values,
30000 /// like i8 and i16, the type of mul may be shrunk to generate more
30001 /// efficient code. Two typical patterns are handled:
30003 /// %2 = sext/zext <N x i8> %1 to <N x i32>
30004 /// %4 = sext/zext <N x i8> %3 to <N x i32>
30005 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30006 /// %5 = mul <N x i32> %2, %4
30009 /// %2 = zext/sext <N x i16> %1 to <N x i32>
30010 /// %4 = zext/sext <N x i16> %3 to <N x i32>
30011 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
30012 /// %5 = mul <N x i32> %2, %4
30014 /// There are four mul shrinking modes:
30015 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
30016 /// -128 to 127, and the scalar value range of %4 is also -128 to 127,
30017 /// generate pmullw+sext32 for it (MULS8 mode).
30018 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
30019 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
30020 /// generate pmullw+zext32 for it (MULU8 mode).
30021 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
30022 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
30023 /// generate pmullw+pmulhw for it (MULS16 mode).
30024 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
30025 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
30026 /// generate pmullw+pmulhuw for it (MULU16 mode).
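/// For example (sketch, MULU16 mode on a full-width vector): a v8i32 multiply
/// of two zero-extended v8i16 inputs becomes pmullw + pmulhuw on the truncated
/// i16 values, and the i32 lanes are rebuilt by interleaving the low and high
/// halves (punpcklwd / punpckhwd).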
30027 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
30028 const X86Subtarget &Subtarget) {
30029 // Check for legality
30030 // pmullw/pmulhw are not supported by SSE.
30031 if (!Subtarget.hasSSE2())
30034 // Check for profitability
30035 // pmulld is supported since SSE41. It is better to use pmulld
30036 // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than pmullw+pmulhw.
30038 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
30039 if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
30043 if (!canReduceVMulWidth(N, DAG, Mode))
30047 SDValue N0 = N->getOperand(0);
30048 SDValue N1 = N->getOperand(1);
30049 EVT VT = N->getOperand(0).getValueType();
30050 unsigned RegSize = 128;
30051 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
30053 EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
30054 // Shrink the operands of mul.
30055 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
30056 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
30058 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
30059 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
30060 // lower part is needed.
30061 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
30062 if (Mode == MULU8 || Mode == MULS8) {
30063 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
30066 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
30067 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
30068 // the higher part is also needed.
30069 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30070 ReducedVT, NewN0, NewN1);
30072 // Repack the lower part and higher part result of mul into a wider result.
30074 // Generate shuffle functioning as punpcklwd.
30075 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
30076 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30077 ShuffleMask[2 * i] = i;
30078 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
30081 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30082 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
30083 // Generate shuffle functioning as punpckhwd.
30084 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
30085 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
30086 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
30089 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
30090 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
30091 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
30094 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
30095 // to legalize the mul explicitly because implicit legalization for type
30096 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
30097 // instructions which will not exist when we explicitly legalize it by
30098 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
30099 // <4 x i16> undef).
30101 // Legalize the operands of mul.
30102 // FIXME: We may be able to handle non-concatenated vectors by insertion.
30103 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
30104 if ((RegSize % ReducedSizeInBits) != 0)
30107 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
30108 DAG.getUNDEF(ReducedVT));
30110 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30112 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
30114 if (Mode == MULU8 || Mode == MULS8) {
30115 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower part is needed.
30117 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30119 // Convert the type of the mul result to VT.
30120 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30121 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
30122 : ISD::SIGN_EXTEND_VECTOR_INREG,
30124 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30125 DAG.getIntPtrConstant(0, DL));
30127 // Generate the lower part (pmullw) and the higher part (pmulhw/pmulhuw) of
30128 // the mul. For MULU16/MULS16, both parts are needed.
30129 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
30130 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
30131 OpsVT, NewN0, NewN1);
30133 // Repack the lower part and higher part result of mul into a wider
30134 // result. Make sure the type of mul result is VT.
30135 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
30136 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
30137 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
30138 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
30139 DAG.getIntPtrConstant(0, DL));
30144 /// Optimize a single multiply with constant into two operations in order to
30145 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
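/// For example (sketch): a multiply by 45 can be decomposed as
/// (mul (mul x, 9), 5), i.e. two LEAs, and a multiply by 40 as a multiply by 5
/// combined with a shift by 3, i.e. an LEA plus a shift.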
30146 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
30147 TargetLowering::DAGCombinerInfo &DCI,
30148 const X86Subtarget &Subtarget) {
30149 EVT VT = N->getValueType(0);
30150 if (DCI.isBeforeLegalize() && VT.isVector())
30151 return reduceVMULWidth(N, DAG, Subtarget);
30153 // An imul is usually smaller than the alternative sequence.
30154 if (DAG.getMachineFunction().getFunction()->optForMinSize())
30157 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
30160 if (VT != MVT::i64 && VT != MVT::i32)
30163 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
30166 uint64_t MulAmt = C->getZExtValue();
30167 if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
30170 uint64_t MulAmt1 = 0;
30171 uint64_t MulAmt2 = 0;
30172 if ((MulAmt % 9) == 0) {
30174 MulAmt2 = MulAmt / 9;
30175 } else if ((MulAmt % 5) == 0) {
30177 MulAmt2 = MulAmt / 5;
30178 } else if ((MulAmt % 3) == 0) {
30180 MulAmt2 = MulAmt / 3;
30186 (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
30188 if (isPowerOf2_64(MulAmt2) &&
30189 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
30190 // If the second multiplier is pow2, issue it first. We want the multiply
30191 // by 3, 5, or 9 to be folded into the addressing mode unless the lone use is an add.
30193 std::swap(MulAmt1, MulAmt2);
30195 if (isPowerOf2_64(MulAmt1))
30196 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30197 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
30199 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
30200 DAG.getConstant(MulAmt1, DL, VT));
30202 if (isPowerOf2_64(MulAmt2))
30203 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
30204 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
30206 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
30207 DAG.getConstant(MulAmt2, DL, VT));
30211 assert(MulAmt != 0 && MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX)
30212 && "Both cases that could cause potential overflows should have "
30213 "already been handled.");
30214 if (isPowerOf2_64(MulAmt - 1))
30215 // (mul x, 2^N + 1) => (add (shl x, N), x)
30216 NewMul = DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
30217 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
30218 DAG.getConstant(Log2_64(MulAmt - 1), DL,
30221 else if (isPowerOf2_64(MulAmt + 1))
30222 // (mul x, 2^N - 1) => (sub (shl x, N), x)
30223 NewMul = DAG.getNode(ISD::SUB, DL, VT, DAG.getNode(ISD::SHL, DL, VT,
30225 DAG.getConstant(Log2_64(MulAmt + 1),
30226 DL, MVT::i8)), N->getOperand(0));
30230 // Do not add new nodes to DAG combiner worklist.
30231 DCI.CombineTo(N, NewMul, false);
30236 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
30237 SDValue N0 = N->getOperand(0);
30238 SDValue N1 = N->getOperand(1);
30239 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
30240 EVT VT = N0.getValueType();
30242 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
30243 // since the result of setcc_c is all zero's or all ones.
30244 if (VT.isInteger() && !VT.isVector() &&
30245 N1C && N0.getOpcode() == ISD::AND &&
30246 N0.getOperand(1).getOpcode() == ISD::Constant) {
30247 SDValue N00 = N0.getOperand(0);
30248 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
30249 const APInt &ShAmt = N1C->getAPIntValue();
30250 Mask = Mask.shl(ShAmt);
30251 bool MaskOK = false;
30252 // We can handle cases concerning bit-widening nodes containing setcc_c if
30253 // we carefully interrogate the mask to make sure we are preserving semantics.
30255 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
30256 // of the underlying setcc_c operation if the setcc_c was zero extended.
30257 // Consider the following example:
30258 // zext(setcc_c) -> i32 0x0000FFFF
30259 // c1 -> i32 0x0000FFFF
30260 // c2 -> i32 0x00000001
30261 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
30262 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
30263 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
30265 } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
30266 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30268 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
30269 N00.getOpcode() == ISD::ANY_EXTEND) &&
30270 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
30271 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
30273 if (MaskOK && Mask != 0) {
30275 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
30279 // Hardware support for vector shifts is sparse, which makes us scalarize the
30280 // vector operations in many cases. Also, on Sandy Bridge ADD is faster than SHL:
30282 // (shl V, 1) -> add V,V
30283 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
30284 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
30285 assert(N0.getValueType().isVector() && "Invalid vector shift type");
30286 // We shift all of the values by one. In many cases we do not have
30287 // hardware support for this operation. This is better expressed as an ADD of two values.
30289 if (N1SplatC->getAPIntValue() == 1)
30290 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
30296 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
30297 SDValue N0 = N->getOperand(0);
30298 SDValue N1 = N->getOperand(1);
30299 EVT VT = N0.getValueType();
30300 unsigned Size = VT.getSizeInBits();
30302 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
30303 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
30304 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
30305 // depending on sign of (SarConst - [56,48,32,24,16])
30307 // sexts in X86 are MOVs. The MOVs have the same code size
30308 // as the above SHIFTs (only a shift by 1 has smaller code size).
30309 // However, the MOVs have two advantages over a SHIFT:
30310 // 1. MOVs can write to a register that differs from the source.
30311 // 2. MOVs accept memory operands.
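// For example (sketch): on i64, (sra (shl x, 56), 61) becomes
// (sra (sext_inreg x, i8), 5), i.e. a sign-extending move followed by a
// smaller shift.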
30313 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
30314 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
30315 N0.getOperand(1).getOpcode() != ISD::Constant)
30318 SDValue N00 = N0.getOperand(0);
30319 SDValue N01 = N0.getOperand(1);
30320 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
30321 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
30322 EVT CVT = N1.getValueType();
30324 if (SarConst.isNegative())
30327 for (MVT SVT : MVT::integer_valuetypes()) {
30328 unsigned ShiftSize = SVT.getSizeInBits();
30329 // Skip types without a corresponding sext/zext and ShlConst values that
30330 // are not one of [56,48,32,24,16].
30331 if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
30335 DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
30336 SarConst = SarConst - (Size - ShiftSize);
30339 else if (SarConst.isNegative())
30340 return DAG.getNode(ISD::SHL, DL, VT, NN,
30341 DAG.getConstant(-SarConst, DL, CVT));
30343 return DAG.getNode(ISD::SRA, DL, VT, NN,
30344 DAG.getConstant(SarConst, DL, CVT));
30349 /// \brief Returns a vector of 0s if the node in input is a vector logical
30350 /// shift by a constant amount which is known to be bigger than or equal
30351 /// to the vector element size in bits.
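/// For example (sketch): (srl v4i32 X, <i32 32, i32 32, i32 32, i32 32>) folds
/// to a v4i32 zero vector, since every lane is shifted out entirely.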
30352 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
30353 const X86Subtarget &Subtarget) {
30354 EVT VT = N->getValueType(0);
30356 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
30357 (!Subtarget.hasInt256() ||
30358 (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
30361 SDValue Amt = N->getOperand(1);
30363 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
30364 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
30365 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
30366 unsigned MaxAmount =
30367 VT.getSimpleVT().getScalarSizeInBits();
30369 // SSE2/AVX2 logical shifts always return a vector of 0s
30370 // if the shift amount is bigger than or equal to
30371 // the element size. The constant shift amount will be
30372 // encoded as an 8-bit immediate.
30373 if (ShiftAmt.trunc(8).uge(MaxAmount))
30374 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
30380 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
30381 TargetLowering::DAGCombinerInfo &DCI,
30382 const X86Subtarget &Subtarget) {
30383 if (N->getOpcode() == ISD::SHL)
30384 if (SDValue V = combineShiftLeft(N, DAG))
30387 if (N->getOpcode() == ISD::SRA)
30388 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
30391 // Try to fold this logical shift into a zero vector.
30392 if (N->getOpcode() != ISD::SRA)
30393 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
30399 static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
30400 TargetLowering::DAGCombinerInfo &DCI,
30401 const X86Subtarget &Subtarget) {
30402 assert((X86ISD::VSHLI == N->getOpcode() || X86ISD::VSRLI == N->getOpcode()) &&
30403 "Unexpected opcode");
30404 EVT VT = N->getValueType(0);
30405 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
30407 // This fails for mask register (vXi1) shifts.
30408 if ((NumBitsPerElt % 8) != 0)
30411 // Out of range logical bit shifts are guaranteed to be zero.
30412 APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
30413 if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt))
30414 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30416 // Shift N0 by zero -> N0.
30418 return N->getOperand(0);
30420 // Shift zero -> zero.
30421 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
30422 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
30424 // We can decode 'whole byte' logical bit shifts as shuffles.
30425 if ((ShiftVal.getZExtValue() % 8) == 0) {
30427 SmallVector<int, 1> NonceMask; // Just a placeholder.
30428 NonceMask.push_back(0);
30429 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30430 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30432 return SDValue(); // This routine will use CombineTo to replace N.
30438 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
30439 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
30440 /// OR -> CMPNEQSS.
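/// For example (sketch): (and (setcc COND_E, (cmp a, b)),
/// (setcc COND_NP, (cmp a, b))) is the usual scalar lowering of an ordered fp
/// equality test; it can be replaced by a CMPEQSS mask of a and b with the low
/// bit extracted.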
30441 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
30442 TargetLowering::DAGCombinerInfo &DCI,
30443 const X86Subtarget &Subtarget) {
30446 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
30447 // we're requiring SSE2 for both.
30448 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
30449 SDValue N0 = N->getOperand(0);
30450 SDValue N1 = N->getOperand(1);
30451 SDValue CMP0 = N0->getOperand(1);
30452 SDValue CMP1 = N1->getOperand(1);
30455 // The SETCCs should both refer to the same CMP.
30456 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
30459 SDValue CMP00 = CMP0->getOperand(0);
30460 SDValue CMP01 = CMP0->getOperand(1);
30461 EVT VT = CMP00.getValueType();
30463 if (VT == MVT::f32 || VT == MVT::f64) {
30464 bool ExpectingFlags = false;
30465 // Check for any users that want flags:
30466 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
30467 !ExpectingFlags && UI != UE; ++UI)
30468 switch (UI->getOpcode()) {
30473 ExpectingFlags = true;
30475 case ISD::CopyToReg:
30476 case ISD::SIGN_EXTEND:
30477 case ISD::ZERO_EXTEND:
30478 case ISD::ANY_EXTEND:
30482 if (!ExpectingFlags) {
30483 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
30484 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
30486 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
30487 X86::CondCode tmp = cc0;
30492 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
30493 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
30494 // FIXME: need symbolic constants for these magic numbers.
30495 // See X86ATTInstPrinter.cpp:printSSECC().
30496 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
30497 if (Subtarget.hasAVX512()) {
30498 SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::i1, CMP00,
30500 DAG.getConstant(x86cc, DL, MVT::i8));
30501 if (N->getValueType(0) != MVT::i1)
30502 return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0),
30506 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
30507 CMP00.getValueType(), CMP00, CMP01,
30508 DAG.getConstant(x86cc, DL,
30511 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
30512 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
30514 if (is64BitFP && !Subtarget.is64Bit()) {
30515 // On a 32-bit target, we cannot bitcast the 64-bit float to a
30516 // 64-bit integer, since that's not a legal type. Since
30517 // OnesOrZeroesF is all ones or all zeroes, we don't need all the
30518 // bits, but can do this little dance to extract the lowest 32 bits
30519 // and work with those going forward.
30520 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
30522 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
30523 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
30524 Vector32, DAG.getIntPtrConstant(0, DL));
30528 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
30529 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
30530 DAG.getConstant(1, DL, IntVT));
30531 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
30533 return OneBitOfTruth;
30541 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
30542 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
30543 assert(N->getOpcode() == ISD::AND);
30545 EVT VT = N->getValueType(0);
30546 SDValue N0 = N->getOperand(0);
30547 SDValue N1 = N->getOperand(1);
30550 if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
30553 // Canonicalize XOR to the left.
30554 if (N1.getOpcode() == ISD::XOR)
30557 if (N0.getOpcode() != ISD::XOR)
30560 SDValue N00 = N0->getOperand(0);
30561 SDValue N01 = N0->getOperand(1);
30563 N01 = peekThroughBitcasts(N01);
30565 // Either match a direct AllOnes for 128, 256, and 512-bit vectors, or an
30566 // insert_subvector building a 256-bit AllOnes vector.
30567 if (!ISD::isBuildVectorAllOnes(N01.getNode())) {
30568 if (!VT.is256BitVector() || N01->getOpcode() != ISD::INSERT_SUBVECTOR)
30571 SDValue V1 = N01->getOperand(0);
30572 SDValue V2 = N01->getOperand(1);
30573 if (V1.getOpcode() != ISD::INSERT_SUBVECTOR ||
30574 !V1.getOperand(0).isUndef() ||
30575 !ISD::isBuildVectorAllOnes(V1.getOperand(1).getNode()) ||
30576 !ISD::isBuildVectorAllOnes(V2.getNode()))
30579 return DAG.getNode(X86ISD::ANDNP, DL, VT, N00, N1);
30582 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM-sized
30583 // register. In most cases we actually compare or select YMM-sized registers,
30584 // and mixing the two types creates horrible code. This method optimizes
30585 // some of the transition sequences.
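// For example (sketch): (sign_extend (and (trunc X:v8i32), (trunc Y:v8i32)))
// back to v8i32 can instead be computed as a v8i32 AND of X and Y followed by
// a sign_extend_inreg from v8i16, keeping the values in YMM registers.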
30586 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
30587 TargetLowering::DAGCombinerInfo &DCI,
30588 const X86Subtarget &Subtarget) {
30589 EVT VT = N->getValueType(0);
30590 if (!VT.is256BitVector())
30593 assert((N->getOpcode() == ISD::ANY_EXTEND ||
30594 N->getOpcode() == ISD::ZERO_EXTEND ||
30595 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
30597 SDValue Narrow = N->getOperand(0);
30598 EVT NarrowVT = Narrow->getValueType(0);
30599 if (!NarrowVT.is128BitVector())
30602 if (Narrow->getOpcode() != ISD::XOR &&
30603 Narrow->getOpcode() != ISD::AND &&
30604 Narrow->getOpcode() != ISD::OR)
30607 SDValue N0 = Narrow->getOperand(0);
30608 SDValue N1 = Narrow->getOperand(1);
30611 // The Left side has to be a trunc.
30612 if (N0.getOpcode() != ISD::TRUNCATE)
30615 // The type of the truncated inputs.
30616 EVT WideVT = N0->getOperand(0)->getValueType(0);
30620 // The right side has to be a 'trunc' or a constant vector.
30621 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
30622 ConstantSDNode *RHSConstSplat = nullptr;
30623 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
30624 RHSConstSplat = RHSBV->getConstantSplatNode();
30625 if (!RHSTrunc && !RHSConstSplat)
30628 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30630 if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
30633 // Set N0 and N1 to hold the inputs to the new wide operation.
30634 N0 = N0->getOperand(0);
30635 if (RHSConstSplat) {
30636 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
30637 SDValue(RHSConstSplat, 0));
30638 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
30639 } else if (RHSTrunc) {
30640 N1 = N1->getOperand(0);
30643 // Generate the wide operation.
30644 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
30645 unsigned Opcode = N->getOpcode();
30647 case ISD::ANY_EXTEND:
30649 case ISD::ZERO_EXTEND: {
30650 unsigned InBits = NarrowVT.getScalarSizeInBits();
30651 APInt Mask = APInt::getAllOnesValue(InBits);
30652 Mask = Mask.zext(VT.getScalarSizeInBits());
30653 return DAG.getNode(ISD::AND, DL, VT,
30654 Op, DAG.getConstant(Mask, DL, VT));
30656 case ISD::SIGN_EXTEND:
30657 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
30658 Op, DAG.getValueType(NarrowVT));
30660 llvm_unreachable("Unexpected opcode");
30664 /// If both input operands of a logic op are being cast from floating point
30665 /// types, try to convert this into a floating point logic node to avoid
30666 /// unnecessary moves from SSE to integer registers.
30667 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
30668 const X86Subtarget &Subtarget) {
30669 unsigned FPOpcode = ISD::DELETED_NODE;
30670 if (N->getOpcode() == ISD::AND)
30671 FPOpcode = X86ISD::FAND;
30672 else if (N->getOpcode() == ISD::OR)
30673 FPOpcode = X86ISD::FOR;
30674 else if (N->getOpcode() == ISD::XOR)
30675 FPOpcode = X86ISD::FXOR;
30677 assert(FPOpcode != ISD::DELETED_NODE &&
30678 "Unexpected input node for FP logic conversion");
30680 EVT VT = N->getValueType(0);
30681 SDValue N0 = N->getOperand(0);
30682 SDValue N1 = N->getOperand(1);
30684 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
30685 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
30686 (Subtarget.hasSSE2() && VT == MVT::i64))) {
30687 SDValue N00 = N0.getOperand(0);
30688 SDValue N10 = N1.getOperand(0);
30689 EVT N00Type = N00.getValueType();
30690 EVT N10Type = N10.getValueType();
30691 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
30692 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
30693 return DAG.getBitcast(VT, FPLogic);
30699 /// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
30700 /// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
30701 /// eliminate loading the vector constant mask value. This relies on the fact
30702 /// that a PCMP always creates an all-ones or all-zeros bitmask per element.
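/// For example (sketch): on v4i32, (and (pcmpgt X, Y), (splat 1)) becomes
/// (srl (pcmpgt X, Y), 31), turning the all-ones/all-zeros mask into a 1/0
/// value without loading a constant pool mask.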
30703 static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
30704 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
30705 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
30707 // TODO: Use AssertSext to mark any nodes that have the property of producing
30708 // all-ones or all-zeros. Then check for that node rather than particular
30710 if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
30713 // The existence of the PCMP node guarantees that we have the required SSE2 or
30714 // AVX2 for a shift of this vector type, but there is no vector shift by
30715 // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
30716 // masked compare nodes, so they should not make it here.
30717 EVT VT0 = Op0.getValueType();
30718 EVT VT1 = Op1.getValueType();
30719 unsigned EltBitWidth = VT0.getScalarSizeInBits();
30720 if (VT0 != VT1 || EltBitWidth == 8)
30723 assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
30726 if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
30730 SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
30731 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
30732 return DAG.getBitcast(N->getValueType(0), Shift);
30735 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
30736 TargetLowering::DAGCombinerInfo &DCI,
30737 const X86Subtarget &Subtarget) {
30738 if (DCI.isBeforeLegalizeOps())
30739 return SDValue();
30741 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
30742 return R;
30744 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
30745 return FPLogic;
30747 if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
30748 return R;
30750 if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
30751 return ShiftRight;
30753 EVT VT = N->getValueType(0);
30754 SDValue N0 = N->getOperand(0);
30755 SDValue N1 = N->getOperand(1);
30758 // Attempt to recursively combine a bitmask AND with shuffles.
30759 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
30761 SmallVector<int, 1> NonceMask; // Just a placeholder.
30762 NonceMask.push_back(0);
30763 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
30764 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
30766 return SDValue(); // This routine will use CombineTo to replace N.
30769 // Create BEXTR instructions
30770 // BEXTR is ((X >> imm) & (2**size-1))
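// The BEXTR control operand packs the start bit in bits 7:0 and the field
// length in bits 15:8, hence the (Shift | (MaskSize << 8)) constant below.
// For example, (x >> 4) & 0xFFF would use Shift = 4 and MaskSize = 12,
// i.e. a control value of 0xC04.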
30771 if (VT != MVT::i32 && VT != MVT::i64)
30772 return SDValue();
30774 if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
30775 return SDValue();
30776 if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
30777 return SDValue();
30779 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
30780 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
30781 if (MaskNode && ShiftNode) {
30782 uint64_t Mask = MaskNode->getZExtValue();
30783 uint64_t Shift = ShiftNode->getZExtValue();
30784 if (isMask_64(Mask)) {
30785 uint64_t MaskSize = countPopulation(Mask);
30786 if (Shift + MaskSize <= VT.getSizeInBits())
30787 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
30788 DAG.getConstant(Shift | (MaskSize << 8), DL,
30789 VT));
30790 }
30791 }
30792 return SDValue();
30793 }
30795 // Try to fold:
30796 // (or (and (m, y), (pandn m, x)))
30797 // into
30798 // (vselect m, x, y)
30799 // As a special case, try to fold:
30800 // (or (and (m, (sub 0, x)), (pandn m, x)))
30801 // into
30802 // (sub (xor X, M), M)
30803 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
30804 const X86Subtarget &Subtarget) {
30805 assert(N->getOpcode() == ISD::OR);
30807 SDValue N0 = N->getOperand(0);
30808 SDValue N1 = N->getOperand(1);
30809 EVT VT = N->getValueType(0);
30811 if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
30813 assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
30815 // Canonicalize pandn to RHS
30816 if (N0.getOpcode() == X86ISD::ANDNP)
30817 std::swap(N0, N1);
30819 if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
30820 return SDValue();
30822 SDValue Mask = N1.getOperand(0);
30823 SDValue X = N1.getOperand(1);
30824 SDValue Y;
30825 if (N0.getOperand(0) == Mask)
30826 Y = N0.getOperand(1);
30827 if (N0.getOperand(1) == Mask)
30828 Y = N0.getOperand(0);
30830 // Check to see if the mask appeared in both the AND and ANDNP.
30831 if (!Y.getNode())
30832 return SDValue();
30834 // Validate that X, Y, and Mask are bitcasts, and see through them.
30835 Mask = peekThroughBitcasts(Mask);
30836 X = peekThroughBitcasts(X);
30837 Y = peekThroughBitcasts(Y);
30839 EVT MaskVT = Mask.getValueType();
30841 // Validate that the Mask operand is a vector sra node.
30842 // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
30843 // there is no psrai.b
30844 unsigned EltBits = MaskVT.getScalarSizeInBits();
30845 unsigned SraAmt = ~0;
30846 if (Mask.getOpcode() == ISD::SRA) {
30847 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
30848 if (auto *AmtConst = AmtBV->getConstantSplatNode())
30849 SraAmt = AmtConst->getZExtValue();
30850 } else if (Mask.getOpcode() == X86ISD::VSRAI) {
30851 SDValue SraC = Mask.getOperand(1);
30852 SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
30853 }
30854 if ((SraAmt + 1) != EltBits)
30855 return SDValue();
30857 SDLoc DL(N);
30859 // Try to match:
30860 // (or (and (M, (sub 0, X)), (pandn M, X)))
30861 // which is a special case of vselect:
30862 // (vselect M, (sub 0, X), X)
30864 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
30865 // We know that, if fNegate is 0 or 1:
30866 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
30868 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
30869 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
30870 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
30871 // This lets us transform our vselect to:
30872 // (add (xor X, M), (and M, 1))
30874 // (sub (xor X, M), M)
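// Concretely, when M is all-ones: (X ^ M) - M == ~X + 1 == -X, and when M is
// all-zeros: (X ^ 0) - 0 == X, matching the (vselect M, -X, X) semantics.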
30875 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) {
30876 auto IsNegV = [](SDNode *N, SDValue V) {
30877 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
30878 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
30879 };
30880 SDValue V;
30881 if (IsNegV(Y.getNode(), X))
30882 V = X;
30883 else if (IsNegV(X.getNode(), Y))
30884 V = Y;
30886 if (V) {
30887 assert(EltBits == 8 || EltBits == 16 || EltBits == 32);
30888 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
30889 SDValue SubOp2 = Mask;
30891 // If the negate was on the false side of the select, then
30892 // the operands of the SUB need to be swapped. PR 27251.
30893 // This is because the pattern being matched above is
30894 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
30895 // but if the pattern matched was
30896 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
30897 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
30898 // pattern also needs to be a negation of the replacement pattern above.
30899 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
30900 // sub accomplishes the negation of the replacement pattern.
30901 if (V == Y)
30902 std::swap(SubOp1, SubOp2);
30904 return DAG.getBitcast(VT,
30905 DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2));
30909 // PBLENDVB is only available on SSE 4.1.
30910 if (!Subtarget.hasSSE41())
30913 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
30915 X = DAG.getBitcast(BlendVT, X);
30916 Y = DAG.getBitcast(BlendVT, Y);
30917 Mask = DAG.getBitcast(BlendVT, Mask);
30918 Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X);
30919 return DAG.getBitcast(VT, Mask);
30922 // Helper function for combineOrCmpEqZeroToCtlzSrl
30923 // Transforms:
30924 // seteq(cmp x, 0)
30925 // into:
30926 // srl(ctlz x), log2(bitsize(x))
30927 // Input pattern is checked by caller.
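// For a 32-bit value this relies on ctlz(0) == 32, so ctlz(x) >> 5 is 1 when
// x == 0 and 0 otherwise; lzcnt provides that defined-at-zero behaviour.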
30928 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
30929 SelectionDAG &DAG) {
30930 SDValue Cmp = Op.getOperand(1);
30931 EVT VT = Cmp.getOperand(0).getValueType();
30932 unsigned Log2b = Log2_32(VT.getSizeInBits());
30933 SDLoc dl(Op);
30934 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
30935 // The result of the shift is true or false, and on X86, the 32-bit
30936 // encoding of shr and lzcnt is more desirable.
30937 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
30938 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
30939 DAG.getConstant(Log2b, dl, VT));
30940 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
30943 // Try to transform:
30944 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
30945 // into:
30946 // srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
30947 // Will also attempt to match more generic cases, e.g.:
30948 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
30949 // Only applies if the target supports the FastLZCNT feature.
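// For i32 operands the final sequence is roughly
//   (lzcnt(x) | lzcnt(y)) >> 5,
// which is 1 iff either x or y is zero, matching the original or-of-seteq.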
30950 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
30951 TargetLowering::DAGCombinerInfo &DCI,
30952 const X86Subtarget &Subtarget) {
30953 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
30956 auto isORCandidate = [](SDValue N) {
30957 return (N->getOpcode() == ISD::OR && N->hasOneUse());
30960 // Check that the zero extend is extending to 32 bits or more. The code
30961 // generated by srl(ctlz) for 16-bit or smaller variants of the pattern would
30962 // require extra instructions to clear the upper bits.
30963 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
30964 !isORCandidate(N->getOperand(0)))
30967 // Check the node matches: setcc(eq, cmp 0)
30968 auto isSetCCCandidate = [](SDValue N) {
30969 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
30970 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
30971 N->getOperand(1).getOpcode() == X86ISD::CMP &&
30972 N->getOperand(1).getConstantOperandVal(1) == 0 &&
30973 N->getOperand(1).getValueType().bitsGE(MVT::i32);
30976 SDNode *OR = N->getOperand(0).getNode();
30977 SDValue LHS = OR->getOperand(0);
30978 SDValue RHS = OR->getOperand(1);
30980 // Save nodes matching or(or, setcc(eq, cmp 0)).
30981 SmallVector<SDNode *, 2> ORNodes;
30982 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
30983 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
30984 ORNodes.push_back(OR);
30985 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
30986 LHS = OR->getOperand(0);
30987 RHS = OR->getOperand(1);
30990 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
30991 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
30992 !isORCandidate(SDValue(OR, 0)))
30995 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
30996 // into:
30997 // or(srl(ctlz),srl(ctlz)).
30998 // The dag combiner can then fold it into:
30999 // srl(or(ctlz, ctlz)).
31000 EVT VT = OR->getValueType(0);
31001 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
31002 SDValue Ret, NewRHS;
31003 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
31004 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
31006 if (!Ret)
31007 return SDValue();
31009 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
31010 while (ORNodes.size() > 0) {
31011 OR = ORNodes.pop_back_val();
31012 LHS = OR->getOperand(0);
31013 RHS = OR->getOperand(1);
31014 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
31015 if (RHS->getOpcode() == ISD::OR)
31016 std::swap(LHS, RHS);
31017 EVT VT = OR->getValueType(0);
31018 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
31019 if (!NewRHS)
31020 return SDValue();
31021 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
31022 }
31024 if (Ret)
31025 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
31026 return Ret;
31027 }
31030 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
31031 TargetLowering::DAGCombinerInfo &DCI,
31032 const X86Subtarget &Subtarget) {
31033 if (DCI.isBeforeLegalizeOps())
31034 return SDValue();
31036 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
31037 return R;
31039 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
31040 return FPLogic;
31042 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
31043 return R;
31045 SDValue N0 = N->getOperand(0);
31046 SDValue N1 = N->getOperand(1);
31047 EVT VT = N->getValueType(0);
31049 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
31050 return SDValue();
31052 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
31053 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
31055 // SHLD/SHRD instructions have lower register pressure, but on some
31056 // platforms they have higher latency than the equivalent
31057 // series of shifts/or that would otherwise be generated.
31058 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
31059 // have higher latencies and we are not optimizing for size.
31060 if (!OptForSize && Subtarget.isSHLDSlow())
31061 return SDValue();
31063 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
31064 std::swap(N0, N1);
31065 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
31066 return SDValue();
31067 if (!N0.hasOneUse() || !N1.hasOneUse())
31068 return SDValue();
31070 SDValue ShAmt0 = N0.getOperand(1);
31071 if (ShAmt0.getValueType() != MVT::i8)
31072 return SDValue();
31073 SDValue ShAmt1 = N1.getOperand(1);
31074 if (ShAmt1.getValueType() != MVT::i8)
31075 return SDValue();
31076 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
31077 ShAmt0 = ShAmt0.getOperand(0);
31078 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
31079 ShAmt1 = ShAmt1.getOperand(0);
31082 unsigned Opc = X86ISD::SHLD;
31083 SDValue Op0 = N0.getOperand(0);
31084 SDValue Op1 = N1.getOperand(0);
31085 if (ShAmt0.getOpcode() == ISD::SUB ||
31086 ShAmt0.getOpcode() == ISD::XOR) {
31087 Opc = X86ISD::SHRD;
31088 std::swap(Op0, Op1);
31089 std::swap(ShAmt0, ShAmt1);
31092 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
31093 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
31094 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
31095 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
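// For instance, with i32 operands, (or (shl X, 5), (srl Y, 27)) matches the
// first form and becomes SHLD(X, Y, 5).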
31096 unsigned Bits = VT.getSizeInBits();
31097 if (ShAmt1.getOpcode() == ISD::SUB) {
31098 SDValue Sum = ShAmt1.getOperand(0);
31099 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
31100 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
31101 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
31102 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
31103 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
31104 return DAG.getNode(Opc, DL, VT,
31106 DAG.getNode(ISD::TRUNCATE, DL,
31109 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
31110 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
31111 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
31112 return DAG.getNode(Opc, DL, VT,
31113 N0.getOperand(0), N1.getOperand(0),
31114 DAG.getNode(ISD::TRUNCATE, DL,
31116 } else if (ShAmt1.getOpcode() == ISD::XOR) {
31117 SDValue Mask = ShAmt1.getOperand(1);
31118 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
31119 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
31120 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
31121 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
31122 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
31123 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
31124 if (Op1.getOpcode() == InnerShift &&
31125 isa<ConstantSDNode>(Op1.getOperand(1)) &&
31126 Op1.getConstantOperandVal(1) == 1) {
31127 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31128 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31130 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
31131 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
31132 Op1.getOperand(0) == Op1.getOperand(1)) {
31133 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
31134 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
31143 /// Generate NEG and CMOV for integer abs.
31144 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
31145 EVT VT = N->getValueType(0);
31147 // Since X86 does not have CMOV for 8-bit integer, we don't convert
31148 // 8-bit integer abs to NEG and CMOV.
31149 if (VT.isInteger() && VT.getSizeInBits() == 8)
31150 return SDValue();
31152 SDValue N0 = N->getOperand(0);
31153 SDValue N1 = N->getOperand(1);
31155 SDLoc DL(N);
31156 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
31157 // and change it to SUB and CMOV.
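// The incoming pattern is the usual branchless abs expansion, e.g. for i32:
//   Y = X >>s 31; abs = (X + Y) ^ Y
// Emitting the flag-setting SUB plus CMOV trades that shift/add/xor chain for
// a conditional move on the sign of X.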
31158 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
31159 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
31160 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
31161 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
31162 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
31163 // Generate SUB & CMOV.
31164 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
31165 DAG.getConstant(0, DL, VT), N0.getOperand(0));
31166 SDValue Ops[] = {N0.getOperand(0), Neg,
31167 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
31168 SDValue(Neg.getNode(), 1)};
31169 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
31175 /// Try to turn tests against the signbit in the form of:
31176 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
31177 /// into:
31178 /// SETGT(X, -1)
31179 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
31180 // This is only worth doing if the output type is i8 or i1.
31181 EVT ResultType = N->getValueType(0);
31182 if (ResultType != MVT::i8 && ResultType != MVT::i1)
31185 SDValue N0 = N->getOperand(0);
31186 SDValue N1 = N->getOperand(1);
31188 // We should be performing an xor against a truncated shift.
31189 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
31192 // Make sure we are performing an xor against one.
31193 if (!isOneConstant(N1))
31196 // SetCC on x86 zero extends so only act on this if it's a logical shift.
31197 SDValue Shift = N0.getOperand(0);
31198 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
31201 // Make sure we are truncating from one of i16, i32 or i64.
31202 EVT ShiftTy = Shift.getValueType();
31203 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
31206 // Make sure the shift amount extracts the sign bit.
31207 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
31208 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
31211 // Create a greater-than comparison against -1.
31212 // N.B. Using SETGE against 0 works but we want a canonical looking
31213 // comparison, and using SETGT matches up with what TranslateX86CC produces.
31214 SDLoc DL(N);
31215 SDValue ShiftOp = Shift.getOperand(0);
31216 EVT ShiftOpTy = ShiftOp.getValueType();
31217 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31218 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
31219 *DAG.getContext(), ResultType);
31220 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
31221 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
31222 if (SetCCResultType != ResultType)
31223 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
31224 return Cond;
31225 }
31227 /// Turn vector tests of the signbit in the form of:
31228 /// xor (sra X, elt_size(X)-1), -1
31229 /// into:
31230 /// pcmpgt X, -1
31232 /// This should be called before type legalization because the pattern may not
31233 /// persist after that.
31234 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
31235 const X86Subtarget &Subtarget) {
31236 EVT VT = N->getValueType(0);
31237 if (!VT.isSimple())
31240 switch (VT.getSimpleVT().SimpleTy) {
31241 default: return SDValue();
31242 case MVT::v16i8:
31243 case MVT::v8i16:
31244 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
31245 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
31246 case MVT::v32i8:
31247 case MVT::v16i16:
31248 case MVT::v8i32:
31249 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
31250 }
31252 // There must be a shift right algebraic before the xor, and the xor must be a
31253 // 'not' operation.
31254 SDValue Shift = N->getOperand(0);
31255 SDValue Ones = N->getOperand(1);
31256 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
31257 !ISD::isBuildVectorAllOnes(Ones.getNode()))
31260 // The shift should be smearing the sign bit across each vector element.
31261 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
31262 if (!ShiftBV)
31263 return SDValue();
31265 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
31266 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
31267 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
31270 // Create a greater-than comparison against -1. We don't use the more obvious
31271 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
31272 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
31275 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
31276 /// is valid for the given \p Subtarget.
31277 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
31278 const X86Subtarget &Subtarget) {
31279 if (!Subtarget.hasAVX512())
31282 // FIXME: Scalar type may be supported if we move it to vector register.
31283 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
31286 EVT SrcElVT = SrcVT.getScalarType();
31287 EVT DstElVT = DstVT.getScalarType();
31288 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
31290 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
31292 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
31293 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
31297 /// Return true if VPACK* instruction can be used for the given types
31298 /// and it is available on \p Subtarget.
31300 isSATValidOnSSESubtarget(EVT SrcVT, EVT DstVT, const X86Subtarget &Subtarget) {
31301 if (Subtarget.hasSSE2())
31303 if (SrcVT == MVT::v16i16 && DstVT == MVT::v16i8)
31305 if (Subtarget.hasSSE41())
31307 if (SrcVT == MVT::v8i32 && DstVT == MVT::v8i16)
31312 /// Detect a pattern of truncation with saturation:
31313 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
31314 /// Return the source value to be truncated or SDValue() if the pattern was not
31315 /// matched.
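/// For example, when truncating to i8 elements the input must look like
///   (umin X, <255, 255, ...>),
/// in which case X is returned as the value to truncate with unsigned
/// saturation.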
31316 static SDValue detectUSatPattern(SDValue In, EVT VT) {
31317 if (In.getOpcode() != ISD::UMIN)
31318 return SDValue();
31320 // Saturation with truncation. We truncate from InVT to VT.
31321 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
31322 "Unexpected types for truncate operation");
31324 APInt C;
31325 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
31326 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
31327 // the element size of the destination type.
31328 return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
31329 SDValue();
31330 }
31331 return SDValue();
31332 }
31334 /// Detect a pattern of truncation with saturation:
31335 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
31336 /// The types should allow to use VPMOVUS* instruction on AVX512.
31337 /// Return the source value to be truncated or SDValue() if the pattern was not
31338 /// matched.
31339 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
31340 const X86Subtarget &Subtarget) {
31341 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
31342 return SDValue();
31343 return detectUSatPattern(In, VT);
31347 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
31348 const X86Subtarget &Subtarget) {
31349 SDValue USatVal = detectUSatPattern(In, VT);
31350 if (USatVal) {
31351 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
31352 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
31353 if (isSATValidOnSSESubtarget(In.getValueType(), VT, Subtarget)) {
31354 SDValue Lo, Hi;
31355 std::tie(Lo, Hi) = DAG.SplitVector(USatVal, DL);
31356 return DAG.getNode(X86ISD::PACKUS, DL, VT, Lo, Hi);
31357 }
31358 }
31359 return SDValue();
31360 }
31362 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
31363 /// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
31364 /// X86ISD::AVG instruction.
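/// X86ISD::AVG maps to PAVGB/PAVGW, which compute the rounding-up average
/// ((a + b + 1) >> 1) without needing the intermediate widened type.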
31365 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
31366 const X86Subtarget &Subtarget,
31367 const SDLoc &DL) {
31368 if (!VT.isVector() || !VT.isSimple())
31369 return SDValue();
31370 EVT InVT = In.getValueType();
31371 unsigned NumElems = VT.getVectorNumElements();
31373 EVT ScalarVT = VT.getVectorElementType();
31374 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
31375 isPowerOf2_32(NumElems)))
31378 // InScalarVT is the intermediate type in AVG pattern and it should be greater
31379 // than the original input type (i8/i16).
31380 EVT InScalarVT = InVT.getVectorElementType();
31381 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
31384 if (!Subtarget.hasSSE2())
31385 return SDValue();
31386 if (Subtarget.hasBWI()) {
31387 if (VT.getSizeInBits() > 512)
31388 return SDValue();
31389 } else if (Subtarget.hasAVX2()) {
31390 if (VT.getSizeInBits() > 256)
31391 return SDValue();
31392 } else {
31393 if (VT.getSizeInBits() > 128)
31394 return SDValue();
31395 }
31397 // Detect the following pattern:
31399 // %1 = zext <N x i8> %a to <N x i32>
31400 // %2 = zext <N x i8> %b to <N x i32>
31401 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
31402 // %4 = add nuw nsw <N x i32> %3, %2
31403 // %5 = lshr <N x i32> %4, <i32 1 x N>
31404 // %6 = trunc <N x i32> %5 to <N x i8>
31406 // In AVX512, the last instruction can also be a trunc store.
31408 if (In.getOpcode() != ISD::SRL)
31411 // A lambda checking the given SDValue is a constant vector and each element
31412 // is in the range [Min, Max].
31413 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
31414 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
31415 if (!BV || !BV->isConstant())
31417 for (unsigned i = 0, e = V.getNumOperands(); i < e; i++) {
31418 ConstantSDNode *C = dyn_cast<ConstantSDNode>(V.getOperand(i));
31421 uint64_t Val = C->getZExtValue();
31422 if (Val < Min || Val > Max)
31428 // Check if each element of the vector is left-shifted by one.
31429 auto LHS = In.getOperand(0);
31430 auto RHS = In.getOperand(1);
31431 if (!IsConstVectorInRange(RHS, 1, 1))
31433 if (LHS.getOpcode() != ISD::ADD)
31436 // Detect a pattern of a + b + 1 where the order doesn't matter.
31437 SDValue Operands[3];
31438 Operands[0] = LHS.getOperand(0);
31439 Operands[1] = LHS.getOperand(1);
31441 // Take care of the case when one of the operands is a constant vector whose
31442 // element is in the range [1, 256].
31443 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
31444 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
31445 Operands[0].getOperand(0).getValueType() == VT) {
31446 // The pattern is detected. Subtract one from the constant vector, then
31447 // demote it and emit X86ISD::AVG instruction.
31448 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
31449 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
31450 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
31451 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31455 if (Operands[0].getOpcode() == ISD::ADD)
31456 std::swap(Operands[0], Operands[1]);
31457 else if (Operands[1].getOpcode() != ISD::ADD)
31459 Operands[2] = Operands[1].getOperand(0);
31460 Operands[1] = Operands[1].getOperand(1);
31462 // Now we have three operands of two additions. Check that one of them is a
31463 // constant vector with ones, and the other two are promoted from i8/i16.
31464 for (int i = 0; i < 3; ++i) {
31465 if (!IsConstVectorInRange(Operands[i], 1, 1))
31467 std::swap(Operands[i], Operands[2]);
31469 // Check if Operands[0] and Operands[1] are results of type promotion.
31470 for (int j = 0; j < 2; ++j)
31471 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
31472 Operands[j].getOperand(0).getValueType() != VT)
31475 // The pattern is detected, emit X86ISD::AVG instruction.
31476 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
31477 Operands[1].getOperand(0));
31483 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
31484 TargetLowering::DAGCombinerInfo &DCI,
31485 const X86Subtarget &Subtarget) {
31486 LoadSDNode *Ld = cast<LoadSDNode>(N);
31487 EVT RegVT = Ld->getValueType(0);
31488 EVT MemVT = Ld->getMemoryVT();
31490 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31492 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
31493 // into two 16-byte operations.
31494 ISD::LoadExtType Ext = Ld->getExtensionType();
31495 bool Fast;
31496 unsigned AddressSpace = Ld->getAddressSpace();
31497 unsigned Alignment = Ld->getAlignment();
31498 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
31499 Ext == ISD::NON_EXTLOAD &&
31500 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
31501 AddressSpace, Alignment, &Fast) && !Fast) {
31502 unsigned NumElems = RegVT.getVectorNumElements();
31506 SDValue Ptr = Ld->getBasePtr();
31508 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
31509 NumElems/2);
31510 SDValue Load1 =
31511 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31512 Alignment, Ld->getMemOperand()->getFlags());
31514 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
31515 SDValue Load2 =
31516 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
31517 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
31518 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
31520 Load2.getValue(1));
31522 SDValue NewVec = DAG.getUNDEF(RegVT);
31523 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
31524 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
31525 return DCI.CombineTo(N, NewVec, TF, true);
31531 /// If V is a build vector of boolean constants and exactly one of those
31532 /// constants is true, return the operand index of that true element.
31533 /// Otherwise, return -1.
31534 static int getOneTrueElt(SDValue V) {
31535 // This needs to be a build vector of booleans.
31536 // TODO: Checking for the i1 type matches the IR definition for the mask,
31537 // but the mask check could be loosened to i8 or other types. That might
31538 // also require checking more than 'allOnesValue'; eg, the x86 HW
31539 // instructions only require that the MSB is set for each mask element.
31540 // The ISD::MSTORE comments/definition do not specify how the mask operand
31542 auto *BV = dyn_cast<BuildVectorSDNode>(V);
31543 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
31546 int TrueIndex = -1;
31547 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
31548 for (unsigned i = 0; i < NumElts; ++i) {
31549 const SDValue &Op = BV->getOperand(i);
31552 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
31555 if (ConstNode->getAPIntValue().isAllOnesValue()) {
31556 // If we already found a one, this is too many.
31557 if (TrueIndex >= 0)
31565 /// Given a masked memory load/store operation, return true if it has one mask
31566 /// bit set. If it has one mask bit set, then also return the memory address of
31567 /// the scalar element to load/store, the vector index to insert/extract that
31568 /// scalar element, and the alignment for the scalar memory access.
31569 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
31570 SelectionDAG &DAG, SDValue &Addr,
31571 SDValue &Index, unsigned &Alignment) {
31572 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
31573 if (TrueMaskElt < 0)
31576 // Get the address of the one scalar element that is specified by the mask
31577 // using the appropriate offset from the base pointer.
31578 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
31579 Addr = MaskedOp->getBasePtr();
31580 if (TrueMaskElt != 0) {
31581 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
31582 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
31585 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
31586 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
31590 /// If exactly one element of the mask is set for a non-extending masked load,
31591 /// it is a scalar load and vector insert.
31592 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31593 /// mask have already been optimized in IR, so we don't bother with those here.
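/// For example, a masked load of <4 x float> with mask <0,0,1,0> becomes a
/// scalar load of element 2 at BasePtr + 2 * sizeof(float), inserted into the
/// pass-through vector at index 2.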
31594 static SDValue
31595 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31596 TargetLowering::DAGCombinerInfo &DCI) {
31597 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31598 // However, some target hooks may need to be added to know when the transform
31599 // is profitable. Endianness would also have to be considered.
31601 SDValue Addr, VecIndex;
31602 unsigned Alignment;
31603 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
31606 // Load the one scalar element that is specified by the mask using the
31607 // appropriate offset from the base pointer.
31608 SDLoc DL(ML);
31609 EVT VT = ML->getValueType(0);
31610 EVT EltVT = VT.getVectorElementType();
31611 SDValue Load =
31612 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
31613 Alignment, ML->getMemOperand()->getFlags());
31615 // Insert the loaded element into the appropriate place in the vector.
31616 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
31617 Load, VecIndex);
31618 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
31622 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
31623 TargetLowering::DAGCombinerInfo &DCI) {
31624 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
31628 EVT VT = ML->getValueType(0);
31630 // If we are loading the first and last elements of a vector, it is safe and
31631 // always faster to load the whole vector. Replace the masked load with a
31632 // vector load and select.
31633 unsigned NumElts = VT.getVectorNumElements();
31634 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
31635 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
31636 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
31637 if (LoadFirstElt && LoadLastElt) {
31638 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31639 ML->getMemOperand());
31640 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
31641 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
31644 // Convert a masked load with a constant mask into a masked load and a select.
31645 // This allows the select operation to use a faster kind of select instruction
31646 // (for example, vblendvps -> vblendps).
31648 // Don't try this if the pass-through operand is already undefined. That would
31649 // cause an infinite loop because that's what we're about to create.
31650 if (ML->getSrc0().isUndef())
31653 // The new masked load has an undef pass-through operand. The select uses the
31654 // original pass-through operand.
31655 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
31656 ML->getMask(), DAG.getUNDEF(VT),
31657 ML->getMemoryVT(), ML->getMemOperand(),
31658 ML->getExtensionType());
31659 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
31661 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
31664 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
31665 TargetLowering::DAGCombinerInfo &DCI,
31666 const X86Subtarget &Subtarget) {
31667 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
31669 // TODO: Expanding load with constant mask may be optimized as well.
31670 if (Mld->isExpandingLoad())
31673 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
31674 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
31676 // TODO: Do some AVX512 subsets benefit from this transform?
31677 if (!Subtarget.hasAVX512())
31678 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
31682 if (Mld->getExtensionType() != ISD::SEXTLOAD)
31685 // Resolve extending loads.
31686 EVT VT = Mld->getValueType(0);
31687 unsigned NumElems = VT.getVectorNumElements();
31688 EVT LdVT = Mld->getMemoryVT();
31691 assert(LdVT != VT && "Cannot extend to the same type");
31692 unsigned ToSz = VT.getScalarSizeInBits();
31693 unsigned FromSz = LdVT.getScalarSizeInBits();
31694 // From/To sizes and ElemCount must be pow of two.
31695 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31696 "Unexpected size for extending masked load");
31698 unsigned SizeRatio = ToSz / FromSz;
31699 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
31701 // Create a type on which we perform the shuffle.
31702 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31703 LdVT.getScalarType(), NumElems*SizeRatio);
31704 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31706 // Convert Src0 value.
31707 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
31708 if (!Mld->getSrc0().isUndef()) {
31709 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31710 for (unsigned i = 0; i != NumElems; ++i)
31711 ShuffleVec[i] = i * SizeRatio;
31713 // Can't shuffle using an illegal type.
31714 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31715 "WideVecVT should be legal");
31716 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
31717 DAG.getUNDEF(WideVecVT), ShuffleVec);
31719 // Prepare the new mask.
31721 SDValue Mask = Mld->getMask();
31722 if (Mask.getValueType() == VT) {
31723 // Mask and original value have the same type.
31724 NewMask = DAG.getBitcast(WideVecVT, Mask);
31725 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31726 for (unsigned i = 0; i != NumElems; ++i)
31727 ShuffleVec[i] = i * SizeRatio;
31728 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
31729 ShuffleVec[i] = NumElems * SizeRatio;
31730 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31731 DAG.getConstant(0, dl, WideVecVT),
31734 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31735 unsigned WidenNumElts = NumElems*SizeRatio;
31736 unsigned MaskNumElts = VT.getVectorNumElements();
31737 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31740 unsigned NumConcat = WidenNumElts / MaskNumElts;
31741 SmallVector<SDValue, 16> Ops(NumConcat);
31742 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31744 for (unsigned i = 1; i != NumConcat; ++i)
31747 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31750 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
31751 Mld->getBasePtr(), NewMask, WideSrc0,
31752 Mld->getMemoryVT(), Mld->getMemOperand(),
31754 SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
31755 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
31758 /// If exactly one element of the mask is set for a non-truncating masked store,
31759 /// it is a vector extract and scalar store.
31760 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
31761 /// mask have already been optimized in IR, so we don't bother with those here.
31762 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
31763 SelectionDAG &DAG) {
31764 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
31765 // However, some target hooks may need to be added to know when the transform
31766 // is profitable. Endianness would also have to be considered.
31768 SDValue Addr, VecIndex;
31769 unsigned Alignment;
31770 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
31773 // Extract the one scalar element that is actually being stored.
31775 EVT VT = MS->getValue().getValueType();
31776 EVT EltVT = VT.getVectorElementType();
31777 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
31778 MS->getValue(), VecIndex);
31780 // Store that element at the appropriate offset from the base pointer.
31781 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
31782 Alignment, MS->getMemOperand()->getFlags());
31785 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
31786 const X86Subtarget &Subtarget) {
31787 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
31789 if (Mst->isCompressingStore())
31792 if (!Mst->isTruncatingStore())
31793 return reduceMaskedStoreToScalarStore(Mst, DAG);
31795 // Resolve truncating stores.
31796 EVT VT = Mst->getValue().getValueType();
31797 unsigned NumElems = VT.getVectorNumElements();
31798 EVT StVT = Mst->getMemoryVT();
31801 assert(StVT != VT && "Cannot truncate to the same type");
31802 unsigned FromSz = VT.getScalarSizeInBits();
31803 unsigned ToSz = StVT.getScalarSizeInBits();
31805 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31807 // The truncating store is legal in some cases. For example
31808 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31809 // are designated for truncate store.
31810 // In this case we don't need any further transformations.
31811 if (TLI.isTruncStoreLegal(VT, StVT))
31814 // From/To sizes and ElemCount must be pow of two.
31815 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
31816 "Unexpected size for truncating masked store");
31817 // We are going to use the original vector elt for storing.
31818 // Accumulated smaller vector elements must be a multiple of the store size.
31819 assert (((NumElems * FromSz) % ToSz) == 0 &&
31820 "Unexpected ratio for truncating masked store");
31822 unsigned SizeRatio = FromSz / ToSz;
31823 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31825 // Create a type on which we perform the shuffle.
31826 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31827 StVT.getScalarType(), NumElems*SizeRatio);
31829 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31831 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
31832 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
31833 for (unsigned i = 0; i != NumElems; ++i)
31834 ShuffleVec[i] = i * SizeRatio;
31836 // Can't shuffle using an illegal type.
31837 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
31838 "WideVecVT should be legal");
31840 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31841 DAG.getUNDEF(WideVecVT),
31845 SDValue Mask = Mst->getMask();
31846 if (Mask.getValueType() == VT) {
31847 // Mask and original value have the same type.
31848 NewMask = DAG.getBitcast(WideVecVT, Mask);
31849 for (unsigned i = 0; i != NumElems; ++i)
31850 ShuffleVec[i] = i * SizeRatio;
31851 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
31852 ShuffleVec[i] = NumElems*SizeRatio;
31853 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
31854 DAG.getConstant(0, dl, WideVecVT),
31857 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
31858 unsigned WidenNumElts = NumElems*SizeRatio;
31859 unsigned MaskNumElts = VT.getVectorNumElements();
31860 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
31863 unsigned NumConcat = WidenNumElts / MaskNumElts;
31864 SmallVector<SDValue, 16> Ops(NumConcat);
31865 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
31867 for (unsigned i = 1; i != NumConcat; ++i)
31870 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
31873 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
31874 Mst->getBasePtr(), NewMask, StVT,
31875 Mst->getMemOperand(), false);
31878 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
31879 const X86Subtarget &Subtarget) {
31880 StoreSDNode *St = cast<StoreSDNode>(N);
31881 EVT VT = St->getValue().getValueType();
31882 EVT StVT = St->getMemoryVT();
31884 SDValue StoredVal = St->getOperand(1);
31885 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31887 // If we are saving a concatenation of two XMM registers and 32-byte stores
31888 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
31889 bool Fast;
31890 unsigned AddressSpace = St->getAddressSpace();
31891 unsigned Alignment = St->getAlignment();
31892 if (VT.is256BitVector() && StVT == VT &&
31893 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
31894 AddressSpace, Alignment, &Fast) &&
31896 unsigned NumElems = VT.getVectorNumElements();
31900 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
31901 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
31903 SDValue Ptr0 = St->getBasePtr();
31904 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
31906 SDValue Ch0 =
31907 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
31908 Alignment, St->getMemOperand()->getFlags());
31909 SDValue Ch1 =
31910 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
31911 std::min(16U, Alignment), St->getMemOperand()->getFlags());
31912 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
31915 // Optimize trunc store (of multiple scalars) to shuffle and store.
31916 // First, pack all of the elements in one place. Next, store to memory
31917 // in fewer chunks.
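// For example, a truncating store of <8 x i32> to <8 x i16> can be rewritten
// as a shuffle that packs the eight i16 values into the low 128 bits of the
// register, which are then written out in a small number of wider integer
// stores.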
31918 if (St->isTruncatingStore() && VT.isVector()) {
31919 // Check if we can detect an AVG pattern from the truncation. If yes,
31920 // replace the trunc store by a normal store with the result of X86ISD::AVG
31922 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
31923 Subtarget, dl))
31924 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
31925 St->getPointerInfo(), St->getAlignment(),
31926 St->getMemOperand()->getFlags());
31928 if (SDValue Val =
31929 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
31930 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
31931 dl, Val, St->getBasePtr(),
31932 St->getMemoryVT(), St->getMemOperand(), DAG);
31934 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
31935 unsigned NumElems = VT.getVectorNumElements();
31936 assert(StVT != VT && "Cannot truncate to the same type");
31937 unsigned FromSz = VT.getScalarSizeInBits();
31938 unsigned ToSz = StVT.getScalarSizeInBits();
31940 // The truncating store is legal in some cases. For example
31941 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
31942 // are designated for truncate store.
31943 // In this case we don't need any further transformations.
31944 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
31947 // From, To sizes and ElemCount must be pow of two
31948 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
31949 // We are going to use the original vector elt for storing.
31950 // Accumulated smaller vector elements must be a multiple of the store size.
31951 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
31953 unsigned SizeRatio = FromSz / ToSz;
31955 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
31957 // Create a type on which we perform the shuffle
31958 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
31959 StVT.getScalarType(), NumElems*SizeRatio);
31961 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
31963 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
31964 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
31965 for (unsigned i = 0; i != NumElems; ++i)
31966 ShuffleVec[i] = i * SizeRatio;
31968 // Can't shuffle using an illegal type.
31969 if (!TLI.isTypeLegal(WideVecVT))
31972 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
31973 DAG.getUNDEF(WideVecVT),
31975 // At this point all of the data is stored at the bottom of the
31976 // register. We now need to save it to mem.
31978 // Find the largest store unit
31979 MVT StoreType = MVT::i8;
31980 for (MVT Tp : MVT::integer_valuetypes()) {
31981 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
31985 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
31986 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
31987 (64 <= NumElems * ToSz))
31988 StoreType = MVT::f64;
31990 // Bitcast the original vector into a vector of store-size units
31991 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
31992 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
31993 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
31994 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
31995 SmallVector<SDValue, 8> Chains;
31996 SDValue Ptr = St->getBasePtr();
31998 // Perform one or more big stores into memory.
31999 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
32000 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
32001 StoreType, ShuffWide,
32002 DAG.getIntPtrConstant(i, dl));
32004 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
32005 St->getAlignment(), St->getMemOperand()->getFlags());
32006 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
32007 Chains.push_back(Ch);
32010 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
32013 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
32014 // the FP state in cases where an emms may be missing.
32015 // A preferable solution to the general problem is to figure out the right
32016 // places to insert EMMS. This qualifies as a quick hack.
32018 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
32019 if (VT.getSizeInBits() != 64)
32022 const Function *F = DAG.getMachineFunction().getFunction();
32023 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
32024 bool F64IsLegal =
32025 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
32026 if ((VT.isVector() ||
32027 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
32028 isa<LoadSDNode>(St->getValue()) &&
32029 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
32030 St->getChain().hasOneUse() && !St->isVolatile()) {
32031 SDNode* LdVal = St->getValue().getNode();
32032 LoadSDNode *Ld = nullptr;
32033 int TokenFactorIndex = -1;
32034 SmallVector<SDValue, 8> Ops;
32035 SDNode* ChainVal = St->getChain().getNode();
32036 // Must be a store of a load. We currently handle two cases: the load
32037 // is a direct child, and it's under an intervening TokenFactor. It is
32038 // possible to dig deeper under nested TokenFactors.
32039 if (ChainVal == LdVal)
32040 Ld = cast<LoadSDNode>(St->getChain());
32041 else if (St->getValue().hasOneUse() &&
32042 ChainVal->getOpcode() == ISD::TokenFactor) {
32043 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
32044 if (ChainVal->getOperand(i).getNode() == LdVal) {
32045 TokenFactorIndex = i;
32046 Ld = cast<LoadSDNode>(St->getValue());
32048 Ops.push_back(ChainVal->getOperand(i));
32052 if (!Ld || !ISD::isNormalLoad(Ld))
32055 // If this is not the MMX case, i.e. we are just turning i64 load/store
32056 // into f64 load/store, avoid the transformation if there are multiple
32057 // uses of the loaded value.
32058 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
32059 return SDValue();
32061 SDLoc LdDL(Ld), StDL(N);
32063 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
32064 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
32066 if (Subtarget.is64Bit() || F64IsLegal) {
32067 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
32068 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
32069 Ld->getPointerInfo(), Ld->getAlignment(),
32070 Ld->getMemOperand()->getFlags());
32071 SDValue NewChain = NewLd.getValue(1);
32072 if (TokenFactorIndex >= 0) {
32073 Ops.push_back(NewChain);
32074 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32076 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
32077 St->getPointerInfo(), St->getAlignment(),
32078 St->getMemOperand()->getFlags());
32081 // Otherwise, lower to two pairs of 32-bit loads / stores.
32082 SDValue LoAddr = Ld->getBasePtr();
32083 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
32085 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
32086 Ld->getPointerInfo(), Ld->getAlignment(),
32087 Ld->getMemOperand()->getFlags());
32088 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
32089 Ld->getPointerInfo().getWithOffset(4),
32090 MinAlign(Ld->getAlignment(), 4),
32091 Ld->getMemOperand()->getFlags());
32093 SDValue NewChain = LoLd.getValue(1);
32094 if (TokenFactorIndex >= 0) {
32095 Ops.push_back(LoLd);
32096 Ops.push_back(HiLd);
32097 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
32100 LoAddr = St->getBasePtr();
32101 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
32103 SDValue LoSt =
32104 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
32105 St->getAlignment(), St->getMemOperand()->getFlags());
32106 SDValue HiSt = DAG.getStore(
32107 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
32108 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
32109 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
32112 // This is similar to the above case, but here we handle a scalar 64-bit
32113 // integer store that is extracted from a vector on a 32-bit target.
32114 // If we have SSE2, then we can treat it like a floating-point double
32115 // to get past legalization. The execution dependencies fixup pass will
32116 // choose the optimal machine instruction for the store if this really is
32117 // an integer or v2f32 rather than an f64.
32118 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
32119 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
32120 SDValue OldExtract = St->getOperand(1);
32121 SDValue ExtOp0 = OldExtract.getOperand(0);
32122 unsigned VecSize = ExtOp0.getValueSizeInBits();
32123 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
32124 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
32125 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
32126 BitCast, OldExtract.getOperand(1));
32127 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
32128 St->getPointerInfo(), St->getAlignment(),
32129 St->getMemOperand()->getFlags());
32135 /// Return 'true' if this vector operation is "horizontal"
32136 /// and return the operands for the horizontal operation in LHS and RHS. A
32137 /// horizontal operation performs the binary operation on successive elements
32138 /// of its first operand, then on successive elements of its second operand,
32139 /// returning the resulting values in a vector. For example, if
32140 /// A = < float a0, float a1, float a2, float a3 >
32142 /// B = < float b0, float b1, float b2, float b3 >
32143 /// then the result of doing a horizontal operation on A and B is
32144 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
32145 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
32146 /// A horizontal-op B, for some already available A and B, and if so then LHS is
32147 /// set to A, RHS to B, and the routine returns 'true'.
32148 /// Note that the binary operation should have the property that if one of the
32149 /// operands is UNDEF then the result is UNDEF.
32150 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
32151 // Look for the following pattern: if
32152 // A = < float a0, float a1, float a2, float a3 >
32153 // B = < float b0, float b1, float b2, float b3 >
32155 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
32156 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
32157 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
32158 // which is A horizontal-op B.
32160 // At least one of the operands should be a vector shuffle.
32161 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
32162 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
32165 MVT VT = LHS.getSimpleValueType();
32167 assert((VT.is128BitVector() || VT.is256BitVector()) &&
32168 "Unsupported vector type for horizontal add/sub");
32170 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
32171 // operate independently on 128-bit lanes.
32172 unsigned NumElts = VT.getVectorNumElements();
32173 unsigned NumLanes = VT.getSizeInBits()/128;
32174 unsigned NumLaneElts = NumElts / NumLanes;
32175 assert((NumLaneElts % 2 == 0) &&
32176 "Vector type should have an even number of elements in each lane");
32177 unsigned HalfLaneElts = NumLaneElts/2;
32179 // View LHS in the form
32180 // LHS = VECTOR_SHUFFLE A, B, LMask
32181 // If LHS is not a shuffle then pretend it is the shuffle
32182 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
32183 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
32186 SmallVector<int, 16> LMask(NumElts);
32187 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
32188 if (!LHS.getOperand(0).isUndef())
32189 A = LHS.getOperand(0);
32190 if (!LHS.getOperand(1).isUndef())
32191 B = LHS.getOperand(1);
32192 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
32193 std::copy(Mask.begin(), Mask.end(), LMask.begin());
32195 if (!LHS.isUndef())
32197 for (unsigned i = 0; i != NumElts; ++i)
32201 // Likewise, view RHS in the form
32202 // RHS = VECTOR_SHUFFLE C, D, RMask
32204 SmallVector<int, 16> RMask(NumElts);
32205 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
32206 if (!RHS.getOperand(0).isUndef())
32207 C = RHS.getOperand(0);
32208 if (!RHS.getOperand(1).isUndef())
32209 D = RHS.getOperand(1);
32210 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
32211 std::copy(Mask.begin(), Mask.end(), RMask.begin());
32213 if (!RHS.isUndef())
32215 for (unsigned i = 0; i != NumElts; ++i)
32219 // Check that the shuffles are both shuffling the same vectors.
32220 if (!(A == C && B == D) && !(A == D && B == C))
32223 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
32224 if (!A.getNode() && !B.getNode())
32227 // If A and B occur in reverse order in RHS, then "swap" them (which means
32228 // rewriting the mask).
32230 ShuffleVectorSDNode::commuteMask(RMask);
32232 // At this point LHS and RHS are equivalent to
32233 // LHS = VECTOR_SHUFFLE A, B, LMask
32234 // RHS = VECTOR_SHUFFLE A, B, RMask
32235 // Check that the masks correspond to performing a horizontal operation.
32236 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
32237 for (unsigned i = 0; i != NumLaneElts; ++i) {
32238 int LIdx = LMask[i+l], RIdx = RMask[i+l];
32240 // Ignore any UNDEF components.
32241 if (LIdx < 0 || RIdx < 0 ||
32242 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
32243 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
32246 // Check that successive elements are being operated on. If not, this is
32247 // not a horizontal operation.
32248 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
32249 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
32250 if (!(LIdx == Index && RIdx == Index + 1) &&
32251 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
32252 return false;
32253 }
32254 }
32256 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
32257 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
32258 return true;
32259 }
32261 /// Do target-specific dag combines on floating-point adds/subs.
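/// For example, (v4f32) fadd (shuffle A, B, <0,2,4,6>), (shuffle A, B, <1,3,5,7>)
/// is matched into (X86ISD::FHADD A, B), i.e. a single HADDPS.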
32262 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
32263 const X86Subtarget &Subtarget) {
32264 EVT VT = N->getValueType(0);
32265 SDValue LHS = N->getOperand(0);
32266 SDValue RHS = N->getOperand(1);
32267 bool IsFadd = N->getOpcode() == ISD::FADD;
32268 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
32270 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
32271 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
32272 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
32273 isHorizontalBinOp(LHS, RHS, IsFadd)) {
32274 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
32275 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
32276 }
32278 return SDValue();
32279 }
32280 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
32281 /// the codegen.
32282 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
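/// For instance, (v8i16 (trunc (v8i32 or X, C))) with a build-vector constant C
/// becomes (v8i16 or (trunc X), (trunc C)): the truncated constant folds, so only
/// one real truncation is paid for.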
32283 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
32284 const X86Subtarget &Subtarget,
32285 SDLoc &DL) {
32286 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
32287 SDValue Src = N->getOperand(0);
32288 unsigned Opcode = Src.getOpcode();
32289 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32291 EVT VT = N->getValueType(0);
32292 EVT SrcVT = Src.getValueType();
32294 auto IsRepeatedOpOrOneUseConstant = [](SDValue Op0, SDValue Op1) {
32295 // TODO: Add extra cases where we can truncate both inputs for the
32296 // cost of one (or none).
32297 // e.g. TRUNC( BINOP( EXT( X ), EXT( Y ) ) ) --> BINOP( X, Y )
32298 if (Op0 == Op1)
32299 return true;
32301 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
32302 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
32303 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
32304 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
32305 };
32307 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
32308 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
32309 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
32310 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
32311 };
32313 // Don't combine if the operation has other uses.
32314 if (!N->isOnlyUserOf(Src.getNode()))
32315 return SDValue();
32317 // Only support vector truncation for now.
32318 // TODO: i64 scalar math would benefit as well.
32319 if (!VT.isVector())
32320 return SDValue();
32322 // In most cases its only worth pre-truncating if we're only facing the cost
32323 // of one truncation.
32324 // i.e. if one of the inputs will constant fold or the input is repeated.
32325 switch (Opcode) {
32326 case ISD::AND:
32327 case ISD::XOR:
32328 case ISD::OR: {
32329 SDValue Op0 = Src.getOperand(0);
32330 SDValue Op1 = Src.getOperand(1);
32331 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
32332 IsRepeatedOpOrOneUseConstant(Op0, Op1))
32333 return TruncateArithmetic(Op0, Op1);
32334 break;
32335 }
32337 case ISD::MUL:
32338 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - its
32339 // better to truncate if we have the chance.
32340 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
32341 !TLI.isOperationLegal(Opcode, SrcVT))
32342 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
32343 LLVM_FALLTHROUGH;
32344 case ISD::ADD: {
32345 SDValue Op0 = Src.getOperand(0);
32346 SDValue Op1 = Src.getOperand(1);
32347 if (TLI.isOperationLegal(Opcode, VT) &&
32348 IsRepeatedOpOrOneUseConstant(Op0, Op1))
32349 return TruncateArithmetic(Op0, Op1);
32350 break;
32351 }
32352 }
32354 return SDValue();
32355 }
32357 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
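/// The input registers are first masked down to the output element width, then
/// packed pairwise (v4i32 -> v8i16, and on to v16i8 for i8 results) until a
/// single result vector (or a concatenation of them) remains.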
32358 static SDValue
32359 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
32360 SmallVector<SDValue, 8> &Regs) {
32361 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
32362 Regs[0].getValueType() == MVT::v2i64));
32363 EVT OutVT = N->getValueType(0);
32364 EVT OutSVT = OutVT.getVectorElementType();
32365 EVT InVT = Regs[0].getValueType();
32366 EVT InSVT = InVT.getVectorElementType();
32367 SDLoc DL(N);
32369 // First, use mask to unset all bits that won't appear in the result.
32370 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
32371 "OutSVT can only be either i8 or i16.");
32372 APInt Mask =
32373 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
32374 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
32375 for (auto &Reg : Regs)
32376 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
32378 MVT UnpackedVT, PackedVT;
32379 if (OutSVT == MVT::i8) {
32380 UnpackedVT = MVT::v8i16;
32381 PackedVT = MVT::v16i8;
32382 } else {
32383 UnpackedVT = MVT::v4i32;
32384 PackedVT = MVT::v8i16;
32385 }
32387 // In each iteration, truncate the type by a half size.
32388 auto RegNum = Regs.size();
32389 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
32390 j < e; j *= 2, RegNum /= 2) {
32391 for (unsigned i = 0; i < RegNum; i++)
32392 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
32393 for (unsigned i = 0; i < RegNum / 2; i++)
32394 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
32395 Regs[i * 2 + 1]);
32396 }
32398 // If the type of the result is v8i8, we need do one more X86ISD::PACKUS, and
32399 // then extract a subvector as the result since v8i8 is not a legal type.
32400 if (OutVT == MVT::v8i8) {
32401 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
32402 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
32403 DAG.getIntPtrConstant(0, DL));
32404 return Regs[0];
32405 } else if (RegNum > 1) {
32406 Regs.resize(RegNum);
32407 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32408 } else
32409 return Regs[0];
32410 }
32412 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
32413 static SDValue
32414 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
32415 SelectionDAG &DAG,
32416 SmallVector<SDValue, 8> &Regs) {
32417 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
32418 EVT OutVT = N->getValueType(0);
32419 SDLoc DL(N);
32421 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
32422 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
32423 for (auto &Reg : Regs) {
32424 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
32425 DAG);
32426 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
32427 DAG);
32428 }
32430 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
32431 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
32432 Regs[i * 2 + 1]);
32434 if (Regs.size() > 2) {
32435 Regs.resize(Regs.size() / 2);
32436 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
32437 } else
32438 return Regs[0];
32439 }
32441 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
32442 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
32443 /// legalization the truncation will be translated into a BUILD_VECTOR with each
32444 /// element that is extracted from a vector and then truncated, and it is
32445 /// difficult to do this optimization based on them.
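/// For example, a v8i32 -> v8i16 truncation is handled by splitting the input
/// into two v4i32 halves and packing them back together with PACKUS/PACKSS.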
32446 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
32447 const X86Subtarget &Subtarget) {
32448 EVT OutVT = N->getValueType(0);
32449 if (!OutVT.isVector())
32450 return SDValue();
32452 SDValue In = N->getOperand(0);
32453 if (!In.getValueType().isSimple())
32454 return SDValue();
32456 EVT InVT = In.getValueType();
32457 unsigned NumElems = OutVT.getVectorNumElements();
32459 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
32460 // SSE2, and we need to take care of it specially.
32461 // AVX512 provides vpmovdb.
32462 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
32463 return SDValue();
32465 EVT OutSVT = OutVT.getVectorElementType();
32466 EVT InSVT = InVT.getVectorElementType();
32467 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
32468 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
32469 NumElems >= 8))
32470 return SDValue();
32472 // SSSE3's pshufb results in fewer instructions in the cases below.
32473 if (Subtarget.hasSSSE3() && NumElems == 8 &&
32474 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
32475 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
32476 return SDValue();
32478 SDLoc DL(N);
32480 // Split a long vector into vectors of legal type.
32481 unsigned RegNum = InVT.getSizeInBits() / 128;
32482 SmallVector<SDValue, 8> SubVec(RegNum);
32483 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
32484 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
32486 for (unsigned i = 0; i < RegNum; i++)
32487 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
32488 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
32490 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
32491 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
32492 // truncate 2 x v4i32 to v8i16.
32493 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
32494 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
32495 else if (InSVT == MVT::i32)
32496 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
32497 else
32498 return SDValue();
32499 }
32501 /// This function transforms vector truncation of 'all or none' bits values.
32502 /// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
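/// For example, a v8i32 value whose lanes are all-ones or all-zeros (a comparison
/// result) can be truncated to v8i16 with PACKSSDW instead of a shuffle sequence.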
32503 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
32504 SelectionDAG &DAG,
32505 const X86Subtarget &Subtarget) {
32506 // Requires SSE2 but AVX512 has fast truncate.
32507 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
32508 return SDValue();
32510 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
32511 return SDValue();
32513 SDValue In = N->getOperand(0);
32514 if (!In.getValueType().isSimple())
32515 return SDValue();
32517 MVT VT = N->getValueType(0).getSimpleVT();
32518 MVT SVT = VT.getScalarType();
32520 MVT InVT = In.getValueType().getSimpleVT();
32521 MVT InSVT = InVT.getScalarType();
32523 // Use PACKSS if the input is a splatted sign bit.
32524 // e.g. Comparison result, sext_in_reg, etc.
32525 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
32526 if (NumSignBits != InSVT.getSizeInBits())
32527 return SDValue();
32529 // Check we have a truncation suited for PACKSS.
32530 if (!VT.is128BitVector() && !VT.is256BitVector())
32531 return SDValue();
32532 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
32533 return SDValue();
32534 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
32535 return SDValue();
32537 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
32538 }
32540 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
32541 const X86Subtarget &Subtarget) {
32542 EVT VT = N->getValueType(0);
32543 SDValue Src = N->getOperand(0);
32544 SDLoc DL(N);
32546 // Attempt to pre-truncate inputs to arithmetic ops instead.
32547 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
32548 return V;
32550 // Try to detect AVG pattern first.
32551 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
32552 return Avg;
32554 // Try to combine truncation with unsigned saturation.
32555 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
32556 return Val;
32558 // The bitcast source is a direct mmx result.
32559 // Detect bitcasts between i32 to x86mmx
32560 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
32561 SDValue BCSrc = Src.getOperand(0);
32562 if (BCSrc.getValueType() == MVT::x86mmx)
32563 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
32564 }
32566 // Try to truncate extended sign bits with PACKSS.
32567 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
32568 return V;
32570 return combineVectorTruncation(N, DAG, Subtarget);
32571 }
32573 /// Returns the negated value if the node \p N flips sign of FP value.
32575 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
32576 /// AVX512F does not have FXOR, so FNEG is lowered as
32577 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
32578 /// In this case we go through all bitcasts.
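/// e.g. (v4f32 (bitcast (xor (bitcast X), <0x80000000 x 4>))) is recognized as
/// a negation of X.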
32579 static SDValue isFNEG(SDNode *N) {
32580 if (N->getOpcode() == ISD::FNEG)
32581 return N->getOperand(0);
32583 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
32584 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
32585 return SDValue();
32587 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
32588 if (!Op1.getValueType().isFloatingPoint())
32589 return SDValue();
32591 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
32593 unsigned EltBits = Op1.getScalarValueSizeInBits();
32594 auto isSignBitValue = [&](const ConstantFP *C) {
32595 return C->getValueAPF().bitcastToAPInt() == APInt::getSignBit(EltBits);
32596 };
32598 // There is more than one way to represent the same constant on
32599 // the different X86 targets. The type of the node may also depend on size.
32600 // - load scalar value and broadcast
32601 // - BUILD_VECTOR node
32602 // - load from a constant pool.
32603 // We check all variants here.
32604 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
32605 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
32606 if (isSignBitValue(cast<ConstantFP>(C)))
32607 return Op0;
32609 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
32610 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
32611 if (isSignBitValue(CN->getConstantFPValue()))
32612 return Op0;
32614 } else if (auto *C = getTargetConstantFromNode(Op1)) {
32615 if (C->getType()->isVectorTy()) {
32616 if (auto *SplatV = C->getSplatValue())
32617 if (isSignBitValue(cast<ConstantFP>(SplatV)))
32618 return Op0;
32619 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
32620 if (isSignBitValue(FPConst))
32621 return Op0;
32622 }
32624 return SDValue();
32625 }
32626 /// Do target-specific dag combines on floating point negations.
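/// e.g. with FMA available, (fneg (fmul a, b)) becomes (X86ISD::FNMSUB a, b, 0.0),
/// and a one-use (fneg (fmadd a, b, c)) becomes (X86ISD::FNMSUB a, b, c).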
32627 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
32628 const X86Subtarget &Subtarget) {
32629 EVT OrigVT = N->getValueType(0);
32630 SDValue Arg = isFNEG(N);
32631 assert(Arg.getNode() && "N is expected to be an FNEG node");
32633 EVT VT = Arg.getValueType();
32634 EVT SVT = VT.getScalarType();
32635 SDLoc DL(N);
32637 // Let legalize expand this if it isn't a legal type yet.
32638 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
32639 return SDValue();
32641 // If we're negating a FMUL node on a target with FMA, then we can avoid the
32642 // use of a constant by performing (-0 - A*B) instead.
32643 // FIXME: Check rounding control flags as well once it becomes available.
32644 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
32645 Arg->getFlags()->hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
32646 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
32647 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
32648 Arg.getOperand(1), Zero);
32649 return DAG.getBitcast(OrigVT, NewNode);
32652 // If we're negating an FMA node, then we can adjust the
32653 // instruction to include the extra negation.
32654 unsigned NewOpcode = 0;
32655 if (Arg.hasOneUse()) {
32656 switch (Arg.getOpcode()) {
32657 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
32658 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
32659 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
32660 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
32661 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
32662 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
32663 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
32664 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
32665 // We can't handle scalar intrinsic node here because it would only
32666 // invert one element and not the whole vector. But we could try to handle
32667 // a negation of the lower element only.
32668 }
32669 }
32670 if (NewOpcode)
32671 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
32672 Arg.getNode()->ops()));
32674 return SDValue();
32675 }
32677 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
32678 const X86Subtarget &Subtarget) {
32679 MVT VT = N->getSimpleValueType(0);
32680 // If we have integer vector types available, use the integer opcodes.
32681 if (VT.isVector() && Subtarget.hasSSE2()) {
32682 SDLoc dl(N);
32684 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
32686 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
32687 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
32688 unsigned IntOpcode;
32689 switch (N->getOpcode()) {
32690 default: llvm_unreachable("Unexpected FP logic op");
32691 case X86ISD::FOR: IntOpcode = ISD::OR; break;
32692 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
32693 case X86ISD::FAND: IntOpcode = ISD::AND; break;
32694 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
32696 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
32697 return DAG.getBitcast(VT, IntOp);
32698 }
32700 return SDValue();
32701 }
32702 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
32703 TargetLowering::DAGCombinerInfo &DCI,
32704 const X86Subtarget &Subtarget) {
32705 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
32706 return Cmp;
32708 if (DCI.isBeforeLegalizeOps())
32709 return SDValue();
32711 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
32712 return RV;
32714 if (Subtarget.hasCMov())
32715 if (SDValue RV = combineIntegerAbs(N, DAG))
32716 return RV;
32718 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32719 return FPLogic;
32721 if (isFNEG(N))
32722 return combineFneg(N, DAG, Subtarget);
32723 return SDValue();
32724 }
32727 static bool isNullFPScalarOrVectorConst(SDValue V) {
32728 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
32729 }
32731 /// If a value is a scalar FP zero or a vector FP zero (potentially including
32732 /// undefined elements), return a zero constant that may be used to fold away
32733 /// that value. In the case of a vector, the returned constant will not contain
32734 /// undefined elements even if the input parameter does. This makes it suitable
32735 /// to be used as a replacement operand with operations (eg, bitwise-and) where
32736 /// an undef should not propagate.
32737 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
32738 const X86Subtarget &Subtarget) {
32739 if (!isNullFPScalarOrVectorConst(V))
32740 return SDValue();
32742 if (V.getValueType().isVector())
32743 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
32745 return V;
32746 }
32748 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
32749 const X86Subtarget &Subtarget) {
32750 SDValue N0 = N->getOperand(0);
32751 SDValue N1 = N->getOperand(1);
32752 EVT VT = N->getValueType(0);
32753 SDLoc DL(N);
32755 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
32756 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
32757 (VT == MVT::f64 && Subtarget.hasSSE2())))
32758 return SDValue();
32760 auto isAllOnesConstantFP = [](SDValue V) {
32761 auto *C = dyn_cast<ConstantFPSDNode>(V);
32762 return C && C->getConstantFPValue()->isAllOnesValue();
32763 };
32765 // fand (fxor X, -1), Y --> fandn X, Y
32766 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
32767 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
32769 // fand X, (fxor Y, -1) --> fandn Y, X
32770 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
32771 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
32773 return SDValue();
32774 }
32776 /// Do target-specific dag combines on X86ISD::FAND nodes.
32777 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
32778 const X86Subtarget &Subtarget) {
32779 // FAND(0.0, x) -> 0.0
32780 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
32781 return V;
32783 // FAND(x, 0.0) -> 0.0
32784 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32785 return V;
32787 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
32788 return V;
32790 return lowerX86FPLogicOp(N, DAG, Subtarget);
32791 }
32793 /// Do target-specific dag combines on X86ISD::FANDN nodes.
32794 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
32795 const X86Subtarget &Subtarget) {
32796 // FANDN(0.0, x) -> x
32797 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32798 return N->getOperand(1);
32800 // FANDN(x, 0.0) -> 0.0
32801 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
32802 return V;
32804 return lowerX86FPLogicOp(N, DAG, Subtarget);
32805 }
32807 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
32808 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
32809 const X86Subtarget &Subtarget) {
32810 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
32812 // F[X]OR(0.0, x) -> x
32813 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
32814 return N->getOperand(1);
32816 // F[X]OR(x, 0.0) -> x
32817 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
32818 return N->getOperand(0);
32820 if (isFNEG(N))
32821 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
32822 return NewVal;
32824 return lowerX86FPLogicOp(N, DAG, Subtarget);
32825 }
32827 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
32828 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
32829 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
32831 // Only perform optimizations if UnsafeMath is used.
32832 if (!DAG.getTarget().Options.UnsafeFPMath)
32833 return SDValue();
32835 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
32836 // into FMINC and FMAXC, which are Commutative operations.
32837 unsigned NewOp = 0;
32838 switch (N->getOpcode()) {
32839 default: llvm_unreachable("unknown opcode");
32840 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
32841 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
32844 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
32845 N->getOperand(0), N->getOperand(1));
32846 }
32848 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
32849 const X86Subtarget &Subtarget) {
32850 if (Subtarget.useSoftFloat())
32851 return SDValue();
32853 // TODO: Check for global or instruction-level "nnan". In that case, we
32854 // should be able to lower to FMAX/FMIN alone.
32855 // TODO: If an operand is already known to be a NaN or not a NaN, this
32856 // should be an optional swap and FMAX/FMIN.
32858 EVT VT = N->getValueType(0);
32859 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
32860 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
32861 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
32862 return SDValue();
32864 // This takes at least 3 instructions, so favor a library call when operating
32865 // on a scalar and minimizing code size.
32866 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
32867 return SDValue();
32869 SDValue Op0 = N->getOperand(0);
32870 SDValue Op1 = N->getOperand(1);
32871 SDLoc DL(N);
32872 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
32873 DAG.getDataLayout(), *DAG.getContext(), VT);
32875 // There are 4 possibilities involving NaN inputs, and these are the required
32876 // outputs:
32877 //                    Op1
32878 //                Num     NaN
32879 //             ----------------
32880 //     Num     |  Max  |  Op0 |
32881 // Op0         ----------------
32882 //     NaN     |  Op1  |  NaN |
32883 //             ----------------
32885 // The SSE FP max/min instructions were not designed for this case, but rather
32886 // to implement:
32887 // Min = Op1 < Op0 ? Op1 : Op0
32888 // Max = Op1 > Op0 ? Op1 : Op0
32890 // So they always return Op0 if either input is a NaN. However, we can still
32891 // use those instructions for fmaxnum by selecting away a NaN input.
32893 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
32894 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
32895 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
32896 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
32898 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
32899 // are NaN, the NaN value of Op1 is the result.
32900 auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
32901 return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax);
32902 }
32904 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
32905 TargetLowering::DAGCombinerInfo &DCI) {
32906 // BT ignores high bits in the bit index operand.
32907 SDValue Op1 = N->getOperand(1);
32908 if (Op1.hasOneUse()) {
32909 unsigned BitWidth = Op1.getValueSizeInBits();
32910 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
32911 APInt KnownZero, KnownOne;
32912 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
32913 !DCI.isBeforeLegalizeOps());
32914 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32915 if (TLO.ShrinkDemandedConstant(Op1, DemandedMask) ||
32916 TLI.SimplifyDemandedBits(Op1, DemandedMask, KnownZero, KnownOne, TLO))
32917 DCI.CommitTargetLoweringOpt(TLO);
32918 }
32919 return SDValue();
32920 }
32922 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
32923 const X86Subtarget &Subtarget) {
32924 EVT VT = N->getValueType(0);
32925 if (!VT.isVector())
32926 return SDValue();
32928 SDValue N0 = N->getOperand(0);
32929 SDValue N1 = N->getOperand(1);
32930 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
32931 SDLoc dl(N);
32933 // The SIGN_EXTEND_INREG to v4i64 is expensive operation on the
32934 // both SSE and AVX2 since there is no sign-extended shift right
32935 // operation on a vector with 64-bit elements.
32936 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
32937 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
32938 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
32939 N0.getOpcode() == ISD::SIGN_EXTEND)) {
32940 SDValue N00 = N0.getOperand(0);
32942 // EXTLOAD has a better solution on AVX2,
32943 // it may be replaced with X86ISD::VSEXT node.
32944 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
32945 if (!ISD::isNormalLoad(N00.getNode()))
32946 return SDValue();
32948 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
32949 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
32950 N00, N1);
32951 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
32952 }
32953 }
32955 return SDValue();
32956 }
32957 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
32958 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
32959 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
32960 /// opportunities to combine math ops, use an LEA, or use a complex addressing
32961 /// mode. This can eliminate extend, add, and shift instructions.
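/// For instance, (i64 (sext (i32 add nsw X, 5))) becomes (i64 add (sext X), 5),
/// and the wider add can then fold into an LEA addressing mode.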
32962 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
32963 const X86Subtarget &Subtarget) {
32964 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
32965 Ext->getOpcode() != ISD::ZERO_EXTEND)
32966 return SDValue();
32968 // TODO: This should be valid for other integer types.
32969 EVT VT = Ext->getValueType(0);
32970 if (VT != MVT::i64)
32971 return SDValue();
32973 SDValue Add = Ext->getOperand(0);
32974 if (Add.getOpcode() != ISD::ADD)
32975 return SDValue();
32977 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
32978 bool NSW = Add->getFlags()->hasNoSignedWrap();
32979 bool NUW = Add->getFlags()->hasNoUnsignedWrap();
32981 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
32982 // into the 'zext'.
32983 if ((Sext && !NSW) || (!Sext && !NUW))
32984 return SDValue();
32986 // Having a constant operand to the 'add' ensures that we are not increasing
32987 // the instruction count because the constant is extended for free below.
32988 // A constant operand can also become the displacement field of an LEA.
32989 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
32990 if (!AddOp1)
32991 return SDValue();
32993 // Don't make the 'add' bigger if there's no hope of combining it with some
32994 // other 'add' or 'shl' instruction.
32995 // TODO: It may be profitable to generate simpler LEA instructions in place
32996 // of single 'add' instructions, but the cost model for selecting an LEA
32997 // currently has a high threshold.
32998 bool HasLEAPotential = false;
32999 for (auto *User : Ext->uses()) {
33000 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
33001 HasLEAPotential = true;
33002 break;
33003 }
33004 }
33005 if (!HasLEAPotential)
33006 return SDValue();
33008 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
33009 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
33010 SDValue AddOp0 = Add.getOperand(0);
33011 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
33012 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
33014 // The wider add is guaranteed to not wrap because both operands are
33015 // sign-extended.
33016 SDNodeFlags Flags;
33017 Flags.setNoSignedWrap(NSW);
33018 Flags.setNoUnsignedWrap(NUW);
33019 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, &Flags);
33020 }
33022 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
33023 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
33024 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
33025 /// extends from AH (which we otherwise need to do contortions to access).
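/// e.g. the remainder of (i8 sdivrem X, Y), sign-extended to i32, becomes the
/// second result of (X86ISD::SDIVREM8_SEXT_HREG X, Y), which is read out of AH
/// already sign-extended.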
33026 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
33027 SDValue N0 = N->getOperand(0);
33028 auto OpcodeN = N->getOpcode();
33029 auto OpcodeN0 = N0.getOpcode();
33030 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
33031 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
33032 return SDValue();
33034 EVT VT = N->getValueType(0);
33035 EVT InVT = N0.getValueType();
33036 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
33037 return SDValue();
33039 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
33040 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
33041 : X86ISD::UDIVREM8_ZEXT_HREG;
33042 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
33043 N0.getOperand(1));
33044 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
33045 return R.getValue(1);
33046 }
33048 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
33049 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
33050 /// with UNDEFs) of the input to vectors of the same size as the target type
33051 /// which then extends the lowest elements.
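/// e.g. on an SSE4.1 target without AVX2, (v4i64 sext (v4i32 X)) is rebuilt as
/// two 128-bit SIGN_EXTEND_VECTOR_INREG nodes (PMOVSXDQ) that are concatenated.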
33052 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
33053 TargetLowering::DAGCombinerInfo &DCI,
33054 const X86Subtarget &Subtarget) {
33055 unsigned Opcode = N->getOpcode();
33056 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
33057 return SDValue();
33058 if (!DCI.isBeforeLegalizeOps())
33059 return SDValue();
33060 if (!Subtarget.hasSSE2())
33061 return SDValue();
33063 SDValue N0 = N->getOperand(0);
33064 EVT VT = N->getValueType(0);
33065 EVT SVT = VT.getScalarType();
33066 EVT InVT = N0.getValueType();
33067 EVT InSVT = InVT.getScalarType();
33069 // Input type must be a vector and we must be extending legal integer types.
33070 if (!VT.isVector())
33071 return SDValue();
33072 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
33073 return SDValue();
33074 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
33075 return SDValue();
33077 // On AVX2+ targets, if the input/output types are both legal then we will be
33078 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
33079 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
33080 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
33081 return SDValue();
33083 SDLoc DL(N);
33085 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
33086 EVT InVT = N.getValueType();
33087 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
33088 Size / InVT.getScalarSizeInBits());
33089 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
33090 DAG.getUNDEF(InVT));
33091 Opnds[0] = N;
33092 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
33093 };
33095 // If target-size is less than 128-bits, extend to a type that would extend
33096 // to 128 bits, extend that and extract the original target vector.
33097 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
33098 unsigned Scale = 128 / VT.getSizeInBits();
33099 EVT ExVT =
33100 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
33101 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
33102 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
33103 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
33104 DAG.getIntPtrConstant(0, DL));
33105 }
33107 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
33108 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
33109 // Also use this if we don't have SSE41 to allow the legalizer do its job.
33110 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
33111 (VT.is256BitVector() && Subtarget.hasInt256()) ||
33112 (VT.is512BitVector() && Subtarget.hasAVX512())) {
33113 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
33114 return Opcode == ISD::SIGN_EXTEND
33115 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
33116 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
33117 }
33119 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
33120 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
33121 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
33122 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
33123 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
33125 SmallVector<SDValue, 8> Opnds;
33126 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
33127 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
33128 DAG.getIntPtrConstant(Offset, DL));
33129 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
33130 SrcVec = Opcode == ISD::SIGN_EXTEND
33131 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
33132 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
33133 Opnds.push_back(SrcVec);
33135 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
33136 };
33138 // On pre-AVX2 targets, split into 128-bit nodes of
33139 // ISD::*_EXTEND_VECTOR_INREG.
33140 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
33141 return SplitAndExtendInReg(128);
33143 // On pre-AVX512 targets, split into 256-bit nodes of
33144 // ISD::*_EXTEND_VECTOR_INREG.
33145 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
33146 return SplitAndExtendInReg(256);
33148 return SDValue();
33149 }
33151 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
33152 TargetLowering::DAGCombinerInfo &DCI,
33153 const X86Subtarget &Subtarget) {
33154 SDValue N0 = N->getOperand(0);
33155 EVT VT = N->getValueType(0);
33156 EVT InVT = N0.getValueType();
33157 SDLoc DL(N);
33159 if (SDValue DivRem8 = getDivRem8(N, DAG))
33160 return DivRem8;
33162 if (!DCI.isBeforeLegalizeOps()) {
33163 if (InVT == MVT::i1) {
33164 SDValue Zero = DAG.getConstant(0, DL, VT);
33165 SDValue AllOnes =
33166 DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
33167 return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
33168 }
33169 return SDValue();
33170 }
33172 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
33173 return V;
33175 if (Subtarget.hasAVX() && VT.is256BitVector())
33176 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
33177 return R;
33179 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
33180 return NewAdd;
33182 return SDValue();
33183 }
33185 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
33186 const X86Subtarget &Subtarget) {
33187 SDLoc dl(N);
33188 EVT VT = N->getValueType(0);
33190 // Let legalize expand this if it isn't a legal type yet.
33191 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33192 return SDValue();
33194 EVT ScalarVT = VT.getScalarType();
33195 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
33196 return SDValue();
33198 SDValue A = N->getOperand(0);
33199 SDValue B = N->getOperand(1);
33200 SDValue C = N->getOperand(2);
33202 auto invertIfNegative = [](SDValue &V) {
33203 if (SDValue NegVal = isFNEG(V.getNode())) {
33204 V = NegVal;
33205 return true;
33206 }
33207 return false;
33208 };
33210 // Do not convert the passthru input of scalar intrinsics.
33211 // FIXME: We could allow negations of the lower element only.
33212 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
33213 bool NegB = invertIfNegative(B);
33214 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
33216 // Negative multiplication when NegA xor NegB
33217 bool NegMul = (NegA != NegB);
33219 unsigned NewOpcode;
33220 if (!NegMul)
33221 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
33222 else
33223 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
33226 if (N->getOpcode() == X86ISD::FMADD_RND) {
33227 switch (NewOpcode) {
33228 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
33229 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
33230 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
33231 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
33232 }
33233 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
33234 switch (NewOpcode) {
33235 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
33236 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
33237 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
33238 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
33239 }
33240 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
33241 switch (NewOpcode) {
33242 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
33243 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
33244 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
33245 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
33246 }
33247 } else {
33248 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
33249 "Unexpected opcode!");
33250 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
33251 }
33253 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
33254 }
33256 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
33257 TargetLowering::DAGCombinerInfo &DCI,
33258 const X86Subtarget &Subtarget) {
33259 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
33260 // (and (i32 x86isd::setcc_carry), 1)
33261 // This eliminates the zext. This transformation is necessary because
33262 // ISD::SETCC is always legalized to i8.
33263 SDLoc dl(N);
33264 SDValue N0 = N->getOperand(0);
33265 EVT VT = N->getValueType(0);
33267 if (N0.getOpcode() == ISD::AND &&
33268 N0.hasOneUse() &&
33269 N0.getOperand(0).hasOneUse()) {
33270 SDValue N00 = N0.getOperand(0);
33271 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33272 if (!isOneConstant(N0.getOperand(1)))
33273 return SDValue();
33274 return DAG.getNode(ISD::AND, dl, VT,
33275 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
33276 N00.getOperand(0), N00.getOperand(1)),
33277 DAG.getConstant(1, dl, VT));
33278 }
33279 }
33281 if (N0.getOpcode() == ISD::TRUNCATE &&
33282 N0.hasOneUse() &&
33283 N0.getOperand(0).hasOneUse()) {
33284 SDValue N00 = N0.getOperand(0);
33285 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
33286 return DAG.getNode(ISD::AND, dl, VT,
33287 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
33288 N00.getOperand(0), N00.getOperand(1)),
33289 DAG.getConstant(1, dl, VT));
33290 }
33291 }
33293 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
33294 return V;
33296 if (VT.is256BitVector())
33297 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
33298 return R;
33300 if (SDValue DivRem8 = getDivRem8(N, DAG))
33301 return DivRem8;
33303 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
33304 return NewAdd;
33306 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
33307 return R;
33309 return SDValue();
33310 }
33312 /// Optimize x == -y --> x+y == 0
33313 /// x != -y --> x+y != 0
33314 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
33315 const X86Subtarget &Subtarget) {
33316 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
33317 SDValue LHS = N->getOperand(0);
33318 SDValue RHS = N->getOperand(1);
33319 EVT VT = N->getValueType(0);
33320 SDLoc DL(N);
33322 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
33323 if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
33324 SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
33325 LHS.getOperand(1));
33326 return DAG.getSetCC(DL, N->getValueType(0), addV,
33327 DAG.getConstant(0, DL, addV.getValueType()), CC);
33328 }
33329 if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
33330 if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
33331 SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
33332 RHS.getOperand(1));
33333 return DAG.getSetCC(DL, N->getValueType(0), addV,
33334 DAG.getConstant(0, DL, addV.getValueType()), CC);
33335 }
33337 if (VT.getScalarType() == MVT::i1 &&
33338 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
33339 bool IsSEXT0 =
33340 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
33341 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
33342 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
33344 if (!IsSEXT0 || !IsVZero1) {
33345 // Swap the operands and update the condition code.
33346 std::swap(LHS, RHS);
33347 CC = ISD::getSetCCSwappedOperands(CC);
33349 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
33350 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
33351 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
33352 }
33354 if (IsSEXT0 && IsVZero1) {
33355 assert(VT == LHS.getOperand(0).getValueType() &&
33356 "Unexpected operand type");
33357 if (CC == ISD::SETGT)
33358 return DAG.getConstant(0, DL, VT);
33359 if (CC == ISD::SETLE)
33360 return DAG.getConstant(1, DL, VT);
33361 if (CC == ISD::SETEQ || CC == ISD::SETGE)
33362 return DAG.getNOT(DL, LHS.getOperand(0), VT);
33364 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
33365 "Unexpected condition code!");
33366 return LHS.getOperand(0);
33367 }
33368 }
33370 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
33371 // to avoid scalarization via legalization because v4i32 is not a legal type.
33372 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
33373 LHS.getValueType() == MVT::v4f32)
33374 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
33376 return SDValue();
33377 }
33379 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
33381 // Gather and Scatter instructions use k-registers for masks. The type of
33382 // the masks is v*i1. So the mask will be truncated anyway.
33383 // The SIGN_EXTEND_INREG may be dropped.
33384 SDValue Mask = N->getOperand(2);
33385 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
33386 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
33387 NewOps[2] = Mask.getOperand(0);
33388 DAG.UpdateNodeOperands(N, NewOps);
33389 }
33390 return SDValue();
33391 }
33393 // Helper function of performSETCCCombine. It is to materialize "setb reg"
33394 // as "sbb reg,reg", since it can be extended without zext and produces
33395 // an all-ones bit which is more useful than 0/1 in some cases.
33396 static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
33397 SelectionDAG &DAG, MVT VT) {
33398 if (VT == MVT::i8)
33399 return DAG.getNode(ISD::AND, DL, VT,
33400 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33401 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33402 EFLAGS),
33403 DAG.getConstant(1, DL, VT));
33404 assert (VT == MVT::i1 && "Unexpected type for SETCC node");
33405 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
33406 DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
33407 DAG.getConstant(X86::COND_B, DL, MVT::i8),
33408 EFLAGS));
33409 }
33411 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
33412 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
33413 TargetLowering::DAGCombinerInfo &DCI,
33414 const X86Subtarget &Subtarget) {
33415 SDLoc DL(N);
33416 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
33417 SDValue EFLAGS = N->getOperand(1);
33419 if (CC == X86::COND_A) {
33420 // Try to convert COND_A into COND_B in an attempt to facilitate
33421 // materializing "setb reg".
33423 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
33424 // cannot take an immediate as its first operand.
33426 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
33427 EFLAGS.getValueType().isInteger() &&
33428 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
33429 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
33430 EFLAGS.getNode()->getVTList(),
33431 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
33432 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
33433 return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
33434 }
33435 }
33437 // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
33438 // a zext and produces an all-ones bit which is more useful than 0/1 in some
33439 // cases.
33440 if (CC == X86::COND_B)
33441 return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
33443 // Try to simplify the EFLAGS and condition code operands.
33444 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
33445 return getSETCC(CC, Flags, DL, DAG);
33447 return SDValue();
33448 }
33450 /// Optimize branch condition evaluation.
33451 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
33452 TargetLowering::DAGCombinerInfo &DCI,
33453 const X86Subtarget &Subtarget) {
33454 SDLoc DL(N);
33455 SDValue EFLAGS = N->getOperand(3);
33456 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
33458 // Try to simplify the EFLAGS and condition code operands.
33459 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
33460 // RAUW them under us.
33461 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
33462 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
33463 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
33464 N->getOperand(1), Cond, Flags);
33465 }
33467 return SDValue();
33468 }
33470 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
33471 SelectionDAG &DAG) {
33472 // Take advantage of vector comparisons producing 0 or -1 in each lane to
33473 // optimize away operation when it's from a constant.
33475 // The general transformation is:
33476 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
33477 // AND(VECTOR_CMP(x,y), constant2)
33478 // constant2 = UNARYOP(constant)
33480 // Early exit if this isn't a vector operation, the operand of the
33481 // unary operation isn't a bitwise AND, or if the sizes of the operations
33482 // aren't the same.
33483 EVT VT = N->getValueType(0);
33484 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
33485 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
33486 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
33487 return SDValue();
33489 // Now check that the other operand of the AND is a constant. We could
33490 // make the transformation for non-constant splats as well, but it's unclear
33491 // that would be a benefit as it would not eliminate any operations, just
33492 // perform one more step in scalar code before moving to the vector unit.
33493 if (BuildVectorSDNode *BV =
33494 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
33495 // Bail out if the vector isn't a constant.
33496 if (!BV->isConstant())
33499 // Everything checks out. Build up the new and improved node.
33500 SDLoc DL(N);
33501 EVT IntVT = BV->getValueType(0);
33502 // Create a new constant of the appropriate type for the transformed
33503 // DAG.
33504 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
33505 // The AND node needs bitcasts to/from an integer vector type around it.
33506 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
33507 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
33508 N->getOperand(0)->getOperand(0), MaskConst);
33509 SDValue Res = DAG.getBitcast(VT, NewAnd);
33510 return Res;
33511 }
33513 return SDValue();
33514 }
33516 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
33517 const X86Subtarget &Subtarget) {
33518 SDValue Op0 = N->getOperand(0);
33519 EVT VT = N->getValueType(0);
33520 EVT InVT = Op0.getValueType();
33521 EVT InSVT = InVT.getScalarType();
33522 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33524 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
33525 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
33526 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
33527 SDLoc dl(N);
33528 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33529 InVT.getVectorNumElements());
33530 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
33532 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
33533 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
33535 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33536 }
33538 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
33539 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
33540 // the optimization here.
33541 if (DAG.SignBitIsZero(Op0))
33542 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
33544 return SDValue();
33545 }
33547 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
33548 const X86Subtarget &Subtarget) {
33549 // First try to optimize away the conversion entirely when it's
33550 // conditionally from a constant. Vectors only.
33551 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
33552 return Res;
33554 // Now move on to more general possibilities.
33555 SDValue Op0 = N->getOperand(0);
33556 EVT VT = N->getValueType(0);
33557 EVT InVT = Op0.getValueType();
33558 EVT InSVT = InVT.getScalarType();
33560 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
33561 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
33562 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
33563 if (InVT.isVector() &&
33564 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
33565 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
33566 SDLoc dl(N);
33567 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
33568 InVT.getVectorNumElements());
33569 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
33570 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
33571 }
33573 // Without AVX512DQ we only support i64 to float scalar conversion. For both
33574 // vectors and scalars, see if we know that the upper bits are all the sign
33575 // bit, in which case we can truncate the input to i32 and convert from that.
33576 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
33577 unsigned BitWidth = InVT.getScalarSizeInBits();
33578 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
33579 if (NumSignBits >= (BitWidth - 31)) {
33580 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
33581 if (InVT.isVector())
33582 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
33583 InVT.getVectorNumElements());
33584 SDLoc dl(N);
33585 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
33586 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
33587 }
33588 }
33590 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
33591 // a 32-bit target where SSE doesn't support i64->FP operations.
33592 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
33593 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
33594 EVT LdVT = Ld->getValueType(0);
33596 // This transformation is not supported if the result type is f16 or f128.
33597 if (VT == MVT::f16 || VT == MVT::f128)
33598 return SDValue();
33600 if (!Ld->isVolatile() && !VT.isVector() &&
33601 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
33602 !Subtarget.is64Bit() && LdVT == MVT::i64) {
33603 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
33604 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
33605 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
33606 return FILDChain;
33607 }
33608 }
33610 return SDValue();
33611 }
33612 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
33613 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
33614 X86TargetLowering::DAGCombinerInfo &DCI) {
33615 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
33616 // the result is either zero or one (depending on the input carry bit).
33617 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
33618 if (X86::isZeroNode(N->getOperand(0)) &&
33619 X86::isZeroNode(N->getOperand(1)) &&
33620 // We don't have a good way to replace an EFLAGS use, so only do this when
33621 // it's completely dead.
33622 SDValue(N, 1).use_empty()) {
33623 SDLoc DL(N);
33624 EVT VT = N->getValueType(0);
33625 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
33626 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
33627 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
33628 DAG.getConstant(X86::COND_B, DL,
33629 MVT::i8),
33630 N->getOperand(2)),
33631 DAG.getConstant(1, DL, VT));
33632 return DCI.CombineTo(N, Res1, CarryOut);
33633 }
33635 return SDValue();
33636 }
33638 /// fold (add Y, (sete X, 0)) -> adc 0, Y
33639 /// (add Y, (setne X, 0)) -> sbb -1, Y
33640 /// (sub (sete X, 0), Y) -> sbb 0, Y
33641 /// (sub (setne X, 0), Y) -> adc -1, Y
33642 static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
33644 SDLoc DL(N);
33645 // Look through ZExts.
33646 SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
33647 if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
33648 return SDValue();
33650 SDValue SetCC = Ext.getOperand(0);
33651 if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
33652 return SDValue();
33654 X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
33655 if (CC != X86::COND_E && CC != X86::COND_NE)
33656 return SDValue();
33658 SDValue Cmp = SetCC.getOperand(1);
33659 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
33660 !X86::isZeroNode(Cmp.getOperand(1)) ||
33661 !Cmp.getOperand(0).getValueType().isInteger())
33662 return SDValue();
33664 SDValue CmpOp0 = Cmp.getOperand(0);
33665 SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
33666 DAG.getConstant(1, DL, CmpOp0.getValueType()));
33668 SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
33669 if (CC == X86::COND_NE)
33670 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
33671 DL, OtherVal.getValueType(), OtherVal,
33672 DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
33673 NewCmp);
33674 return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
33675 DL, OtherVal.getValueType(), OtherVal,
33676 DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
33677 }
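// Try to replace a vector-reduction add whose operand is a (vselect (zext
// absolute-difference)) pattern with PSADBW plus a narrower add into the
// reduction vector (the Phi operand).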
33679 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
33680 const X86Subtarget &Subtarget) {
33681 SDLoc DL(N);
33682 EVT VT = N->getValueType(0);
33683 SDValue Op0 = N->getOperand(0);
33684 SDValue Op1 = N->getOperand(1);
33686 // TODO: There's nothing special about i32, any integer type above i16 should
33687 // work just as well.
33688 if (!VT.isVector() || !VT.isSimple() ||
33689 !(VT.getVectorElementType() == MVT::i32))
33690 return SDValue();
33692 unsigned RegSize = 128;
33693 if (Subtarget.hasBWI())
33694 RegSize = 512;
33695 else if (Subtarget.hasAVX2())
33696 RegSize = 256;
33698 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
33699 // TODO: We should be able to handle larger vectors by splitting them before
33700 // feeding them into several SADs, and then reducing over those.
33701 if (VT.getSizeInBits() / 4 > RegSize)
33702 return SDValue();
33704 // We know N is a reduction add, which means one of its operands is a phi.
33705 // To match SAD, we need the other operand to be a vector select.
33706 SDValue SelectOp, Phi;
33707 if (Op0.getOpcode() == ISD::VSELECT) {
33708 SelectOp = Op0;
33709 Phi = Op1;
33710 } else if (Op1.getOpcode() == ISD::VSELECT) {
33711 SelectOp = Op1;
33712 Phi = Op0;
33713 } else
33714 return SDValue();
33716 // Check whether we have an abs-diff pattern feeding into the select.
33717 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
33718 return SDValue();
33720 // SAD pattern detected. Now build a SAD instruction and an addition for
33721 // reduction. Note that the number of elements of the result of SAD is less
33722 // than the number of elements of its input. Therefore, we could only update
33723 // part of elements in the reduction vector.
33724 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
33726 // The output of PSADBW is a vector of i64.
33727 // We need to turn the vector of i64 into a vector of i32.
33728 // If the reduction vector is at least as wide as the psadbw result, just
33729 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
33730 // anyway.
33731 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
33732 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
33733 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
33734 else
33735 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
33737 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
33738 // Update part of elements of the reduction vector. This is done by first
33739 // extracting a sub-vector from it, updating this sub-vector, and inserting
33740 // it back.
33741 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
33742 DAG.getIntPtrConstant(0, DL));
33743 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
33744 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
33745 DAG.getIntPtrConstant(0, DL));
33746 }
33747 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
33748 }
33750 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
33751 const X86Subtarget &Subtarget) {
33752 const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
33753 if (Flags->hasVectorReduction()) {
33754 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
33755 return Sad;
33756 }
33757 EVT VT = N->getValueType(0);
33758 SDValue Op0 = N->getOperand(0);
33759 SDValue Op1 = N->getOperand(1);
33761 // Try to synthesize horizontal adds from adds of shuffles.
33762 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33763 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33764 isHorizontalBinOp(Op0, Op1, true))
33765 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
33767 return OptimizeConditionalInDecrement(N, DAG);
33770 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
33771 const X86Subtarget &Subtarget) {
33772 SDValue Op0 = N->getOperand(0);
33773 SDValue Op1 = N->getOperand(1);
33775 // X86 can't encode an immediate LHS of a sub. See if we can push the
33776 // negation into a preceding instruction.
33777 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
33778 // If the RHS of the sub is a XOR with one use and a constant, invert the
33779 // immediate. Then add one to the LHS of the sub so we can turn
33780 // X-Y -> X+~Y+1, saving one register.
33781 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
33782 isa<ConstantSDNode>(Op1.getOperand(1))) {
33783 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
33784 EVT VT = Op0.getValueType();
33785 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
33786 Op1.getOperand(0),
33787 DAG.getConstant(~XorC, SDLoc(Op1), VT));
33788 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
33789 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
33790 }
33791 }
33793 // Try to synthesize horizontal subs from subs of shuffles.
33794 EVT VT = N->getValueType(0);
33795 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
33796 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
33797 isHorizontalBinOp(Op0, Op1, false))
33798 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
33800 return OptimizeConditionalInDecrement(N, DAG);
33803 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
33804 TargetLowering::DAGCombinerInfo &DCI,
33805 const X86Subtarget &Subtarget) {
33806 SDLoc DL(N);
33807 unsigned Opcode = N->getOpcode();
33808 MVT VT = N->getSimpleValueType(0);
33809 MVT SVT = VT.getVectorElementType();
33810 SDValue Op = N->getOperand(0);
33811 MVT OpVT = Op.getSimpleValueType();
33812 MVT OpEltVT = OpVT.getVectorElementType();
33813 unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
33815 // Perform any constant folding.
33816 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
33817 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
33818 unsigned NumDstElts = VT.getVectorNumElements();
33819 SmallBitVector Undefs(NumDstElts, false);
33820 SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
33821 for (unsigned i = 0; i != NumDstElts; ++i) {
33822 SDValue OpElt = Op.getOperand(i);
33823 if (OpElt.getOpcode() == ISD::UNDEF) {
33824 Undefs[i] = true;
33825 continue;
33826 }
33827 APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
33828 Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
33829 : Cst.sextOrTrunc(SVT.getSizeInBits());
33831 return getConstVector(Vals, Undefs, VT, DAG, DL);
33834 // (vzext (bitcast (vzext x))) -> (vzext x)
33835 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
33836 SDValue V = peekThroughBitcasts(Op);
33837 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
33838 MVT InnerVT = V.getSimpleValueType();
33839 MVT InnerEltVT = InnerVT.getVectorElementType();
33841 // If the element sizes match exactly, we can just do one larger vzext. This
33842 // is always an exact type match as vzext operates on integer types.
33843 if (OpEltVT == InnerEltVT) {
33844 assert(OpVT == InnerVT && "Types must match for vzext!");
33845 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
33848 // The only other way we can combine them is if only a single element of the
33849 // inner vzext is used in the input to the outer vzext.
33850 if (InnerEltVT.getSizeInBits() < InputBits)
33851 return SDValue();
33853 // In this case, the inner vzext is completely dead because we're going to
33854 // only look at bits inside of the low element. Just do the outer vzext on
33855 // a bitcast of the input to the inner.
33856 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
33859 // Check if we can bypass extracting and re-inserting an element of an input
33860 // vector. Essentially:
33861 // (bitcast (scalar_to_vector (extract_vector_elt x))) -> (bitcast x)
33862 // TODO: Add X86ISD::VSEXT support
33863 if (Opcode == X86ISD::VZEXT &&
33864 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
33865 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
33866 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
33867 SDValue ExtractedV = V.getOperand(0);
33868 SDValue OrigV = ExtractedV.getOperand(0);
33869 if (isNullConstant(ExtractedV.getOperand(1))) {
33870 MVT OrigVT = OrigV.getSimpleValueType();
33871 // Extract a subvector if necessary...
33872 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
33873 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
33874 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
33875 OrigVT.getVectorNumElements() / Ratio);
33876 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
33877 DAG.getIntPtrConstant(0, DL));
33879 Op = DAG.getBitcast(OpVT, OrigV);
33880 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
33887 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
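/// e.g. a locked decrement of a memory operand such as "lock subl $1, (%rdi)"
/// becomes the equivalent "lock addl $-1, (%rdi)".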
33888 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
33889 const X86Subtarget &Subtarget) {
33890 SDValue Chain = N->getOperand(0);
33891 SDValue LHS = N->getOperand(1);
33892 SDValue RHS = N->getOperand(2);
33893 MVT VT = RHS.getSimpleValueType();
33894 SDLoc DL(N);
33896 auto *C = dyn_cast<ConstantSDNode>(RHS);
33897 if (!C || C->getZExtValue() != 1)
33898 return SDValue();
33900 RHS = DAG.getConstant(-1, DL, VT);
33901 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
33902 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
33903 DAG.getVTList(MVT::i32, MVT::Other),
33904 {Chain, LHS, RHS}, VT, MMO);
33907 // TEST (AND a, b), (AND a, b) -> TEST a, b
33908 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
33909 SDValue Op0 = N->getOperand(0);
33910 SDValue Op1 = N->getOperand(1);
33912 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
33913 return SDValue();
33915 EVT VT = N->getValueType(0);
33916 SDLoc DL(N);
33918 return DAG.getNode(X86ISD::TESTM, DL, VT,
33919 Op0->getOperand(0), Op0->getOperand(1));
33922 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
33923 const X86Subtarget &Subtarget) {
33924 MVT VT = N->getSimpleValueType(0);
33925 SDLoc DL(N);
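// Comparing a vector against itself: pcmpeq is trivially all-ones and pcmpgt
// (signed greater-than) is trivially all-zeros.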
33927 if (N->getOperand(0) == N->getOperand(1)) {
33928 if (N->getOpcode() == X86ISD::PCMPEQ)
33929 return getOnesVector(VT, Subtarget, DAG, DL);
33930 if (N->getOpcode() == X86ISD::PCMPGT)
33931 return getZeroVector(VT, Subtarget, DAG, DL);
33938 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
33939 DAGCombinerInfo &DCI) const {
33940 SelectionDAG &DAG = DCI.DAG;
33941 switch (N->getOpcode()) {
33943 case ISD::EXTRACT_VECTOR_ELT:
33944 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
33947 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
33948 case ISD::BITCAST: return combineBitcast(N, DAG, Subtarget);
33949 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
33950 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
33951 case ISD::SUB: return combineSub(N, DAG, Subtarget);
33952 case X86ISD::ADC: return combineADC(N, DAG, DCI);
33953 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
33956 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
33957 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
33958 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
33959 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
33960 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
33961 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
33962 case ISD::STORE: return combineStore(N, DAG, Subtarget);
33963 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
33964 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
33965 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
33967 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
33968 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
33969 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
33970 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
33971 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
33973 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
33975 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
33977 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
33978 case X86ISD::BT: return combineBT(N, DAG, DCI);
33979 case ISD::ANY_EXTEND:
33980 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
33981 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
33982 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
33983 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
33984 case X86ISD::SETCC: return combineX86SetCC(N, DAG, DCI, Subtarget);
33985 case X86ISD::BRCOND: return combineBrCond(N, DAG, DCI, Subtarget);
33986 case X86ISD::VSHLI:
33987 case X86ISD::VSRLI: return combineVectorShift(N, DAG, DCI, Subtarget);
33988 case X86ISD::VSEXT:
33989 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
33990 case X86ISD::SHUFP: // Handle all target specific shuffles
33991 case X86ISD::INSERTPS:
33992 case X86ISD::PALIGNR:
33993 case X86ISD::VSHLDQ:
33994 case X86ISD::VSRLDQ:
33995 case X86ISD::BLENDI:
33996 case X86ISD::UNPCKH:
33997 case X86ISD::UNPCKL:
33998 case X86ISD::MOVHLPS:
33999 case X86ISD::MOVLHPS:
34000 case X86ISD::PSHUFB:
34001 case X86ISD::PSHUFD:
34002 case X86ISD::PSHUFHW:
34003 case X86ISD::PSHUFLW:
34004 case X86ISD::MOVSHDUP:
34005 case X86ISD::MOVSLDUP:
34006 case X86ISD::MOVDDUP:
34007 case X86ISD::MOVSS:
34008 case X86ISD::MOVSD:
34009 case X86ISD::VPPERM:
34010 case X86ISD::VPERMI:
34011 case X86ISD::VPERMV:
34012 case X86ISD::VPERMV3:
34013 case X86ISD::VPERMIV3:
34014 case X86ISD::VPERMIL2:
34015 case X86ISD::VPERMILPI:
34016 case X86ISD::VPERMILPV:
34017 case X86ISD::VPERM2X128:
34018 case X86ISD::VZEXT_MOVL:
34019 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
34020 case X86ISD::FMADD:
34021 case X86ISD::FMADD_RND:
34022 case X86ISD::FMADDS1_RND:
34023 case X86ISD::FMADDS3_RND:
34024 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
34026 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
34027 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
34028 case X86ISD::TESTM: return combineTestM(N, DAG);
34029 case X86ISD::PCMPEQ:
34030 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
34036 /// Return true if the target has native support for the specified value type
34037 /// and it is 'desirable' to use the type for the given node type. E.g., on x86,
34038 /// i16 is legal but undesirable, since i16 instruction encodings are longer and
34039 /// some i16 instructions are slow.
34040 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
34041 if (!isTypeLegal(VT))
34043 if (VT != MVT::i16)
34050 case ISD::SIGN_EXTEND:
34051 case ISD::ZERO_EXTEND:
34052 case ISD::ANY_EXTEND:
34065 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
34066 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
34067 /// we don't adjust the stack we clobber the first frame index.
34068 /// See X86InstrInfo::copyPhysReg.
34069 bool X86TargetLowering::hasCopyImplyingStackAdjustment(
34070 MachineFunction *MF) const {
34071 const MachineRegisterInfo &MRI = MF->getRegInfo();
34073 return any_of(MRI.reg_instructions(X86::EFLAGS),
34074 [](const MachineInstr &RI) { return RI.isCopy(); });
34077 /// This method queries the target whether it is beneficial for the DAG combiner to
34078 /// promote the specified node. If true, it should return the desired promotion
34079 /// type by reference.
34080 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
34081 EVT VT = Op.getValueType();
34082 if (VT != MVT::i16)
34085 bool Promote = false;
34086 bool Commute = false;
34087 switch (Op.getOpcode()) {
34089 case ISD::SIGN_EXTEND:
34090 case ISD::ZERO_EXTEND:
34091 case ISD::ANY_EXTEND:
34096 SDValue N0 = Op.getOperand(0);
34097 // Look out for (store (shl (load), x)).
34098 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
34111 SDValue N0 = Op.getOperand(0);
34112 SDValue N1 = Op.getOperand(1);
34113 if (!Commute && MayFoldLoad(N1))
34115 // Avoid disabling potential load folding opportunities.
34116 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
34118 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
34128 //===----------------------------------------------------------------------===//
34129 // X86 Inline Assembly Support
34130 //===----------------------------------------------------------------------===//
34132 // Helper to match a string against a sequence of whitespace-separated pieces.
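// e.g. matchAsm("bswap $0", {"bswap", "$0"}) returns true.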
34133 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
34134 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
34136 for (StringRef Piece : Pieces) {
34137 if (!S.startswith(Piece)) // Check if the piece matches.
34138 return false;
34140 S = S.substr(Piece.size());
34141 StringRef::size_type Pos = S.find_first_not_of(" \t");
34142 if (Pos == 0) // We matched a prefix.
34143 return false;
34145 S = S.substr(Pos);
34146 }
34148 return true;
34149 }
34151 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
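// AsmPieces is the comma-split, sorted clobber list from the constraint string;
// a rotate-based bswap sequence is expected to clobber ~{cc}, ~{flags} and
// ~{fpsr} (plus ~{dirflag} in the four-entry form).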
34153 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
34154 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
34155 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
34156 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
34158 if (AsmPieces.size() == 3)
34159 return true;
34160 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
34161 return true;
34167 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
34168 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
34170 const std::string &AsmStr = IA->getAsmString();
34172 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
34173 if (!Ty || Ty->getBitWidth() % 16 != 0)
34176 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
34177 SmallVector<StringRef, 4> AsmPieces;
34178 SplitString(AsmStr, AsmPieces, ";\n");
34180 switch (AsmPieces.size()) {
34181 default: return false;
34183 // FIXME: this should verify that we are targeting a 486 or better. If not,
34184 // we will turn this bswap into something that will be lowered to logical
34185 // ops instead of emitting the bswap asm. For now, we don't support 486 or
34186 // lower so don't worry about this.
34188 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
34189 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
34190 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
34191 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
34192 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
34193 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
34194 // No need to check constraints, nothing other than the equivalent of
34195 // "=r,0" would be valid here.
34196 return IntrinsicLowering::LowerToByteSwap(CI);
34199 // rorw $$8, ${0:w} --> llvm.bswap.i16
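// Rotating a 16-bit value by 8 bits (in either direction) swaps its two bytes.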
34200 if (CI->getType()->isIntegerTy(16) &&
34201 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
34202 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
34203 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
34205 StringRef ConstraintsStr = IA->getConstraintString();
34206 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
34207 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
34208 if (clobbersFlagRegisters(AsmPieces))
34209 return IntrinsicLowering::LowerToByteSwap(CI);
34213 if (CI->getType()->isIntegerTy(32) &&
34214 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
34215 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
34216 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
34217 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
34219 StringRef ConstraintsStr = IA->getConstraintString();
34220 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
34221 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
34222 if (clobbersFlagRegisters(AsmPieces))
34223 return IntrinsicLowering::LowerToByteSwap(CI);
34226 if (CI->getType()->isIntegerTy(64)) {
34227 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
34228 if (Constraints.size() >= 2 &&
34229 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
34230 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
34231 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
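// With the "A" constraint the i64 value lives in EDX:EAX, so byte-swapping each
// half and then exchanging the halves is a full 64-bit byte swap.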
34232 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
34233 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
34234 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
34235 return IntrinsicLowering::LowerToByteSwap(CI);
34243 /// Given a constraint letter, return the type of constraint for this target.
34244 X86TargetLowering::ConstraintType
34245 X86TargetLowering::getConstraintType(StringRef Constraint) const {
34246 if (Constraint.size() == 1) {
34247 switch (Constraint[0]) {
34259 return C_RegisterClass;
34260 case 'k': // AVX512 masking registers.
34284 else if (Constraint.size() == 2) {
34285 switch (Constraint[0]) {
34289 switch (Constraint[1]) {
34297 return TargetLowering::getConstraintType(Constraint);
34300 /// Examine constraint type and operand type and determine a weight value.
34301 /// This object must already have been set up with the operand type
34302 /// and the current alternative constraint selected.
34303 TargetLowering::ConstraintWeight
34304 X86TargetLowering::getSingleConstraintMatchWeight(
34305 AsmOperandInfo &info, const char *constraint) const {
34306 ConstraintWeight weight = CW_Invalid;
34307 Value *CallOperandVal = info.CallOperandVal;
34308 // If we don't have a value, we can't do a match,
34309 // but allow it at the lowest weight.
34310 if (!CallOperandVal)
34312 Type *type = CallOperandVal->getType();
34313 // Look at the constraint type.
34314 switch (*constraint) {
34316 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
34327 if (CallOperandVal->getType()->isIntegerTy())
34328 weight = CW_SpecificReg;
34333 if (type->isFloatingPointTy())
34334 weight = CW_SpecificReg;
34337 if (type->isX86_MMXTy() && Subtarget.hasMMX())
34338 weight = CW_SpecificReg;
34341 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
34342 if (constraint[1] == 'k') {
34343 // Support for 'Yk' (similarly to the 'k' variant below).
34344 weight = CW_SpecificReg;
34347 // Else fall through (handle "Y" constraint).
34350 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
34351 weight = CW_Register;
34354 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
34355 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
34356 weight = CW_Register;
34359 // Enable conditional vector operations using %k<#> registers.
34360 weight = CW_SpecificReg;
34363 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
34364 if (C->getZExtValue() <= 31)
34365 weight = CW_Constant;
34369 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34370 if (C->getZExtValue() <= 63)
34371 weight = CW_Constant;
34375 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34376 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
34377 weight = CW_Constant;
34381 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34382 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
34383 weight = CW_Constant;
34387 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34388 if (C->getZExtValue() <= 3)
34389 weight = CW_Constant;
34393 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34394 if (C->getZExtValue() <= 0xff)
34395 weight = CW_Constant;
34400 if (isa<ConstantFP>(CallOperandVal)) {
34401 weight = CW_Constant;
34405 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34406 if ((C->getSExtValue() >= -0x80000000LL) &&
34407 (C->getSExtValue() <= 0x7fffffffLL))
34408 weight = CW_Constant;
34412 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
34413 if (C->getZExtValue() <= 0xffffffff)
34414 weight = CW_Constant;
34421 /// Try to replace an X constraint, which matches anything, with another that
34422 /// has more specific requirements based on the type of the corresponding
34423 /// operand.
34424 const char *X86TargetLowering::
34425 LowerXConstraint(EVT ConstraintVT) const {
34426 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
34427 // 'f' like normal targets.
34428 if (ConstraintVT.isFloatingPoint()) {
34429 if (Subtarget.hasSSE2())
34431 if (Subtarget.hasSSE1())
34435 return TargetLowering::LowerXConstraint(ConstraintVT);
34438 /// Lower the specified operand into the Ops vector.
34439 /// If it is invalid, don't add anything to Ops.
34440 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
34441 std::string &Constraint,
34442 std::vector<SDValue>&Ops,
34443 SelectionDAG &DAG) const {
34444 SDValue Result;
34446 // Only support length 1 constraints for now.
34447 if (Constraint.length() > 1) return;
34449 char ConstraintLetter = Constraint[0];
34450 switch (ConstraintLetter) {
34453 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34454 if (C->getZExtValue() <= 31) {
34455 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34456 Op.getValueType());
34462 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34463 if (C->getZExtValue() <= 63) {
34464 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34465 Op.getValueType());
34471 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34472 if (isInt<8>(C->getSExtValue())) {
34473 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34474 Op.getValueType());
34480 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34481 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
34482 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
34483 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
34484 Op.getValueType());
34490 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34491 if (C->getZExtValue() <= 3) {
34492 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34493 Op.getValueType());
34499 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34500 if (C->getZExtValue() <= 255) {
34501 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34502 Op.getValueType());
34508 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34509 if (C->getZExtValue() <= 127) {
34510 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34511 Op.getValueType());
34517 // 32-bit signed value
34518 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34519 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34520 C->getSExtValue())) {
34521 // Widen to 64 bits here to get it sign extended.
34522 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
34525 // FIXME gcc accepts some relocatable values here too, but only in certain
34526 // memory models; it's complicated.
34531 // 32-bit unsigned value
34532 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
34533 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
34534 C->getZExtValue())) {
34535 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
34536 Op.getValueType());
34540 // FIXME gcc accepts some relocatable values here too, but only in certain
34541 // memory models; it's complicated.
34545 // Literal immediates are always ok.
34546 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
34547 // Widen to 64 bits here to get it sign extended.
34548 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
34552 // In any sort of PIC mode, addresses need to be computed at runtime, either by
34553 // adding in a register or via some sort of table lookup. These can't
34554 // be used as immediates.
34555 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
34558 // If we are in non-pic codegen mode, we allow the address of a global (with
34559 // an optional displacement) to be used with 'i'.
34560 GlobalAddressSDNode *GA = nullptr;
34561 int64_t Offset = 0;
34563 // Match either (GA), (GA+C), (GA+C1+C2), etc.
34565 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
34566 Offset += GA->getOffset();
34568 } else if (Op.getOpcode() == ISD::ADD) {
34569 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34570 Offset += C->getZExtValue();
34571 Op = Op.getOperand(0);
34574 } else if (Op.getOpcode() == ISD::SUB) {
34575 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
34576 Offset += -C->getZExtValue();
34577 Op = Op.getOperand(0);
34582 // Otherwise, this isn't something we can handle, reject it.
34586 const GlobalValue *GV = GA->getGlobal();
34587 // If we require an extra load to get this address, as in PIC mode, we
34588 // can't accept it.
34589 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
34592 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
34593 GA->getValueType(0), Offset);
34598 if (Result.getNode()) {
34599 Ops.push_back(Result);
34602 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
34605 /// Check if \p RC is a general purpose register class.
34606 /// I.e., GR* or one of their variants.
34607 static bool isGRClass(const TargetRegisterClass &RC) {
34608 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
34609 RC.hasSuperClassEq(&X86::GR16RegClass) ||
34610 RC.hasSuperClassEq(&X86::GR32RegClass) ||
34611 RC.hasSuperClassEq(&X86::GR64RegClass) ||
34612 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
34615 /// Check if \p RC is a vector register class.
34616 /// I.e., FR* / VR* or one of their variants.
34617 static bool isFRClass(const TargetRegisterClass &RC) {
34618 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
34619 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
34620 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
34621 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
34622 RC.hasSuperClassEq(&X86::VR512RegClass);
34625 std::pair<unsigned, const TargetRegisterClass *>
34626 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
34627 StringRef Constraint,
34629 // First, see if this is a constraint that directly corresponds to an LLVM
34631 if (Constraint.size() == 1) {
34632 // GCC Constraint Letters
34633 switch (Constraint[0]) {
34635 // TODO: Slight differences here in allocation order and leaving
34636 // RIP in the class. Do they matter any more here than they do
34637 // in the normal allocation?
34639 if (Subtarget.hasAVX512()) {
34640 // Only supported in AVX512 or later.
34641 switch (VT.SimpleTy) {
34644 return std::make_pair(0U, &X86::VK32RegClass);
34646 return std::make_pair(0U, &X86::VK16RegClass);
34648 return std::make_pair(0U, &X86::VK8RegClass);
34650 return std::make_pair(0U, &X86::VK1RegClass);
34652 return std::make_pair(0U, &X86::VK64RegClass);
34656 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
34657 if (Subtarget.is64Bit()) {
34658 if (VT == MVT::i32 || VT == MVT::f32)
34659 return std::make_pair(0U, &X86::GR32RegClass);
34660 if (VT == MVT::i16)
34661 return std::make_pair(0U, &X86::GR16RegClass);
34662 if (VT == MVT::i8 || VT == MVT::i1)
34663 return std::make_pair(0U, &X86::GR8RegClass);
34664 if (VT == MVT::i64 || VT == MVT::f64)
34665 return std::make_pair(0U, &X86::GR64RegClass);
34668 // 32-bit fallthrough
34669 case 'Q': // Q_REGS
34670 if (VT == MVT::i32 || VT == MVT::f32)
34671 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
34672 if (VT == MVT::i16)
34673 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
34674 if (VT == MVT::i8 || VT == MVT::i1)
34675 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
34676 if (VT == MVT::i64)
34677 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
34679 case 'r': // GENERAL_REGS
34680 case 'l': // INDEX_REGS
34681 if (VT == MVT::i8 || VT == MVT::i1)
34682 return std::make_pair(0U, &X86::GR8RegClass);
34683 if (VT == MVT::i16)
34684 return std::make_pair(0U, &X86::GR16RegClass);
34685 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
34686 return std::make_pair(0U, &X86::GR32RegClass);
34687 return std::make_pair(0U, &X86::GR64RegClass);
34688 case 'R': // LEGACY_REGS
34689 if (VT == MVT::i8 || VT == MVT::i1)
34690 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
34691 if (VT == MVT::i16)
34692 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
34693 if (VT == MVT::i32 || !Subtarget.is64Bit())
34694 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
34695 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
34696 case 'f': // FP Stack registers.
34697 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
34698 // value to the correct fpstack register class.
34699 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
34700 return std::make_pair(0U, &X86::RFP32RegClass);
34701 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
34702 return std::make_pair(0U, &X86::RFP64RegClass);
34703 return std::make_pair(0U, &X86::RFP80RegClass);
34704 case 'y': // MMX_REGS if MMX allowed.
34705 if (!Subtarget.hasMMX()) break;
34706 return std::make_pair(0U, &X86::VR64RegClass);
34707 case 'Y': // SSE_REGS if SSE2 allowed
34708 if (!Subtarget.hasSSE2()) break;
34711 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
34712 if (!Subtarget.hasSSE1()) break;
34713 bool VConstraint = (Constraint[0] == 'v');
34715 switch (VT.SimpleTy) {
34717 // Scalar SSE types.
34720 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
34721 return std::make_pair(0U, &X86::FR32XRegClass);
34722 return std::make_pair(0U, &X86::FR32RegClass);
34725 if (VConstraint && Subtarget.hasVLX())
34726 return std::make_pair(0U, &X86::FR64XRegClass);
34727 return std::make_pair(0U, &X86::FR64RegClass);
34728 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34736 if (VConstraint && Subtarget.hasVLX())
34737 return std::make_pair(0U, &X86::VR128XRegClass);
34738 return std::make_pair(0U, &X86::VR128RegClass);
34746 if (VConstraint && Subtarget.hasVLX())
34747 return std::make_pair(0U, &X86::VR256XRegClass);
34748 return std::make_pair(0U, &X86::VR256RegClass);
34753 return std::make_pair(0U, &X86::VR512RegClass);
34757 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
34758 switch (Constraint[1]) {
34762 // This register class doesn't allocate k0 for masked vector operations.
34763 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
34764 switch (VT.SimpleTy) {
34767 return std::make_pair(0U, &X86::VK32WMRegClass);
34769 return std::make_pair(0U, &X86::VK16WMRegClass);
34771 return std::make_pair(0U, &X86::VK8WMRegClass);
34773 return std::make_pair(0U, &X86::VK1WMRegClass);
34775 return std::make_pair(0U, &X86::VK64WMRegClass);
34782 // Use the default implementation in TargetLowering to convert the register
34783 // constraint into a member of a register class.
34784 std::pair<unsigned, const TargetRegisterClass*> Res;
34785 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
34787 // Not found as a standard register?
34789 // Map "st(0)" .. "st(7)" to the corresponding FP stack register.
34790 if (Constraint.size() == 7 && Constraint[0] == '{' &&
34791 tolower(Constraint[1]) == 's' &&
34792 tolower(Constraint[2]) == 't' &&
34793 Constraint[3] == '(' &&
34794 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
34795 Constraint[5] == ')' &&
34796 Constraint[6] == '}') {
34798 Res.first = X86::FP0+Constraint[4]-'0';
34799 Res.second = &X86::RFP80RegClass;
34803 // GCC allows "st(0)" to be called just plain "st".
34804 if (StringRef("{st}").equals_lower(Constraint)) {
34805 Res.first = X86::FP0;
34806 Res.second = &X86::RFP80RegClass;
34811 if (StringRef("{flags}").equals_lower(Constraint)) {
34812 Res.first = X86::EFLAGS;
34813 Res.second = &X86::CCRRegClass;
34817 // 'A' means EAX + EDX.
34818 if (Constraint == "A") {
34819 Res.first = X86::EAX;
34820 Res.second = &X86::GR32_ADRegClass;
34826 // Otherwise, check to see if this is a register class of the wrong value
34827 // type. For example, we want to map "{ax},i32" -> {eax}; we don't want it to
34828 // turn into {ax},{dx}.
34829 // MVT::Other is used to specify clobber names.
34830 if (Res.second->hasType(VT) || VT == MVT::Other)
34831 return Res; // Correct type already, nothing to do.
34833 // Get a matching integer of the correct size, e.g. "ax" with MVT::i32 should
34834 // return "eax". This should even work for things like getting 64-bit integer
34835 // registers when given an f64 type.
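// e.g. the constraint "{ax}" used with an i64 operand is mapped to RAX in GR64.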
34836 const TargetRegisterClass *Class = Res.second;
34837 // The generic code will match the first register class that contains the
34838 // given register. Thus, based on the ordering of the tablegened file,
34839 // the "plain" GR classes might not come first.
34840 // Therefore, use a helper method.
34841 if (isGRClass(*Class)) {
34842 unsigned Size = VT.getSizeInBits();
34843 if (Size == 1) Size = 8;
34844 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
34846 Res.first = DestReg;
34847 Res.second = Size == 8 ? &X86::GR8RegClass
34848 : Size == 16 ? &X86::GR16RegClass
34849 : Size == 32 ? &X86::GR32RegClass
34850 : &X86::GR64RegClass;
34851 assert(Res.second->contains(Res.first) && "Register in register class");
34853 // No register found/type mismatch.
34855 Res.second = nullptr;
34857 } else if (isFRClass(*Class)) {
34858 // Handle references to XMM physical registers that got mapped into the
34859 // wrong class. This can happen with constraints like {xmm0} where the
34860 // target independent register mapper will just pick the first match it can
34861 // find, ignoring the required type.
34863 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
34864 if (VT == MVT::f32 || VT == MVT::i32)
34865 Res.second = &X86::FR32RegClass;
34866 else if (VT == MVT::f64 || VT == MVT::i64)
34867 Res.second = &X86::FR64RegClass;
34868 else if (X86::VR128RegClass.hasType(VT))
34869 Res.second = &X86::VR128RegClass;
34870 else if (X86::VR256RegClass.hasType(VT))
34871 Res.second = &X86::VR256RegClass;
34872 else if (X86::VR512RegClass.hasType(VT))
34873 Res.second = &X86::VR512RegClass;
34875 // Type mismatch and not a clobber: return an error.
34877 Res.second = nullptr;
34884 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
34885 const AddrMode &AM, Type *Ty,
34886 unsigned AS) const {
34887 // Scaling factors are not free at all.
34888 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
34889 // will take 2 allocations in the out-of-order engine instead of 1
34890 // for a plain addressing mode, i.e., inst (reg1).
34892 // vaddps (%rsi,%rdx), %ymm0, %ymm1
34893 // Requires two allocations (one for the load, one for the computation)
34895 // vaddps (%rsi), %ymm0, %ymm1
34896 // Requires just 1 allocation, i.e., freeing allocations for other operations
34897 // and having fewer micro-operations to execute.
34899 // For some X86 architectures, this is even worse because for instance for
34900 // stores, the complex addressing mode forces the instruction to use the
34901 // "load" ports instead of the dedicated "store" port.
34902 // E.g., on Haswell:
34903 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
34904 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
34905 if (isLegalAddressingMode(DL, AM, Ty, AS))
34906 // Scale represents reg2 * scale, thus account for 1
34907 // as soon as we use a second register.
34908 return AM.Scale != 0;
34912 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
34913 // Integer division on x86 is expensive. However, when aggressively optimizing
34914 // for code size, we prefer to use a div instruction, as it is usually smaller
34915 // than the alternative sequence.
34916 // The exception to this is vector division. Since x86 doesn't have vector
34917 // integer division, leaving the division as-is is a loss even in terms of
34918 // size, because it will have to be scalarized, while the alternative code
34919 // sequence can be performed in vector form.
34920 bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
34921 Attribute::MinSize);
34922 return OptSize && !VT.isVector();
34925 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
34926 if (!Subtarget.is64Bit())
34929 // Update IsSplitCSR in X86MachineFunctionInfo.
34930 X86MachineFunctionInfo *AFI =
34931 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
34932 AFI->setIsSplitCSR(true);
34935 void X86TargetLowering::insertCopiesSplitCSR(
34936 MachineBasicBlock *Entry,
34937 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
34938 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
34939 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
34943 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
34944 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
34945 MachineBasicBlock::iterator MBBI = Entry->begin();
34946 for (const MCPhysReg *I = IStart; *I; ++I) {
34947 const TargetRegisterClass *RC = nullptr;
34948 if (X86::GR64RegClass.contains(*I))
34949 RC = &X86::GR64RegClass;
34950 else
34951 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
34953 unsigned NewVR = MRI->createVirtualRegister(RC);
34954 // Create copy from CSR to a virtual register.
34955 // FIXME: this currently does not emit CFI pseudo-instructions, it works
34956 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
34957 // nounwind. If we want to generalize this later, we may need to emit
34958 // CFI pseudo-instructions.
34959 assert(Entry->getParent()->getFunction()->hasFnAttribute(
34960 Attribute::NoUnwind) &&
34961 "Function should be nounwind in insertCopiesSplitCSR!");
34962 Entry->addLiveIn(*I);
34963 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
34964 .addReg(*I);
34966 // Insert the copy-back instructions right before the terminator.
34967 for (auto *Exit : Exits)
34968 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
34969 TII->get(TargetOpcode::COPY), *I)
34970 .addReg(NewVR);
34974 bool X86TargetLowering::supportSwiftError() const {
34975 return Subtarget.is64Bit();