//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
#include "X86IntrinsicsInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"

STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);

static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);

static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);

/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                     const X86Subtarget &STI)
    : TargetLowering(TM), Subtarget(STI) {
  bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
  X86ScalarSSEf64 = Subtarget.hasSSE2();
  X86ScalarSSEf32 = Subtarget.hasSSE1();
  MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());

  // Set up the TargetLowering object.

  // X86 is weird. It always uses i8 for shift amounts and setcc results.
  setBooleanContents(ZeroOrOneBooleanContent);
  // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
  setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);

  // For 64-bit, since we have so many registers, use the ILP scheduler.
  // For 32-bit, use the register pressure specific scheduling.
  // For Atom, always use ILP scheduling.
  if (Subtarget.isAtom())
    setSchedulingPreference(Sched::ILP);
  else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
  // Bypass expensive divides and use cheaper ones.
  if (TM.getOptLevel() >= CodeGenOpt::Default) {
    if (Subtarget.hasSlowDivide32())
      addBypassSlowDiv(32, 8);
    if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
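  // (A bypassed divide is rewritten in CodeGenPrepare into a run-time check:
  // when both operands fit in the narrower type, the cheaper narrow divide,
  // e.g. the 8-bit form of DIV for a 32-bit udiv, is used instead.)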
  if (Subtarget.isTargetKnownWindowsMSVC() ||
      Subtarget.isTargetWindowsItanium()) {
    // Setup Windows compiler runtime calls.
    setLibcallName(RTLIB::SDIV_I64, "_alldiv");
    setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
    setLibcallName(RTLIB::SREM_I64, "_allrem");
    setLibcallName(RTLIB::UREM_I64, "_aullrem");
    setLibcallName(RTLIB::MUL_I64, "_allmul");
    setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }

  if (Subtarget.isTargetDarwin()) {
    // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
    setUseUnderscoreSetJmp(false);
    setUseUnderscoreLongJmp(false);
  } else if (Subtarget.isTargetWindowsGNU()) {
    // MS runtime is weird: it exports _setjmp, but longjmp!
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
  // Set up the register classes.
  addRegisterClass(MVT::i8, &X86::GR8RegClass);
  addRegisterClass(MVT::i16, &X86::GR16RegClass);
  addRegisterClass(MVT::i32, &X86::GR32RegClass);
  if (Subtarget.is64Bit())
    addRegisterClass(MVT::i64, &X86::GR64RegClass);

  for (MVT VT : MVT::integer_valuetypes())
    setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);

  // We don't accept any truncstore of integer registers.
  setTruncStoreAction(MVT::i64, MVT::i32, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i8, Expand);
  setTruncStoreAction(MVT::i32, MVT::i16, Expand);
  setTruncStoreAction(MVT::i32, MVT::i8, Expand);
  setTruncStoreAction(MVT::i16, MVT::i8, Expand);

  setTruncStoreAction(MVT::f64, MVT::f32, Expand);

  // SETOEQ and SETUNE require checking two conditions.
  setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
  setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
  setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
  setOperationAction(ISD::UINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
  setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
      // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
    else
      setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
  } else if (!Subtarget.useSoftFloat()) {
    // We have an algorithm for SSE2->double, and we turn this into a
    // 64-bit FILD followed by conditional FADD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
    // We have an algorithm for SSE2, and we turn this into a 64-bit
    // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
  }
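  // (A sketch of the SSE2 algorithm mentioned above: the unsigned value is
  // split into halves that are combined with exponent-bias magic constants
  // and merged with a subtract, avoiding the x87 unit; see LowerUINT_TO_FP
  // further down in this file.)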
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::SINT_TO_FP, MVT::i1, Promote);
  setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // SSE has no i16 to fp conversion, only i32.
    if (X86ScalarSSEf32) {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
      setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP, MVT::i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::i32, Promote);
  }

  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
  setOperationAction(ISD::FP_TO_SINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);

  if (!Subtarget.useSoftFloat()) {
    // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
    // are Legal, f80 is custom lowered.
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);

    if (X86ScalarSSEf32) {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
      // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
      setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT, MVT::i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::i32, Expand);
    setOperationAction(ISD::FP_TO_SINT, MVT::i64, Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
  setOperationAction(ISD::FP_TO_UINT, MVT::i1, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
  setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);

  if (Subtarget.is64Bit()) {
    if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
      // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote);
      setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
    }
  } else if (!Subtarget.useSoftFloat()) {
    // Since AVX is a superset of SSE3, only check for SSE here.
    if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
      // Expand FP_TO_UINT into a select.
      // FIXME: We would like to use a Custom expander here eventually to do
      // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);

    setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
  }

  // TODO: when we have SSE, these could be more efficient, by using movd/movq.
  if (!X86ScalarSSEf64) {
    setOperationAction(ISD::BITCAST, MVT::f32, Expand);
    setOperationAction(ISD::BITCAST, MVT::i32, Expand);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::BITCAST, MVT::f64, Expand);
      // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST, MVT::i64, Expand);
    }
  } else if (!Subtarget.is64Bit())
    setOperationAction(ISD::BITCAST, MVT::i64, Custom);
  // Scalar integer divide and remainder are lowered to use operations that
  // produce two results, to match the available instructions. This exposes
  // the two-result form to trivial CSE, which is able to combine x/y and x%y
  // into a single instruction.
  //
  // Scalar integer multiply-high is also lowered to use two-result
  // operations, to match the available instructions. However, plain multiply
  // (low) operations are left as Legal, as there are single-result
  // instructions for this in x86. Using the two-result multiply instructions
  // when both high and low results are needed must be arranged by dagcombine.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
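  // (Expanding these forms SDIVREM/UDIVREM nodes, which map directly onto
  // x86's DIV/IDIV: one instruction yields quotient and remainder together,
  // e.g. EAX and EDX for the 32-bit case, so x/y and x%y share one divide.)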
  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
                   MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
  if (Subtarget.is64Bit())
    setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8, Legal);
  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
  setOperationAction(ISD::FP_ROUND_INREG, MVT::f32, Expand);

  setOperationAction(ISD::FREM, MVT::f32, Expand);
  setOperationAction(ISD::FREM, MVT::f64, Expand);
  setOperationAction(ISD::FREM, MVT::f80, Expand);
  setOperationAction(ISD::FLT_ROUNDS_, MVT::i32, Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
  setOperationPromotedToType(ISD::CTTZ, MVT::i8, MVT::i32);
  setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  if (!Subtarget.hasBMI()) {
    setOperationAction(ISD::CTTZ, MVT::i16, Custom);
    setOperationAction(ISD::CTTZ, MVT::i32, Custom);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16, Legal);
    setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32, Legal);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTTZ, MVT::i64, Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
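  // (Rationale: plain BSF leaves its destination undefined for a zero input,
  // so CTTZ needs a custom zero check while CTTZ_ZERO_UNDEF maps straight to
  // BSF; with BMI, TZCNT defines the zero case and CTTZ stays Legal.)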
  if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ, MVT::i8, MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8, MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::i32, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16, Custom);
    setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32, Custom);
    if (Subtarget.is64Bit()) {
      setOperationAction(ISD::CTLZ, MVT::i64, Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }

  // Special handling for half-precision floating point conversions.
  // If we don't have F16C support, then lower half float conversions
  // into library calls.
  if (Subtarget.useSoftFloat() ||
      (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
    setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
  setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
  setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
  setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);

  setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
  setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
  setTruncStoreAction(MVT::f32, MVT::f16, Expand);
  setTruncStoreAction(MVT::f64, MVT::f16, Expand);
  setTruncStoreAction(MVT::f80, MVT::f16, Expand);
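  // (With F16C these conversions select to VCVTPH2PS/VCVTPS2PH; the Expand
  // cases above instead become compiler-rt libcalls, typically
  // __gnu_h2f_ieee/__gnu_f2h_ieee.)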
  if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP, MVT::i8, Promote);
  } else {
    setOperationAction(ISD::CTPOP, MVT::i8, Expand);
    setOperationAction(ISD::CTPOP, MVT::i16, Expand);
    setOperationAction(ISD::CTPOP, MVT::i32, Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP, MVT::i64, Expand);
  }

  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);

  if (!Subtarget.hasMOVBE())
    setOperationAction(ISD::BSWAP, MVT::i16, Expand);
  // These should be promoted to a larger select which is supported.
  setOperationAction(ISD::SELECT, MVT::i1, Promote);
  // X86 wants to expand cmov itself.
  for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
  // NOTE: EH_SJLJ_SETJMP/_LONGJMP are NOT intended to support SjLj exception
  // handling; they are a light-weight setjmp/longjmp replacement used to
  // support continuations, user-level threading, etc. As a result, no other
  // SjLj exception interfaces are implemented, so please don't build your
  // own exception handling on top of them.
  // LLVM/Clang supports zero-cost DWARF exception handling.
  setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
  setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
  setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
  if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
    setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool, VT, Custom);
    setOperationAction(ISD::JumpTable, VT, Custom);
    setOperationAction(ISD::GlobalAddress, VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol, VT, Custom);
    setOperationAction(ISD::BlockAddress, VT, Custom);
  }

  // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }

  if (Subtarget.hasSSE1())
    setOperationAction(ISD::PREFETCH, MVT::Other, Legal);
  setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);

  // Expand certain atomics
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
    setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }

  if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
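  // (The i128 case relies on LOCK CMPXCHG16B, which requires a 16-byte
  // aligned operand and uses the RDX:RAX / RCX:RBX register pairs; without
  // cmpxchg16b, 128-bit atomic operations are legalized to library calls.)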
  // FIXME - use subtarget debug flags
  if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
      !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
      TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }

  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
  setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);

  setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
  setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);

  setOperationAction(ISD::TRAP, MVT::Other, Legal);
  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
  // VASTART needs to be custom lowered to use the VarArgsFrameIndex
  setOperationAction(ISD::VASTART, MVT::Other, Custom);
  setOperationAction(ISD::VAEND, MVT::Other, Expand);
  bool Is64Bit = Subtarget.is64Bit();
  setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
  setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);

  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);

  setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);

  // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
  setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
  setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
  if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
    // f32 and f64 use SSE.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
                                                     : &X86::FR64RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      // Use ANDPD to simulate FABS.
      setOperationAction(ISD::FABS, VT, Custom);

      // Use XORP to simulate FNEG.
      setOperationAction(ISD::FNEG, VT, Custom);

      // Use ANDPD and ORPD to simulate FCOPYSIGN.
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);

      // We don't support sin/cos/fmod
      setOperationAction(ISD::FSIN, VT, Expand);
      setOperationAction(ISD::FCOS, VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }

    // Lower this to MOVMSK plus an AND.
    setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
    setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);

    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
    addLegalFPImmediate(APFloat(+0.0)); // xorpd
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
  } else if (UseX87 && X86ScalarSSEf32) {
    // Use SSE for f32, x87 for f64.
    // Set up the FP register classes.
    addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
                                                     : &X86::FR32RegClass);
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);

    // Use ANDPS to simulate FABS.
    setOperationAction(ISD::FABS, MVT::f32, Custom);

    // Use XORP to simulate FNEG.
    setOperationAction(ISD::FNEG, MVT::f32, Custom);

    setOperationAction(ISD::UNDEF, MVT::f64, Expand);

    // Use ANDPS and ORPS to simulate FCOPYSIGN.
    setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);

    // We don't support sin/cos/fmod
    setOperationAction(ISD::FSIN, MVT::f32, Expand);
    setOperationAction(ISD::FCOS, MVT::f32, Expand);
    setOperationAction(ISD::FSINCOS, MVT::f32, Expand);

    // Special cases we handle for FP constants.
    addLegalFPImmediate(APFloat(+0.0f)); // xorps
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f64, Expand);
      setOperationAction(ISD::FCOS, MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
    // Set up the FP register classes.
    addRegisterClass(MVT::f64, &X86::RFP64RegClass);
    addRegisterClass(MVT::f32, &X86::RFP32RegClass);

    for (auto VT : { MVT::f32, MVT::f64 }) {
      setOperationAction(ISD::UNDEF, VT, Expand);
      setOperationAction(ISD::FCOPYSIGN, VT, Expand);
      if (!TM.Options.UnsafeFPMath) {
        setOperationAction(ISD::FSIN, VT, Expand);
        setOperationAction(ISD::FCOS, VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
    addLegalFPImmediate(APFloat(+0.0)); // FLD0
    addLegalFPImmediate(APFloat(+1.0)); // FLD1
    addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
    addLegalFPImmediate(APFloat(+0.0f)); // FLD0
    addLegalFPImmediate(APFloat(+1.0f)); // FLD1
    addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
  // We don't support FMA.
  setOperationAction(ISD::FMA, MVT::f64, Expand);
  setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
    if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
      addRegisterClass(MVT::f128, &X86::FR128RegClass);
      ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
      setOperationAction(ISD::FABS, MVT::f128, Custom);
      setOperationAction(ISD::FNEG, MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }

    addRegisterClass(MVT::f80, &X86::RFP80RegClass);
    setOperationAction(ISD::UNDEF, MVT::f80, Expand);
    setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }

    if (!TM.Options.UnsafeFPMath) {
      setOperationAction(ISD::FSIN, MVT::f80, Expand);
      setOperationAction(ISD::FCOS, MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }

    setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
    setOperationAction(ISD::FCEIL, MVT::f80, Expand);
    setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
    setOperationAction(ISD::FRINT, MVT::f80, Expand);
    setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
  // Always use a library call for pow.
  setOperationAction(ISD::FPOW, MVT::f32, Expand);
  setOperationAction(ISD::FPOW, MVT::f64, Expand);
  setOperationAction(ISD::FPOW, MVT::f80, Expand);

  setOperationAction(ISD::FLOG, MVT::f80, Expand);
  setOperationAction(ISD::FLOG2, MVT::f80, Expand);
  setOperationAction(ISD::FLOG10, MVT::f80, Expand);
  setOperationAction(ISD::FEXP, MVT::f80, Expand);
  setOperationAction(ISD::FEXP2, MVT::f80, Expand);
  setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
  setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
  // Some FP actions are always expanded for vector types.
  for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
                   MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
    setOperationAction(ISD::FSIN, VT, Expand);
    setOperationAction(ISD::FSINCOS, VT, Expand);
    setOperationAction(ISD::FCOS, VT, Expand);
    setOperationAction(ISD::FREM, VT, Expand);
    setOperationAction(ISD::FCOPYSIGN, VT, Expand);
    setOperationAction(ISD::FPOW, VT, Expand);
    setOperationAction(ISD::FLOG, VT, Expand);
    setOperationAction(ISD::FLOG2, VT, Expand);
    setOperationAction(ISD::FLOG10, VT, Expand);
    setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
  // First set operation action for all vector types to either promote
  // (for widening) or expand (for scalarization). Then we will selectively
  // turn on ones that can be effectively codegen'd.
  for (MVT VT : MVT::vector_valuetypes()) {
    setOperationAction(ISD::SDIV, VT, Expand);
    setOperationAction(ISD::UDIV, VT, Expand);
    setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
    setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::INSERT_SUBVECTOR, VT, Expand);
    setOperationAction(ISD::FMA, VT, Expand);
    setOperationAction(ISD::FFLOOR, VT, Expand);
    setOperationAction(ISD::FCEIL, VT, Expand);
    setOperationAction(ISD::FTRUNC, VT, Expand);
    setOperationAction(ISD::FRINT, VT, Expand);
    setOperationAction(ISD::FNEARBYINT, VT, Expand);
    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHS, VT, Expand);
    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
    setOperationAction(ISD::MULHU, VT, Expand);
    setOperationAction(ISD::SDIVREM, VT, Expand);
    setOperationAction(ISD::UDIVREM, VT, Expand);
    setOperationAction(ISD::CTPOP, VT, Expand);
    setOperationAction(ISD::CTTZ, VT, Expand);
    setOperationAction(ISD::CTLZ, VT, Expand);
    setOperationAction(ISD::ROTL, VT, Expand);
    setOperationAction(ISD::ROTR, VT, Expand);
    setOperationAction(ISD::BSWAP, VT, Expand);
    setOperationAction(ISD::SETCC, VT, Expand);
    setOperationAction(ISD::FP_TO_UINT, VT, Expand);
    setOperationAction(ISD::FP_TO_SINT, VT, Expand);
    setOperationAction(ISD::UINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SINT_TO_FP, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Expand);
    setOperationAction(ISD::TRUNCATE, VT, Expand);
    setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
    setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
    setOperationAction(ISD::ANY_EXTEND, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
    for (MVT InnerVT : MVT::vector_valuetypes()) {
      setTruncStoreAction(InnerVT, VT, Expand);

      setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
      setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);

      // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
      // types; we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them legal.
      if (VT.getVectorElementType() == MVT::i1)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);

      // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
      // split/scalarized right now.
      if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
  // FIXME: In order to prevent SSE instructions being expanded to MMX ones
  // with -msoft-float, disable use of MMX as well.
  if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
    addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
    addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
    setOperationAction(ISD::FABS, MVT::v4f32, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
    setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
    setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
    addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
    // registers cannot be used even for integer operations.
    addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);
    addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
                                                    : &X86::VR128RegClass);

    setOperationAction(ISD::MUL, MVT::v16i8, Custom);
    setOperationAction(ISD::MUL, MVT::v4i32, Custom);
    setOperationAction(ISD::MUL, MVT::v2i64, Custom);
    setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
    setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
    setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
    setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
    setOperationAction(ISD::MUL, MVT::v8i16, Legal);
    setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
    setOperationAction(ISD::FABS, MVT::v2f64, Custom);
    setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);

    setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v16i8, Legal);

    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // We support custom legalizing of sext and anyext loads for specific
    // memory vector types which we can load as a scalar (or sequence of
    // scalars) and extend in-register to a legal 128-bit vector type. For sext
    // loads these must work with a single scalar load.
    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }

    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }

    // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
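    // (Promotion here is effectively a pair of bitcasts to v2i64 and back:
    // bitwise ops and full-width loads are element-type agnostic, so
    // funnelling all 128-bit integer types through v2i64 reduces the number
    // of isel patterns.)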
    // Custom lower v2i64 and v2f64 selects.
    setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v2i64, Custom);

    setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);

    setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);

    setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);

    // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
    setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);

    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);

    setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
    setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
    setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);

    // In the customized shift lowering, the legal v4i32/v2i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
    setOperationAction(ISD::ABS, MVT::v16i8, Legal);
    setOperationAction(ISD::ABS, MVT::v8i16, Legal);
    setOperationAction(ISD::ABS, MVT::v4i32, Legal);
    setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
    setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
    setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
    for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
      setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
      setOperationAction(ISD::FCEIL, RoundedTy, Legal);
      setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
      setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }

    setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
    setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
    setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
    setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
    setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
    setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
    setOperationAction(ISD::UMIN, MVT::v4i32, Legal);

    // FIXME: Do we need to handle scalar-to-vector here?
    setOperationAction(ISD::MUL, MVT::v4i32, Legal);

    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
    setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);

    // SSE41 brings specific instructions for doing vector sign extend even in
    // cases where we don't have SRA.
    for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }

    for (MVT VT : MVT::integer_vector_valuetypes()) {
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }

    // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
    for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
      setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }

    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::ROTL, VT, Custom);

    // XOP can efficiently perform BITREVERSE with VPPERM.
    for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);

    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
    bool HasInt256 = Subtarget.hasInt256();

    addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                     : &X86::VR256RegClass);
    addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);
    addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
                                                    : &X86::VR256RegClass);

    for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
    // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
    // even though v8i16 is a legal type.
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
    setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);

    setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);

    setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);

    // In the customized shift lowering, the legal v8i32/v4i64 cases
    // in AVX2 will be recognized.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }

    setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f32, Custom);

    for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
      setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
    setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }

    if (Subtarget.hasAnyFMA()) {
      for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
                       MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
      setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }

    setOperationAction(ISD::MUL, MVT::v4i64, Custom);
    setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MUL, MVT::v32i8, Custom);

    setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
    setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);

    setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
    setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i8, Custom);

    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);

      // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
      // when we have a 256bit-wide blend with immediate.
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);

      // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
      for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
        setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
        setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
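    // (These are Legal because AVX provides VMASKMOVPS/VMASKMOVPD and AVX2
    // adds the integer VPMASKMOVD/VPMASKMOVQ forms; byte/word-granular
    // masked moves only arrive with AVX-512BW.)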
    // Extract subvector is special because the value type
    // (result) is 128-bit but the source is 256-bit wide.
    for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
                     MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }

    // Custom lower several nodes for 256-bit types.
    for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                    MVT::v8f32, MVT::v4f64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);

    // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
      setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
  if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
    addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
    addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
    addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
    addRegisterClass(MVT::v8f64, &X86::VR512RegClass);

    addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
    addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
    addRegisterClass(MVT::v16i1, &X86::VK16RegClass);

    for (MVT VT : MVT::fp_vector_valuetypes())
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
      setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }

    for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
                   MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
                   MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }

    for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FNEG, VT, Custom);
      setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }

    setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
    setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
    setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
    setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
    setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
    setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
    setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
      setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
      setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);

      setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
      setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    }

    for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                    MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
      setOperationAction(ISD::MLOAD, VT, Custom);
      setOperationAction(ISD::MSTORE, VT, Custom);
    }

    setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
    if (Subtarget.hasDQI()) {
      for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
        setOperationAction(ISD::SINT_TO_FP, VT, Legal);
        setOperationAction(ISD::UINT_TO_FP, VT, Legal);
        setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
    if (Subtarget.hasVLX()) {
      setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
      setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
      setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
      setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);

      // FIXME: These commands are available on SSE/AVX2; add relevant patterns.
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
    setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);

    for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::FFLOOR, VT, Legal);
      setOperationAction(ISD::FCEIL, VT, Legal);
      setOperationAction(ISD::FTRUNC, VT, Legal);
      setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);

    // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);

    setOperationAction(ISD::MUL, MVT::v8i64, Custom);

    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
    setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
    setOperationAction(ISD::SELECT, MVT::v16f32, Custom);

    setOperationAction(ISD::MUL, MVT::v16i32, Legal);

    // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
    setOperationAction(ISD::ABS, MVT::v4i64, Legal);
    setOperationAction(ISD::ABS, MVT::v2i64, Legal);

    for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::TRUNCATE, VT, Custom);

      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);
    }
1325 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1326 setOperationAction(ISD::SMAX, VT, Legal);
1327 setOperationAction(ISD::UMAX, VT, Legal);
1328 setOperationAction(ISD::SMIN, VT, Legal);
1329 setOperationAction(ISD::UMIN, VT, Legal);
1330 setOperationAction(ISD::ABS, VT, Legal);
1331 setOperationAction(ISD::SRL, VT, Custom);
1332 setOperationAction(ISD::SHL, VT, Custom);
1333 setOperationAction(ISD::SRA, VT, Custom);
1334 setOperationAction(ISD::CTPOP, VT, Custom);
1335 setOperationAction(ISD::CTTZ, VT, Custom);
1338 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1339 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1341 setOperationAction(ISD::ROTL, VT, Custom);
1342 setOperationAction(ISD::ROTR, VT, Custom);
1345 // Need to promote to 64-bit even though we have 32-bit masked instructions
1346 // because the IR optimizers rearrange bitcasts around logic ops leaving
1347 // too many variations to handle if we don't promote them.
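    // For example, (and (v16i32 a), b) is selected as
    //   (v16i32 (bitcast (and (v8i64 (bitcast a)), (v8i64 (bitcast b))))),
    // so only the v8i64 patterns have to match the logic ops.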
    setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
    setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);

    if (Subtarget.hasCDI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
                      MVT::v4i64, MVT::v8i64}) {
        setOperationAction(ISD::CTLZ, VT, Legal);
        setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
      }
    } // Subtarget.hasCDI()

    if (Subtarget.hasDQI()) {
      // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
      setOperationAction(ISD::MUL, MVT::v2i64, Legal);
      setOperationAction(ISD::MUL, MVT::v4i64, Legal);
      setOperationAction(ISD::MUL, MVT::v8i64, Legal);
    }

    if (Subtarget.hasVPOPCNTDQ()) {
      // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
      // version of popcntd/q.
      for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
                      MVT::v4i32, MVT::v2i64})
        setOperationAction(ISD::CTPOP, VT, Legal);
    }

    // Custom lower several nodes.
    for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                     MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
      setOperationAction(ISD::MGATHER, VT, Custom);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }

    // Extract subvector is special because the value type
    // (result) is 256-bit but the source is 512-bit wide.
    // 128-bit was made Custom under AVX1.
    for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
                     MVT::v8f32, MVT::v4f64 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
                     MVT::v16i1, MVT::v32i1, MVT::v64i1 })
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);

    for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
      setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::MGATHER, VT, Legal);
      setOperationAction(ISD::MSCATTER, VT, Custom);
    }
    for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
      setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
    addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
    addRegisterClass(MVT::v64i8, &X86::VR512RegClass);

    addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
    addRegisterClass(MVT::v64i1, &X86::VK64RegClass);

    setOperationAction(ISD::ADD, MVT::v32i1, Custom);
    setOperationAction(ISD::ADD, MVT::v64i1, Custom);
    setOperationAction(ISD::SUB, MVT::v32i1, Custom);
    setOperationAction(ISD::SUB, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i1, Custom);
    setOperationAction(ISD::MUL, MVT::v64i1, Custom);

    setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
    setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
    setOperationAction(ISD::MUL, MVT::v32i16, Legal);
    setOperationAction(ISD::MUL, MVT::v64i8, Custom);
    setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
    setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
    setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
    setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
    setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
    setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
    setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
    setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
    setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
    setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
    setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
    setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);

    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);

    setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
    if (Subtarget.hasVLX()) {
      setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
      setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
    }

    LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
    for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
      setOperationAction(ISD::MLOAD, VT, Action);
      setOperationAction(ISD::MSTORE, VT, Action);
    }

    if (Subtarget.hasCDI()) {
      setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
      setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
    }

    for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::ABS, VT, Legal);
      setOperationAction(ISD::SRL, VT, Custom);
      setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
      setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
      setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);

      setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
      setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
    }

    for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
      setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
      if (Subtarget.hasVLX()) {
        // FIXME: These instructions are available on SSE/AVX2; add the
        // relevant patterns.
        setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
        setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
      }
    }
  }

  if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
    addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
    addRegisterClass(MVT::v2i1, &X86::VK2RegClass);

    for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
      setOperationAction(ISD::ADD, VT, Custom);
      setOperationAction(ISD::SUB, VT, Custom);
      setOperationAction(ISD::MUL, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Expand);

      setOperationAction(ISD::TRUNCATE, VT, Custom);
      setOperationAction(ISD::SETCC, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::SELECT, VT, Custom);
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
    }

    setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
    setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
    setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);

    for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
      setOperationAction(ISD::SMAX, VT, Legal);
      setOperationAction(ISD::UMAX, VT, Legal);
      setOperationAction(ISD::SMIN, VT, Legal);
      setOperationAction(ISD::UMIN, VT, Legal);
    }
  }

  // We want to custom lower some of our intrinsics.
  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
  if (!Subtarget.is64Bit()) {
    setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
    setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
  }

  // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
  // handle type legalization for these operations here.
  //
  // FIXME: We really should do custom legalization for addition and
  // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
  // than generic legalization for 64-bit multiplication-with-overflow, though.
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    // Add/Sub/Mul with overflow operations are custom lowered.
    setOperationAction(ISD::SADDO, VT, Custom);
    setOperationAction(ISD::UADDO, VT, Custom);
    setOperationAction(ISD::SSUBO, VT, Custom);
    setOperationAction(ISD::USUBO, VT, Custom);
    setOperationAction(ISD::SMULO, VT, Custom);
    setOperationAction(ISD::UMULO, VT, Custom);

    // Support carry in as value rather than glue.
    setOperationAction(ISD::ADDCARRY, VT, Custom);
    setOperationAction(ISD::SUBCARRY, VT, Custom);
    setOperationAction(ISD::SETCCCARRY, VT, Custom);
  }

  if (!Subtarget.is64Bit()) {
    // These libcalls are not available in 32-bit.
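    // Clearing the names below forces 128-bit shifts to be expanded inline
    // rather than emitted as calls to nonexistent runtime routines.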
    setLibcallName(RTLIB::SHL_I128, nullptr);
    setLibcallName(RTLIB::SRL_I128, nullptr);
    setLibcallName(RTLIB::SRA_I128, nullptr);
  }

  // Combine sin / cos into one node or libcall if possible.
  if (Subtarget.hasSinCos()) {
    setLibcallName(RTLIB::SINCOS_F32, "sincosf");
    setLibcallName(RTLIB::SINCOS_F64, "sincos");
    if (Subtarget.isTargetDarwin()) {
      // For MacOSX, we don't want the normal expansion of a libcall to sincos.
      // We want to issue a libcall to __sincos_stret to avoid memory traffic.
      setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
      setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
    }
  }

  if (Subtarget.isTargetWin64()) {
    setOperationAction(ISD::SDIV, MVT::i128, Custom);
    setOperationAction(ISD::UDIV, MVT::i128, Custom);
    setOperationAction(ISD::SREM, MVT::i128, Custom);
    setOperationAction(ISD::UREM, MVT::i128, Custom);
    setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
    setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
  }

  // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
  // is. We should promote the value to 64-bits to solve this.
  // This is what the CRT headers do - `fmodf` is an inline header
  // function casting to f64 and calling `fmod`.
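  // i.e. fmodf(x, y) effectively becomes (float)fmod((double)x, (double)y).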
  if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
                              Subtarget.isTargetWindowsItanium()))
    for (ISD::NodeType Op :
         {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
          ISD::FLOG10, ISD::FPOW, ISD::FSIN})
      if (isOperationExpand(Op, MVT::f32))
        setOperationAction(Op, MVT::f32, Promote);

  // We have target-specific dag combine patterns for the following nodes:
  setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
  setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
  setTargetDAGCombine(ISD::BITCAST);
  setTargetDAGCombine(ISD::VSELECT);
  setTargetDAGCombine(ISD::SELECT);
  setTargetDAGCombine(ISD::SHL);
  setTargetDAGCombine(ISD::SRA);
  setTargetDAGCombine(ISD::SRL);
  setTargetDAGCombine(ISD::OR);
  setTargetDAGCombine(ISD::AND);
  setTargetDAGCombine(ISD::ADD);
  setTargetDAGCombine(ISD::FADD);
  setTargetDAGCombine(ISD::FSUB);
  setTargetDAGCombine(ISD::FNEG);
  setTargetDAGCombine(ISD::FMA);
  setTargetDAGCombine(ISD::FMINNUM);
  setTargetDAGCombine(ISD::FMAXNUM);
  setTargetDAGCombine(ISD::SUB);
  setTargetDAGCombine(ISD::LOAD);
  setTargetDAGCombine(ISD::MLOAD);
  setTargetDAGCombine(ISD::STORE);
  setTargetDAGCombine(ISD::MSTORE);
  setTargetDAGCombine(ISD::TRUNCATE);
  setTargetDAGCombine(ISD::ZERO_EXTEND);
  setTargetDAGCombine(ISD::ANY_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND);
  setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
  setTargetDAGCombine(ISD::SINT_TO_FP);
  setTargetDAGCombine(ISD::UINT_TO_FP);
  setTargetDAGCombine(ISD::SETCC);
  setTargetDAGCombine(ISD::MUL);
  setTargetDAGCombine(ISD::XOR);
  setTargetDAGCombine(ISD::MSCATTER);
  setTargetDAGCombine(ISD::MGATHER);

  computeRegisterProperties(Subtarget.getRegisterInfo());

  MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
  MaxStoresPerMemsetOptSize = 8;
  MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
  MaxStoresPerMemcpyOptSize = 4;
  MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
  MaxStoresPerMemmoveOptSize = 4;

  // TODO: These control memcmp expansion in CGP and could be raised higher,
  // but that needs to be benchmarked and balanced with the potential use of
  // vector load/store types (PR33329, PR33914).
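  // With a limit of 2, e.g. a 16-byte memcmp on x86-64 can expand to two
  // 8-byte load/compare pairs instead of a library call.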
  MaxLoadsPerMemcmp = 2;
  MaxLoadsPerMemcmpOptSize = 2;

  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);

  // An out-of-order CPU can speculatively execute past a predictable branch,
  // but a conditional move could be stalled by an expensive earlier operation.
  PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
  EnableExtLdPromotion = true;
  setPrefFunctionAlignment(4); // 2^4 bytes.

  verifyIntrinsicTables();
}

// This has so far only been implemented for 64-bit MachO.
bool X86TargetLowering::useLoadStackGuardNode() const {
  return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
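
// Note: with the experimental widening flag set, e.g. v2i32 is legalized by
// widening to v4i32 rather than being promoted to v2i64, so the 32-bit
// element type is preserved.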

TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
  if (ExperimentalVectorWideningLegalization &&
      VT.getVectorNumElements() != 1 &&
      VT.getVectorElementType().getSimpleVT() != MVT::i1)
    return TypeWidenVector;

  return TargetLoweringBase::getPreferredVectorAction(VT);
}
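
// For example, with AVX-512 a compare of two v16i32 vectors produces a v16i1
// mask register, while a pre-AVX-512 target gets a v16i32 result from
// changeVectorElementTypeToInteger().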

EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
                                          LLVMContext& Context,
                                          EVT VT) const {
  if (!VT.isVector())
    return MVT::i8;

  if (VT.isSimple()) {
    MVT VVT = VT.getSimpleVT();
    const unsigned NumElts = VVT.getVectorNumElements();
    MVT EltVT = VVT.getVectorElementType();
    if (VVT.is512BitVector()) {
      if (Subtarget.hasAVX512())
        if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
            EltVT == MVT::f32 || EltVT == MVT::f64)
          switch (NumElts) {
          case 8: return MVT::v8i1;
          case 16: return MVT::v16i1;
          }
      if (Subtarget.hasBWI())
        if (EltVT == MVT::i8 || EltVT == MVT::i16)
          switch (NumElts) {
          case 32: return MVT::v32i1;
          case 64: return MVT::v64i1;
          }
    }

    if (Subtarget.hasBWI() && Subtarget.hasVLX())
      return MVT::getVectorVT(MVT::i1, NumElts);

    if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
      EVT LegalVT = getTypeToTransformTo(Context, VT);
      EltVT = LegalVT.getVectorElementType().getSimpleVT();
    }

    if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
      switch (NumElts) {
      case 2: return MVT::v2i1;
      case 4: return MVT::v4i1;
      case 8: return MVT::v8i1;
      }
  }

  return VT.changeVectorElementTypeToInteger();
}

/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
  if (MaxAlign == 16)
    return;
  if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
    if (VTy->getBitWidth() == 128)
      MaxAlign = 16;
  } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
    unsigned EltAlign = 0;
    getMaxByValAlign(ATy->getElementType(), EltAlign);
    if (EltAlign > MaxAlign)
      MaxAlign = EltAlign;
  } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
    for (auto *EltTy : STy->elements()) {
      unsigned EltAlign = 0;
      getMaxByValAlign(EltTy, EltAlign);
      if (EltAlign > MaxAlign)
        MaxAlign = EltAlign;
      if (MaxAlign == 16)
        break;
    }
  }
}

/// Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. For X86, aggregates
/// that contain SSE vectors are placed at 16-byte boundaries while the rest
/// are at 4-byte boundaries.
unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
                                                  const DataLayout &DL) const {
  if (Subtarget.is64Bit()) {
    // Max of 8 and alignment of type.
    unsigned TyAlign = DL.getABITypeAlignment(Ty);
    if (TyAlign > 8)
      return TyAlign;
    return 8;
  }

  unsigned Align = 4;
  if (Subtarget.hasSSE1())
    getMaxByValAlign(Ty, Align);
  return Align;
}

/// Returns the target specific optimal type for load
/// and store operations as a result of memset, memcpy, and memmove
/// lowering. If DstAlign is zero, the destination alignment can satisfy any
/// constraint. Similarly, if SrcAlign is zero there is no need to check it
/// against the alignment requirement, probably because the source does not
/// need to be loaded. If 'IsMemset' is true, that means it's expanding a
/// memset. If 'ZeroMemset' is true, that means it's a memset of zero.
/// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
/// not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT
X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                       unsigned DstAlign, unsigned SrcAlign,
                                       bool IsMemset, bool ZeroMemset,
                                       bool MemcpyStrSrc,
                                       MachineFunction &MF) const {
  const Function *F = MF.getFunction();
  if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
    if (Size >= 16 &&
        (!Subtarget.isUnalignedMem16Slow() ||
         ((DstAlign == 0 || DstAlign >= 16) &&
          (SrcAlign == 0 || SrcAlign >= 16)))) {
      // FIXME: Check if unaligned 32-byte accesses are slow.
      if (Size >= 32 && Subtarget.hasAVX()) {
        // Although this isn't a well-supported type for AVX1, we'll let
        // legalization and shuffle lowering produce the optimal codegen. If we
        // choose an optimal type with a vector element larger than a byte,
        // getMemsetStores() may create an intermediate splat (using an integer
        // multiply) before we splat as a vector.
        return MVT::v32i8;
      }
      if (Subtarget.hasSSE2())
        return MVT::v16i8;
      // TODO: Can SSE1 handle a byte vector?
      if (Subtarget.hasSSE1())
        return MVT::v4f32;
    } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
               !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
      // Do not use f64 to lower memcpy if source is string constant. It's
      // better to use i32 to avoid the loads.
      // Also, do not use f64 to lower memset unless this is a memset of zeros.
      // The gymnastics of splatting a byte value into an XMM register and then
      // only using 8-byte stores (because this is a CPU with slow unaligned
      // 16-byte accesses) makes that a loser.
      return MVT::f64;
    }
  }
  // This is a compromise. If we reach here, unaligned accesses may be slow on
  // this target. However, creating smaller, aligned accesses could be even
  // slower and would certainly be a lot more code.
  if (Subtarget.is64Bit() && Size >= 8)
    return MVT::i64;
  return MVT::i32;
}

bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
  if (VT == MVT::f32)
    return X86ScalarSSEf32;
  else if (VT == MVT::f64)
    return X86ScalarSSEf64;
  return true;
}

bool
X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
                                                  unsigned,
                                                  unsigned,
                                                  bool *Fast) const {
  if (Fast) {
    switch (VT.getSizeInBits()) {
    default:
      // 8-byte and under are always assumed to be fast.
      *Fast = true;
      break;
    case 128:
      *Fast = !Subtarget.isUnalignedMem16Slow();
      break;
    case 256:
      *Fast = !Subtarget.isUnalignedMem32Slow();
      break;
    // TODO: What about AVX-512 (512-bit) accesses?
    }
  }
  // Misaligned accesses of any size are always allowed.
  return true;
}

/// Return the entry encoding for a jump table in the
/// current function. The returned value is a member of the
/// MachineJumpTableInfo::JTEntryKind enum.
unsigned X86TargetLowering::getJumpTableEncoding() const {
  // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
  // symbol.
  if (isPositionIndependent() && Subtarget.isPICStyleGOT())
    return MachineJumpTableInfo::EK_Custom32;

  // Otherwise, use the normal jump table encoding heuristics.
  return TargetLowering::getJumpTableEncoding();
}

bool X86TargetLowering::useSoftFloat() const {
  return Subtarget.useSoftFloat();
}

void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
                                              ArgListTy &Args) const {
  // Only relabel X86-32 for C / Stdcall CCs.
  if (Subtarget.is64Bit())
    return;
  if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
    return;
  unsigned ParamRegs = 0;
  if (auto *M = MF->getFunction()->getParent())
    ParamRegs = M->getNumberRegisterParameters();

  // Mark the first N integer arguments as being passed in registers.
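  // e.g. with -mregparm=2 the first two eligible i32-sized arguments of a
  // runtime libcall are marked inreg instead of being passed on the stack.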
  for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
    Type *T = Args[Idx].Ty;
    if (T->isPointerTy() || T->isIntegerTy())
      if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
        unsigned numRegs = 1;
        if (MF->getDataLayout().getTypeAllocSize(T) > 4)
          numRegs = 2;
        if (ParamRegs < numRegs)
          return;
        ParamRegs -= numRegs;
        Args[Idx].IsInReg = true;
      }
  }
}

const MCExpr *
X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                             const MachineBasicBlock *MBB,
                                             unsigned uid, MCContext &Ctx) const {
  assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
  // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
  // entries.
  return MCSymbolRefExpr::create(MBB->getSymbol(),
                                 MCSymbolRefExpr::VK_GOTOFF, Ctx);
}

/// Returns relocation base for the given PIC jumptable.
SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
                                                    SelectionDAG &DAG) const {
  if (!Subtarget.is64Bit())
    // This doesn't have SDLoc associated with it, but is not really the
    // same as a Register.
    return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                       getPointerTy(DAG.getDataLayout()));
  return Table;
}

/// This returns the relocation base for the given PIC jumptable,
/// the same as getPICJumpTableRelocBase, but as an MCExpr.
const MCExpr *X86TargetLowering::
getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
                             MCContext &Ctx) const {
  // X86-64 uses RIP relative addressing based on the jump table label.
  if (Subtarget.isPICStyleRIPRel())
    return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);

  // Otherwise, the reference is relative to the PIC base.
  return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
}

std::pair<const TargetRegisterClass *, uint8_t>
X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
                                           MVT VT) const {
  const TargetRegisterClass *RRC = nullptr;
  uint8_t Cost = 1;
  switch (VT.SimpleTy) {
  default:
    return TargetLowering::findRepresentativeClass(TRI, VT);
  case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
    RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
    break;
  case MVT::x86mmx:
    RRC = &X86::VR64RegClass;
    break;
  case MVT::f32: case MVT::f64:
  case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
  case MVT::v4f32: case MVT::v2f64:
  case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
  case MVT::v8f32: case MVT::v4f64:
  case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
  case MVT::v16f32: case MVT::v8f64:
    RRC = &X86::VR128XRegClass;
    break;
  }
  return std::make_pair(RRC, Cost);
}

unsigned X86TargetLowering::getAddressSpace() const {
  if (Subtarget.is64Bit())
    return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
  return 256;
}

static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
}

static Constant* SegmentOffset(IRBuilder<> &IRB,
                               unsigned Offset, unsigned AddressSpace) {
  return ConstantExpr::getIntToPtr(
      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
}
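
// For example, SegmentOffset(IRB, 0x28, 257) yields an i8 addrspace(257)*
// constant, i.e. %fs:0x28 on x86-64 (LLVM models %gs as address space 256 and
// %fs as 257).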

Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
  // tcbhead_t; use it instead of the usual global variable (see
  // sysdeps/{i386,x86_64}/nptl/tls.h)
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
    if (Subtarget.isTargetFuchsia()) {
      // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
      return SegmentOffset(IRB, 0x10, getAddressSpace());
    }
    // %fs:0x28, unless we're using a Kernel code model, in which case
    // it's %gs:0x28. gs:0x14 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  return TargetLowering::getIRStackGuard(IRB);
}

void X86TargetLowering::insertSSPDeclarations(Module &M) const {
  // MSVC CRT provides functionalities for stack protection.
  if (Subtarget.getTargetTriple().isOSMSVCRT()) {
    // MSVC CRT has a global variable holding security cookie.
    M.getOrInsertGlobal("__security_cookie",
                        Type::getInt8PtrTy(M.getContext()));

    // MSVC CRT has a function to validate security cookie.
    auto *SecurityCheckCookie = cast<Function>(
        M.getOrInsertFunction("__security_check_cookie",
                              Type::getVoidTy(M.getContext()),
                              Type::getInt8PtrTy(M.getContext())));
    SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
    SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
    return;
  }
  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
    return;
  TargetLowering::insertSSPDeclarations(M);
}

Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
  // MSVC CRT has a global variable holding security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getGlobalVariable("__security_cookie");
  return TargetLowering::getSDagStackGuard(M);
}

Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
  // MSVC CRT has a function to validate security cookie.
  if (Subtarget.getTargetTriple().isOSMSVCRT())
    return M.getFunction("__security_check_cookie");
  return TargetLowering::getSSPStackGuardCheck(M);
}

Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
  if (Subtarget.getTargetTriple().isOSContiki())
    return getDefaultSafeStackPointerLocation(IRB, false);

  // Android provides a fixed TLS slot for the SafeStack pointer. See the
  // definition of TLS_SLOT_SAFESTACK in
  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
  if (Subtarget.isTargetAndroid()) {
    // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
    // %gs:0x24 on i386.
    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
    return SegmentOffset(IRB, Offset, getAddressSpace());
  }

  // Fuchsia is similar.
  if (Subtarget.isTargetFuchsia()) {
    // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
    return SegmentOffset(IRB, 0x18, getAddressSpace());
  }

  return TargetLowering::getSafeStackPointerLocation(IRB);
}

bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
  assert(SrcAS != DestAS && "Expected different address spaces!");
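  // Address spaces below 256 are all treated as flat; only the segment
  // address spaces (%gs = 256, %fs = 257) actually change how a pointer is
  // dereferenced.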
  return SrcAS < 256 && DestAS < 256;
}

//===----------------------------------------------------------------------===//
//               Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//

#include "X86GenCallingConv.inc"

bool X86TargetLowering::CanLowerReturn(
    CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
    const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
  return CCInfo.CheckReturn(Outs, RetCC_X86);
}

const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
  static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
  return ScratchRegs;
}

/// Lowers mask values (v*i1) to the local register values
/// \returns DAG node after lowering to register type
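/// For example, a v16i1 value headed for a 32-bit location is bitcast to i16
/// and then any-extended to i32; the upper bits are left unspecified.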
static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
                               const SDLoc &Dl, SelectionDAG &DAG) {
  EVT ValVT = ValArg.getValueType();

  if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
      (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
    // Two stage lowering might be required
    // bitcast:   v8i1 -> i8 / v16i1 -> i16
    // anyextend: i8   -> i32 / i16   -> i32
    EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
    SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
    if (ValLoc == MVT::i32)
      ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
    return ValToCopy;
  } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
             (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
    // One stage lowering is required
    // bitcast: v32i1 -> i32 / v64i1 -> i64
    return DAG.getBitcast(ValLoc, ValArg);
  }
  return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
}

/// Breaks v64i1 value into two registers and adds the new node to the DAG
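/// (bits [31:0] travel in the first register, bits [63:32] in the second).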
static void Passv64i1ArgInRegs(
    const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
    SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
    CCValAssign &NextVA, const X86Subtarget &Subtarget) {
  assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
         "Expected AVX512BW or AVX512BMI target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The value should reside in two registers");

  // Before splitting the value we cast it to i64
  Arg = DAG.getBitcast(MVT::i64, Arg);

  // Splitting the value into two i32 types
  SDValue Lo, Hi;
  Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(0, Dl, MVT::i32));
  Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
                   DAG.getConstant(1, Dl, MVT::i32));

  // Attach the two i32 types into corresponding registers
  RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
  RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
}

SDValue
X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                               bool isVarArg,
                               const SmallVectorImpl<ISD::OutputArg> &Outs,
                               const SmallVectorImpl<SDValue> &OutVals,
                               const SDLoc &dl, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  // In some cases we need to disable registers from the default CSR list.
  // For example, when they are used for argument passing.
  bool ShouldDisableCalleeSavedRegister =
      CallConv == CallingConv::X86_RegCall ||
      MF.getFunction()->hasFnAttribute("no_caller_saved_registers");

  if (CallConv == CallingConv::X86_INTR && !Outs.empty())
    report_fatal_error("X86 interrupts may not return any value");

  SmallVector<CCValAssign, 16> RVLocs;
  CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
  CCInfo.AnalyzeReturn(Outs, RetCC_X86);

  SDValue Flag;
  SmallVector<SDValue, 6> RetOps;
  RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
  // Operand #1 = Bytes To Pop
  RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
                                         MVT::i32));

  // Copy the result values into the output registers.
  for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++OutsIndex) {
    CCValAssign &VA = RVLocs[I];
    assert(VA.isRegLoc() && "Can only return in registers!");

    // Add the register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());

    SDValue ValToCopy = OutVals[OutsIndex];
    EVT ValVT = ValToCopy.getValueType();

    // Promote values to the appropriate types.
    if (VA.getLocInfo() == CCValAssign::SExt)
      ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::ZExt)
      ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
    else if (VA.getLocInfo() == CCValAssign::AExt) {
      if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
        ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
      else
        ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
    }
    else if (VA.getLocInfo() == CCValAssign::BCvt)
      ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);

    assert(VA.getLocInfo() != CCValAssign::FPExt &&
           "Unexpected FP-extend for return value.");

    // If this is x86-64, and we disabled SSE, we can't return FP values,
    // or SSE or MMX vectors.
    if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
         VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
        (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    } else if (ValVT == MVT::f64 &&
               (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
      // Likewise we can't return F64 values with SSE1 only. gcc does so, but
      // llvm-gcc has never done it right and no one has noticed, so this
      // should be OK for now.
      errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // Returns in ST0/ST1 are handled specially: these are pushed as operands to
    // the RET instruction and handled by the FP Stackifier.
    if (VA.getLocReg() == X86::FP0 ||
        VA.getLocReg() == X86::FP1) {
      // If this is a copy from an xmm register to ST(0), use an FPExtend to
      // change the value to the FP stack register class.
      if (isScalarFPTypeInSSEReg(VA.getValVT()))
        ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
      RetOps.push_back(ValToCopy);
      // Don't emit a copytoreg.
      continue;
    }

    // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
    // which is returned in RAX / RDX.
    if (Subtarget.is64Bit()) {
      if (ValVT == MVT::x86mmx) {
        if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
          ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
          ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
                                  ValToCopy);
          // If we don't have SSE2 available, convert to v4f32 so the generated
          // register is legal.
          if (!Subtarget.hasSSE2())
            ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
        }
      }
    }

    SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;

    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");

      Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
                         Subtarget);

      assert(2 == RegsToPass.size() &&
             "Expecting two registers after Pass64BitArgInRegs");

      // Add the second register to the CalleeSaveDisableRegs list.
      if (ShouldDisableCalleeSavedRegister)
        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
    } else {
      RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
    }

    // Add nodes to the DAG and add the values into the RetOps list
    for (auto &Reg : RegsToPass) {
      Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
      Flag = Chain.getValue(1);
      RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
    }
  }

  // Swift calling convention does not require we copy the sret argument
  // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.

  // All x86 ABIs require that for returning structs by value we copy
  // the sret argument into %rax/%eax (depending on ABI) for the return.
  // We saved the argument into a virtual register in the entry block,
  // so now we copy the value out and into %rax/%eax.
  //
  // Checking Function.hasStructRetAttr() here is insufficient because the IR
  // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
  // false, then an sret argument may be implicitly inserted in the SelDAG. In
  // either case FuncInfo->setSRetReturnReg() will have been called.
  if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
    // When we have both sret and another return value, we should use the
    // original Chain stored in RetOps[0], instead of the current Chain updated
    // in the above loop. If we only have sret, RetOps[0] equals to Chain.
    //
    // For the case of sret and another return value, we have
    //   Chain_0 at the function entry
    //   Chain_1 = getCopyToReg(Chain_0) in the above loop
    // If we use Chain_1 in getCopyFromReg, we will have
    //   Val = getCopyFromReg(Chain_1)
    //   Chain_2 = getCopyToReg(Chain_1, Val) from below
    //
    // getCopyToReg(Chain_0) will be glued together with
    // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
    // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
    //   Data dependency from Unit B to Unit A due to usage of Val in
    //     getCopyToReg(Chain_1, Val)
    //   Chain dependency from Unit A to Unit B
    //
    // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
    SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
                                     getPointerTy(MF.getDataLayout()));

    unsigned RetValReg
        = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
          X86::RAX : X86::EAX;
    Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
    Flag = Chain.getValue(1);

    // RAX/EAX now acts like a return value.
    RetOps.push_back(
        DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));

    // Add the returned register to the CalleeSaveDisableRegs list.
    if (ShouldDisableCalleeSavedRegister)
      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
  }

  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *I =
      TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
  if (I) {
    for (; *I; ++I) {
      if (X86::GR64RegClass.contains(*I))
        RetOps.push_back(DAG.getRegister(*I, MVT::i64));
      else
        llvm_unreachable("Unexpected register class in CSRsViaCopy!");
    }
  }

  RetOps[0] = Chain; // Update chain.

  // Add the flag if we have it.
  if (Flag.getNode())
    RetOps.push_back(Flag);

  X86ISD::NodeType opcode = X86ISD::RET_FLAG;
  if (CallConv == CallingConv::X86_INTR)
    opcode = X86ISD::IRET;
  return DAG.getNode(opcode, dl, MVT::Other, RetOps);
}

bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
  if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
    return false;

  SDValue TCChain = Chain;
  SDNode *Copy = *N->use_begin();
  if (Copy->getOpcode() == ISD::CopyToReg) {
    // If the copy has a glue operand, we conservatively assume it isn't safe to
    // perform a tail call.
    if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
      return false;
    TCChain = Copy->getOperand(0);
  } else if (Copy->getOpcode() != ISD::FP_EXTEND)
    return false;

  bool HasRet = false;
  for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
       UI != UE; ++UI) {
    if (UI->getOpcode() != X86ISD::RET_FLAG)
      return false;
    // If we are returning more than one value, we can definitely
    // not make a tail call; see PR19530.
    if (UI->getNumOperands() > 4)
      return false;
    if (UI->getNumOperands() == 4 &&
        UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
      return false;
    HasRet = true;
  }

  if (!HasRet)
    return false;

  Chain = TCChain;
  return true;
}

EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
                                           ISD::NodeType ExtendKind) const {
  MVT ReturnMVT = MVT::i32;

  bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
  if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
    // The ABI does not require i1, i8 or i16 to be extended.
    //
    // On Darwin, there is code in the wild relying on Clang's old behaviour of
    // always extending i8/i16 return values, so keep doing that for now.
    ReturnMVT = MVT::i8;
  }

  EVT MinVT = getRegisterType(Context, ReturnMVT);
  return VT.bitsLT(MinVT) ? MinVT : VT;
}

/// Reads two 32 bit registers and creates a 64 bit mask value.
/// \param VA The current 32 bit value that needs to be assigned.
/// \param NextVA The next 32 bit value that needs to be assigned.
/// \param Root The parent DAG node.
/// \param [in,out] InFlag Represents the SDValue in the parent DAG node for
///                        glue purposes. In case the DAG is already using a
///                        physical register instead of a virtual one, we
///                        should glue our new SDValue to the InFlag SDValue.
/// \return a new SDValue of size 64bit.
static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
                                SDValue &Root, SelectionDAG &DAG,
                                const SDLoc &Dl, const X86Subtarget &Subtarget,
                                SDValue *InFlag = nullptr) {
  assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
  assert(Subtarget.is32Bit() && "Expecting 32 bit target");
  assert(VA.getValVT() == MVT::v64i1 &&
         "Expecting first location of 64 bit width type");
  assert(NextVA.getValVT() == VA.getValVT() &&
         "The locations should have the same type");
  assert(VA.isRegLoc() && NextVA.isRegLoc() &&
         "The values should reside in two registers");

  SDValue Lo, Hi;
  unsigned Reg;
  SDValue ArgValueLo, ArgValueHi;

  MachineFunction &MF = DAG.getMachineFunction();
  const TargetRegisterClass *RC = &X86::GR32RegClass;

  // Read a 32 bit value from the registers.
  if (nullptr == InFlag) {
    // When no physical register is present,
    // create an intermediate virtual register.
    Reg = MF.addLiveIn(VA.getLocReg(), RC);
    ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
    Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
    ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
  } else {
    // When a physical register is available read the value from it and glue
    // the reads together.
    ArgValueLo =
        DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueLo.getValue(2);
    ArgValueHi =
        DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
    *InFlag = ArgValueHi.getValue(2);
  }

  // Convert the i32 type into v32i1 type.
  Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);

  // Convert the i32 type into v32i1 type.
  Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);

  // Concatenate the two values together.
  return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
}

/// The function will lower a register of various sizes (8/16/32/64)
/// to a mask value of the expected size (v8i1/v16i1/v32i1/v64i1)
/// \returns a DAG node containing the operand after lowering to mask type.
static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
                               const EVT &ValLoc, const SDLoc &Dl,
                               SelectionDAG &DAG) {
  SDValue ValReturned = ValArg;

  if (ValVT == MVT::v1i1)
    return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);

  if (ValVT == MVT::v64i1) {
    // On a 32-bit machine, this case is handled by getv64i1Argument.
    assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
    // On a 64-bit machine there is no need to truncate; the bitcast below
    // is enough.
  } else {
    MVT maskLen;
    switch (ValVT.getSimpleVT().SimpleTy) {
    case MVT::v8i1:
      maskLen = MVT::i8;
      break;
    case MVT::v16i1:
      maskLen = MVT::i16;
      break;
    case MVT::v32i1:
      maskLen = MVT::i32;
      break;
    default:
      llvm_unreachable("Expecting a vector of i1 types");
    }

    ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
  }
  return DAG.getBitcast(ValVT, ValReturned);
}

/// Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
///
SDValue X86TargetLowering::LowerCallResult(
    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
    uint32_t *RegMask) const {

  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
  // Assign locations to each value returned by this call.
  SmallVector<CCValAssign, 16> RVLocs;
  bool Is64Bit = Subtarget.is64Bit();
  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                 *DAG.getContext());
  CCInfo.AnalyzeCallResult(Ins, RetCC_X86);

  // Copy all of the result registers out of their specified physreg.
  for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
       ++I, ++InsIndex) {
    CCValAssign &VA = RVLocs[I];
    EVT CopyVT = VA.getLocVT();

    // In some calling conventions we need to remove the used registers
    // from the register mask.
    if (RegMask) {
      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
           SubRegs.isValid(); ++SubRegs)
        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
    }

    // If this is x86-64, and we disabled SSE, we can't return FP values
    if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
        ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
      errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
      VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
    }

    // If we prefer to use the value in xmm registers, copy it out as f80 and
    // use a truncate to move it from fp stack reg to xmm reg.
    bool RoundAfterCopy = false;
    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
        isScalarFPTypeInSSEReg(VA.getValVT())) {
      if (!Subtarget.hasX87())
        report_fatal_error("X87 register return with X87 disabled");
      CopyVT = MVT::f80;
      RoundAfterCopy = (CopyVT != VA.getLocVT());
    }

    SDValue Val;
    if (VA.needsCustom()) {
      assert(VA.getValVT() == MVT::v64i1 &&
             "Currently the only custom case is when we split v64i1 to 2 regs");
      Val =
          getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
    } else {
      Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
                  .getValue(1);
      Val = Chain.getValue(0);
      InFlag = Chain.getValue(2);
    }

    if (RoundAfterCopy)
      Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
                        // This truncation won't change the value.
                        DAG.getIntPtrConstant(1, dl));

    if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
      if (VA.getValVT().isVector() &&
          ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
           (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
        // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8.
        Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
      } else
        Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
    }

    InVals.push_back(Val);
  }

  return Chain;
}

//===----------------------------------------------------------------------===//
//                C & StdCall & Fast Calling Convention implementation
//===----------------------------------------------------------------------===//
//  StdCall calling convention seems to be standard for many Windows' API
//  routines and around. It differs from C calling convention just a little:
//  callee should clean up the stack, not caller. Symbols should also be
//  decorated in some fancy way :) It doesn't support any vector arguments.
//  For info on fast calling convention see Fast Calling Convention (tail call)
//  implementation LowerX86_32FastCCCallTo.

/// CallIsStructReturn - Determines whether a call uses struct return
/// semantics.
enum StructReturnType {
  NotStructReturn,
  RegStructReturn,
  StackStructReturn
};
static StructReturnType
callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
  if (Outs.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Determines whether a function uses struct return semantics.
static StructReturnType
argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
  if (Ins.empty())
    return NotStructReturn;

  const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
  if (!Flags.isSRet())
    return NotStructReturn;
  if (Flags.isInReg() || IsMCU)
    return RegStructReturn;
  return StackStructReturn;
}

/// Make a copy of an aggregate at address specified by "Src" to address
/// "Dst" with size and alignment information specified by the specific
/// parameter attribute. The copy will be passed as a byval function parameter.
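/// The copy is emitted inline (AlwaysInline) so that no call to the memcpy
/// library routine is introduced while another call is being lowered.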
static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
                                         SDValue Chain, ISD::ArgFlagsTy Flags,
                                         SelectionDAG &DAG, const SDLoc &dl) {
  SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);

  return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
                       /*isVolatile*/false, /*AlwaysInline=*/true,
                       /*isTailCall*/false,
                       MachinePointerInfo(), MachinePointerInfo());
}

/// Return true if the calling convention is one that we can guarantee TCO for.
static bool canGuaranteeTCO(CallingConv::ID CC) {
  return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
          CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
          CC == CallingConv::HHVM);
}

/// Return true if we might ever do TCO for calls with this calling convention.
static bool mayTailCallThisCC(CallingConv::ID CC) {
  switch (CC) {
  // C calling conventions:
  case CallingConv::C:
  case CallingConv::Win64:
  case CallingConv::X86_64_SysV:
  // Callee pop conventions:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_StdCall:
  case CallingConv::X86_VectorCall:
  case CallingConv::X86_FastCall:
    return true;
  default:
    return canGuaranteeTCO(CC);
  }
}

/// Return true if the function is being made into a tailcall target by
/// changing its ABI.
static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
  return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
}

bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
  auto Attr =
      CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
  if (!CI->isTailCall() || Attr.getValueAsString() == "true")
    return false;

  ImmutableCallSite CS(CI);
  CallingConv::ID CalleeCC = CS.getCallingConv();
  if (!mayTailCallThisCC(CalleeCC))
    return false;

  return true;
}

SDValue
X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                                    const SmallVectorImpl<ISD::InputArg> &Ins,
                                    const SDLoc &dl, SelectionDAG &DAG,
                                    const CCValAssign &VA,
                                    MachineFrameInfo &MFI, unsigned i) const {
  // Create the nodes corresponding to a load from this parameter slot.
  ISD::ArgFlagsTy Flags = Ins[i].Flags;
  bool AlwaysUseMutable = shouldGuaranteeTCO(
      CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
  bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
  EVT ValVT;
  MVT PtrVT = getPointerTy(DAG.getDataLayout());

  // If value is passed by pointer we have address passed instead of the value
  // itself. No need to extend if the mask value and location share the same
  // bit width.
  bool ExtendedInMem =
      VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
      VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();

  if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
    ValVT = VA.getLocVT();
  else
    ValVT = VA.getValVT();

  // Calculate SP offset of interrupt parameter, re-arrange the slot normally
  // taken by a return address.
  int Offset = 0;
  if (CallConv == CallingConv::X86_INTR) {
    // X86 interrupts may take one or two arguments.
    // On the stack there will be no return address as in regular call.
    // Offset of last argument needs to be set to -4/-8 bytes.
    // Where offset of the first argument out of two, should be set to 0 bytes.
    Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
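    // e.g. a handler with a single argument gets Offset -4/-8 (the slot where
    // the return address would normally live).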
2740 if (Subtarget.is64Bit() && Ins.size() == 2) {
2741 // The stack pointer needs to be realigned for 64 bit handlers with error
2742 // code, so the argument offset changes by 8 bytes.
2747 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2748 // changed with more analysis.
2749 // In case of tail call optimization mark all arguments mutable. Since they
2750 // could be overwritten by lowering of arguments in case of a tail call.
2751 if (Flags.isByVal()) {
2752 unsigned Bytes = Flags.getByValSize();
2753 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2754 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2755 // Adjust SP offset of interrupt parameter.
2756 if (CallConv == CallingConv::X86_INTR) {
2757 MFI.setObjectOffset(FI, Offset);
2759 return DAG.getFrameIndex(FI, PtrVT);
2762 // This is an argument in memory. We might be able to perform copy elision.
2763 if (Flags.isCopyElisionCandidate()) {
2764 EVT ArgVT = Ins[i].ArgVT;
2766 if (Ins[i].PartOffset == 0) {
2767 // If this is a one-part value or the first part of a multi-part value,
2768 // create a stack object for the entire argument value type and return a
2769 // load from our portion of it. This assumes that if the first part of an
2770 // argument is in memory, the rest will also be in memory.
2771 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2772 /*Immutable=*/false);
2773 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2775 ValVT, dl, Chain, PartAddr,
2776 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2778 // This is not the first piece of an argument in memory. See if there is
2779 // already a fixed stack object including this offset. If so, assume it
2780 // was created by the PartOffset == 0 branch above and create a load from
2781 // the appropriate offset into it.
2782 int64_t PartBegin = VA.getLocMemOffset();
2783 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2784 int FI = MFI.getObjectIndexBegin();
2785 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2786 int64_t ObjBegin = MFI.getObjectOffset(FI);
2787 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2788 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2791 if (MFI.isFixedObjectIndex(FI)) {
2793 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2794 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
        return DAG.getLoad(
            ValVT, dl, Chain, Addr,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
                                              Ins[i].PartOffset));
      }
    }
  }
2803 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2804 VA.getLocMemOffset(), isImmutable);
2806 // Set SExt or ZExt flag.
2807 if (VA.getLocInfo() == CCValAssign::ZExt) {
2808 MFI.setObjectZExt(FI, true);
2809 } else if (VA.getLocInfo() == CCValAssign::SExt) {
    MFI.setObjectSExt(FI, true);
  }
2813 // Adjust SP offset of interrupt parameter.
2814 if (CallConv == CallingConv::X86_INTR) {
    MFI.setObjectOffset(FI, Offset);
  }
2818 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2819 SDValue Val = DAG.getLoad(
2820 ValVT, dl, Chain, FIN,
2821 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
  return ExtendedInMem
             ? (VA.getValVT().isVector()
                    ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
                    : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
             : Val;
}
2829 // FIXME: Get this from tablegen.
2830 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2831 const X86Subtarget &Subtarget) {
2832 assert(Subtarget.is64Bit());
2834 if (Subtarget.isCallingConvWin64(CallConv)) {
    static const MCPhysReg GPR64ArgRegsWin64[] = {
      X86::RCX, X86::RDX, X86::R8, X86::R9
    };
    return makeArrayRef(std::begin(GPR64ArgRegsWin64),
                        std::end(GPR64ArgRegsWin64));
  }
  static const MCPhysReg GPR64ArgRegs64Bit[] = {
    X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
  };
  return makeArrayRef(std::begin(GPR64ArgRegs64Bit),
                      std::end(GPR64ArgRegs64Bit));
}
2847 // FIXME: Get this from tablegen.
2848 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2849 CallingConv::ID CallConv,
2850 const X86Subtarget &Subtarget) {
2851 assert(Subtarget.is64Bit());
2852 if (Subtarget.isCallingConvWin64(CallConv)) {
    // The XMM registers which might contain var arg parameters are shadowed
    // in their paired GPR. So we only need to save the GPR to their home
    // slots.
    // TODO: __vectorcall will change this.
    return None;
  }
2860 const Function *Fn = MF.getFunction();
2861 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2862 bool isSoftFloat = Subtarget.useSoftFloat();
2863 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2864 "SSE register cannot be used when SSE is disabled!");
  if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
    // Kernel mode asks for SSE to be disabled, so there are no XMM argument
    // registers.
    return None;
  static const MCPhysReg XMMArgRegs64Bit[] = {
    X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
    X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
  };
  return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
}
2878 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
  return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
                        [](const CCValAssign &A, const CCValAssign &B) -> bool {
                          return A.getValNo() < B.getValNo();
                        });
}
2886 SDValue X86TargetLowering::LowerFormalArguments(
2887 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2888 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2889 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2890 MachineFunction &MF = DAG.getMachineFunction();
2891 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2892 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2894 const Function *Fn = MF.getFunction();
2895 if (Fn->hasExternalLinkage() &&
2896 Subtarget.isTargetCygMing() &&
2897 Fn->getName() == "main")
2898 FuncInfo->setForceFramePointer(true);
2900 MachineFrameInfo &MFI = MF.getFrameInfo();
2901 bool Is64Bit = Subtarget.is64Bit();
2902 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
  assert(
      !(isVarArg && canGuaranteeTCO(CallConv)) &&
      "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
2908 if (CallConv == CallingConv::X86_INTR) {
    bool isLegal = Ins.size() == 1 ||
                   (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
                                        (!Is64Bit && Ins[1].VT == MVT::i32)));
    if (!isLegal)
      report_fatal_error("X86 interrupts may take one or two arguments");
  }
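  // For example (illustrative IR, not from this file), a 64-bit handler that
  // also receives an error code is declared as:
  //   define x86_intrcc void @handler(i8* %frame, i64 %error_code) { ... }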
2916 // Assign locations to all of the incoming arguments.
2917 SmallVector<CCValAssign, 16> ArgLocs;
2918 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
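  // Illustrative arithmetic: the 32 bytes are the four home slots (RCX, RDX,
  // R8, R9) at 8 bytes each, which the Win64 ABI requires the caller to
  // reserve.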
2924 CCInfo.AnalyzeArguments(Ins, CC_X86);
  // In the vectorcall calling convention a second pass is required for the
  // HVA registers.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
  }
  // The next loop assumes that the locations are in the same order as the
  // Ins declaration.
2934 assert(isSortedByValueNo(ArgLocs) &&
2935 "Argument Location list must be sorted before lowering");
  for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++InsIndex) {
2940 assert(InsIndex < Ins.size() && "Invalid Ins index");
    CCValAssign &VA = ArgLocs[I];
    SDValue ArgValue;
2943 if (VA.isRegLoc()) {
2944 EVT RegVT = VA.getLocVT();
      if (VA.needsCustom()) {
        assert(
            VA.getValVT() == MVT::v64i1 &&
            "Currently the only custom case is when we split v64i1 to 2 regs");
        // In the regcall calling convention, v64i1 values compiled for a
        // 32-bit target are split up into two registers.
        ArgValue =
            getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
      } else {
2955 const TargetRegisterClass *RC;
2956 if (RegVT == MVT::i32)
2957 RC = &X86::GR32RegClass;
2958 else if (Is64Bit && RegVT == MVT::i64)
2959 RC = &X86::GR64RegClass;
2960 else if (RegVT == MVT::f32)
2961 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2962 else if (RegVT == MVT::f64)
2963 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2964 else if (RegVT == MVT::f80)
2965 RC = &X86::RFP80RegClass;
2966 else if (RegVT == MVT::f128)
2967 RC = &X86::FR128RegClass;
2968 else if (RegVT.is512BitVector())
2969 RC = &X86::VR512RegClass;
2970 else if (RegVT.is256BitVector())
2971 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2972 else if (RegVT.is128BitVector())
2973 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2974 else if (RegVT == MVT::x86mmx)
2975 RC = &X86::VR64RegClass;
2976 else if (RegVT == MVT::v1i1)
2977 RC = &X86::VK1RegClass;
2978 else if (RegVT == MVT::v8i1)
2979 RC = &X86::VK8RegClass;
2980 else if (RegVT == MVT::v16i1)
2981 RC = &X86::VK16RegClass;
2982 else if (RegVT == MVT::v32i1)
2983 RC = &X86::VK32RegClass;
2984 else if (RegVT == MVT::v64i1)
        RC = &X86::VK64RegClass;
      else
        llvm_unreachable("Unknown argument type!");
2989 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2990 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
      // If this is an 8 or 16-bit value, it is really passed promoted to 32
      // bits. Insert an assert[sz]ext to capture this, then truncate to the
      // right size.
2996 if (VA.getLocInfo() == CCValAssign::SExt)
2997 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
2998 DAG.getValueType(VA.getValVT()));
2999 else if (VA.getLocInfo() == CCValAssign::ZExt)
3000 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3001 DAG.getValueType(VA.getValVT()));
3002 else if (VA.getLocInfo() == CCValAssign::BCvt)
3003 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3005 if (VA.isExtInLoc()) {
3006 // Handle MMX values passed in XMM regs.
3007 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3008 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3009 else if (VA.getValVT().isVector() &&
3010 VA.getValVT().getScalarType() == MVT::i1 &&
3011 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3012 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3013 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
        ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
      } else
        ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
      }
    } else {
      assert(VA.isMemLoc());
      ArgValue =
          LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
    }
3024 // If value is passed via pointer - do a load.
    if (VA.getLocInfo() == CCValAssign::Indirect)
      ArgValue =
          DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());

    InVals.push_back(ArgValue);
  }
3032 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
    // The Swift calling convention does not require us to copy the sret
    // argument into %rax/%eax for the return, so SRetReturnReg is not set for
    // Swift.
    if (CallConv == CallingConv::Swift)
      continue;
    // All x86 ABIs require that for returning structs by value we copy the
    // sret argument into %rax/%eax (depending on ABI) for the return. Save
    // the argument into a virtual register so that we can access it from the
    // return points.
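    // For example (illustrative IR, not from this file):
    //   define void @f(%struct.S* sret %out)
    // must leave %out in %eax/%rax when it returns.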
3042 if (Ins[I].Flags.isSRet()) {
      unsigned Reg = FuncInfo->getSRetReturnReg();
      if (Reg == 0) {
        MVT PtrTy = getPointerTy(DAG.getDataLayout());
        Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
        FuncInfo->setSRetReturnReg(Reg);
      }
      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
      break;
    }
  }
3055 unsigned StackSize = CCInfo.getNextStackOffset();
3056 // Align stack specially for tail calls.
3057 if (shouldGuaranteeTCO(CallConv,
3058 MF.getTarget().Options.GuaranteedTailCallOpt))
3059 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3061 // If the function takes variable number of arguments, make a frame index for
3062 // the start of the first vararg value... for expansion of llvm.va_start. We
3063 // can skip this if there are no va_start calls.
3064 if (MFI.hasVAStart() &&
3065 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3066 CallConv != CallingConv::X86_ThisCall))) {
    FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
  }
3070 // Figure out if XMM registers are in use.
3071 assert(!(Subtarget.useSoftFloat() &&
3072 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3073 "SSE register cannot be used when SSE is disabled!");
3075 // 64-bit calling conventions support varargs and register parameters, so we
3076 // have to do extra work to spill them in the prologue.
3077 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3078 // Find the first unallocated argument registers.
3079 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3080 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3081 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3082 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3083 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3084 "SSE register cannot be used when SSE is disabled!");
3086 // Gather all the live in physical registers.
    SmallVector<SDValue, 6> LiveGPRs;
    SmallVector<SDValue, 8> LiveXMMRegs;
    SDValue ALVal;
    for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
      unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
      LiveGPRs.push_back(
          DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
    }
3095 if (!ArgXMMs.empty()) {
3096 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3097 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3098 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3099 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
        LiveXMMRegs.push_back(
            DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
      }
    }

    if (IsWin64) {
      // Get to the caller-allocated home save location. Add 8 to account
3107 // for the return address.
3108 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3109 FuncInfo->setRegSaveFrameIndex(
3110 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
      // Fixup to set vararg frame on shadow area (4 x i64).
      if (NumIntRegs < 4)
        FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
    } else {
3115 // For X86-64, if there are vararg parameters that are passed via
3116 // registers, then we must store them to their spots on the stack so
3117 // they may be loaded by dereferencing the result of va_next.
3118 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3119 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3120 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
          ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
    }
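    // Worked numbers (illustrative): on SysV x86-64 the full save area is
    // 6 GPRs * 8 + 8 XMMs * 16 = 176 bytes; va_arg's gp_offset starts at
    // NumIntRegs * 8 and fp_offset at 48 + NumXMMRegs * 16, matching the
    // setVarArgsGPOffset/setVarArgsFPOffset calls above.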
3124 // Store the integer parameter registers.
3125 SmallVector<SDValue, 8> MemOps;
3126 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3127 getPointerTy(DAG.getDataLayout()));
3128 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3129 for (SDValue Val : LiveGPRs) {
3130 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3131 RSFIN, DAG.getIntPtrConstant(Offset, dl));
      SDValue Store =
          DAG.getStore(Val.getValue(1), dl, Val, FIN,
                       MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(),
                           FuncInfo->getRegSaveFrameIndex(), Offset));
      MemOps.push_back(Store);
      Offset += 8;
    }
3141 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3142 // Now store the XMM (fp + vector) parameter registers.
3143 SmallVector<SDValue, 12> SaveXMMOps;
3144 SaveXMMOps.push_back(Chain);
3145 SaveXMMOps.push_back(ALVal);
3146 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3147 FuncInfo->getRegSaveFrameIndex(), dl));
3148 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3149 FuncInfo->getVarArgsFPOffset(), dl));
      SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
                        LiveXMMRegs.end());
      MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
                                   MVT::Other, SaveXMMOps));
    }
3156 if (!MemOps.empty())
      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
  }
3160 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3161 // Find the largest legal vector type.
3162 MVT VecVT = MVT::Other;
3163 // FIXME: Only some x86_32 calling conventions support AVX512.
3164 if (Subtarget.hasAVX512() &&
3165 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3166 CallConv == CallingConv::Intel_OCL_BI)))
      VecVT = MVT::v16f32;
    else if (Subtarget.hasAVX())
      VecVT = MVT::v8f32;
    else if (Subtarget.hasSSE2())
      VecVT = MVT::v4f32;
3173 // We forward some GPRs and some vector types.
3174 SmallVector<MVT, 2> RegParmTypes;
3175 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3176 RegParmTypes.push_back(IntVT);
3177 if (VecVT != MVT::Other)
3178 RegParmTypes.push_back(VecVT);
3180 // Compute the set of forwarded registers. The rest are scratch.
3181 SmallVectorImpl<ForwardedRegister> &Forwards =
3182 FuncInfo->getForwardedMustTailRegParms();
3183 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3185 // Conservatively forward AL on x86_64, since it might be used for varargs.
3186 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3187 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
      Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
    }
3191 // Copy all forwards from physical to virtual registers.
3192 for (ForwardedRegister &F : Forwards) {
3193 // FIXME: Can we use a less constrained schedule?
3194 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3195 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
      Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
    }
  }
3200 // Some CCs need callee pop.
3201 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3202 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3203 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3204 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
    // X86 interrupts must pop the error code (and the alignment padding) if
    // present.
    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
  } else {
    FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3210 // If this is an sret function, the return should pop the hidden pointer.
3211 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3212 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3213 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
      FuncInfo->setBytesToPopOnReturn(4);
  }

  if (!Is64Bit) {
3218 // RegSaveFrameIndex is X86-64 only.
3219 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3220 if (CallConv == CallingConv::X86_FastCall ||
3221 CallConv == CallingConv::X86_ThisCall)
3222 // fastcc functions can't have varargs.
      FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
  }
3226 FuncInfo->setArgumentStackSize(StackSize);
3228 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3229 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
    if (Personality == EHPersonality::CoreCLR) {
      assert(Is64Bit);
3232 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3233 // that we'd prefer this slot be allocated towards the bottom of the frame
3234 // (i.e. near the stack pointer after allocating the frame). Every
3235 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3236 // offset from the bottom of this and each funclet's frame must be the
3237 // same, so the size of funclets' (mostly empty) frames is dictated by
3238 // how far this slot is from the bottom (since they allocate just enough
3239 // space to accommodate holding this slot at the correct offset).
3240 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
      EHInfo->PSPSymFrameIdx = PSPSymFI;
    }
  }
3245 if (CallConv == CallingConv::X86_RegCall ||
3246 Fn->hasFnAttribute("no_caller_saved_registers")) {
3247 const MachineRegisterInfo &MRI = MF.getRegInfo();
3248 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
  }

  return Chain;
}
SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
                                            SDValue Arg, const SDLoc &dl,
                                            SelectionDAG &DAG,
                                            const CCValAssign &VA,
                                            ISD::ArgFlagsTy Flags) const {
3260 unsigned LocMemOffset = VA.getLocMemOffset();
3261 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
  PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                       StackPtr, PtrOff);
3264 if (Flags.isByVal())
3265 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3267 return DAG.getStore(
3268 Chain, dl, Arg, PtrOff,
      MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
}
3272 /// Emit a load of return address if tail call
3273 /// optimization is performed and it is required.
3274 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3275 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3276 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3277 // Adjust the Return address stack slot.
3278 EVT VT = getPointerTy(DAG.getDataLayout());
3279 OutRetAddr = getReturnAddressFrameIndex(DAG);
3281 // Load the "old" Return address.
3282 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
  return SDValue(OutRetAddr.getNode(), 1);
}
3286 /// Emit a store of the return address if tail call
3287 /// optimization is performed and it is required (FPDiff!=0).
3288 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3289 SDValue Chain, SDValue RetAddrFrIdx,
3290 EVT PtrVT, unsigned SlotSize,
3291 int FPDiff, const SDLoc &dl) {
3292 // Store the return address to the appropriate stack slot.
3293 if (!FPDiff) return Chain;
3294 // Calculate the new stack slot for the return address.
  int NewReturnAddrFI =
      MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
                                          false);
3298 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3299 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3300 MachinePointerInfo::getFixedStack(
                           DAG.getMachineFunction(), NewReturnAddrFI));
  return Chain;
}
/// Returns a vector_shuffle mask for a movs{s|d} or movd
/// operation of the specified width.
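/// For example (illustrative): with VT = v4f32 the mask is <4, 1, 2, 3>, so
/// element 0 comes from V2 and elements 1-3 come from V1, which is exactly
/// the MOVSS behaviour.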
static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
                       SDValue V2) {
3309 unsigned NumElems = VT.getVectorNumElements();
3310 SmallVector<int, 8> Mask;
3311 Mask.push_back(NumElems);
  for (unsigned i = 1; i != NumElems; ++i)
    Mask.push_back(i);
  return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
}

SDValue
X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3319 SmallVectorImpl<SDValue> &InVals) const {
  SelectionDAG &DAG = CLI.DAG;
  SDLoc &dl = CLI.DL;
3322 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3323 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3324 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3325 SDValue Chain = CLI.Chain;
3326 SDValue Callee = CLI.Callee;
3327 CallingConv::ID CallConv = CLI.CallConv;
3328 bool &isTailCall = CLI.IsTailCall;
3329 bool isVarArg = CLI.IsVarArg;
3331 MachineFunction &MF = DAG.getMachineFunction();
3332 bool Is64Bit = Subtarget.is64Bit();
3333 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3334 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3335 bool IsSibcall = false;
3336 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3337 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3338 const CallInst *CI =
3339 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3340 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3341 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3342 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3344 if (CallConv == CallingConv::X86_INTR)
3345 report_fatal_error("X86 interrupts may not be called directly");
  if (Attr.getValueAsString() == "true")
    isTailCall = false;
3350 if (Subtarget.isPICStyleGOT() &&
3351 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3352 // If we are using a GOT, disable tail calls to external symbols with
3353 // default visibility. Tail calling such a symbol requires using a GOT
    // relocation, which forces early binding of the symbol. This breaks code
    // that requires lazy function symbol resolution. Using musttail or
    // GuaranteedTailCallOpt will override this.
3357 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3358 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
               G->getGlobal()->hasDefaultVisibility()))
      isTailCall = false;
  }
  bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
  if (IsMustTail) {
    // Force this to be a tail call. The verifier rules are enough to ensure
    // that we can lower this successfully without moving the return address
    // around.
    isTailCall = true;
  } else if (isTailCall) {
3370 // Check if it's really possible to do a tail call.
3371 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3372 isVarArg, SR != NotStructReturn,
3373 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3374 Outs, OutVals, Ins, DAG);
    // Sibcalls are automatically detected tailcalls which do not require
    // ABI changes.
    if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
      IsSibcall = true;

    if (isTailCall)
      ++NumTailCalls;
  }
3385 assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3386 "Var args not supported with calling convention fastcc, ghc or hipe");
3388 // Analyze operands of the call, assigning locations to each operand.
3389 SmallVector<CCValAssign, 16> ArgLocs;
3390 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
  // Allocate shadow area for Win64.
  if (IsWin64)
    CCInfo.AllocateStack(32, 8);
3396 CCInfo.AnalyzeArguments(Outs, CC_X86);
  // In the vectorcall calling convention a second pass is required for the
  // HVA registers.
  if (CallingConv::X86_VectorCall == CallConv) {
    CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
  }
3404 // Get a count of how many bytes are to be pushed on the stack.
3405 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
  if (IsSibcall)
    // This is a sibcall. The memory operands are available in the caller's
    // own caller's stack.
    NumBytes = 0;
3410 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3411 canGuaranteeTCO(CallConv))
3412 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
  int FPDiff = 0;
  if (isTailCall && !IsSibcall && !IsMustTail) {
3416 // Lower arguments at fp - stackoffset + fpdiff.
3417 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3419 FPDiff = NumBytesCallerPushed - NumBytes;
3421 // Set the delta of movement of the returnaddr stackslot.
3422 // But only set if delta is greater than previous delta.
3423 if (FPDiff < X86Info->getTCReturnAddrDelta())
      X86Info->setTCReturnAddrDelta(FPDiff);
  }
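  // Worked example (illustrative): if the caller pops 16 bytes of its own
  // incoming arguments but this call needs 32 bytes, FPDiff = 16 - 32 = -16,
  // so the return address slot must move 16 bytes further from the frame.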
3427 unsigned NumBytesToPush = NumBytes;
3428 unsigned NumBytesToPop = NumBytes;
  // If we have an inalloca argument, all stack space has already been
  // allocated for us and will be right at the top of the stack. We don't
  // support multiple arguments passed in memory when using inalloca.
  if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
    NumBytesToPush = 0;
    if (!ArgLocs.back().isMemLoc())
      report_fatal_error("cannot use inalloca attribute on a register "
                         "parameter");
    if (ArgLocs.back().getLocMemOffset() != 0)
      report_fatal_error("any parameter with the inalloca attribute must be "
                         "the only memory argument");
  }
  if (!IsSibcall)
    Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
                                 NumBytes - NumBytesToPush, dl);
3447 SDValue RetAddrFrIdx;
3448 // Load return address for tail calls.
3449 if (isTailCall && FPDiff)
3450 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3451 Is64Bit, FPDiff, dl);
3453 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
  SmallVector<SDValue, 8> MemOpChains;
  SDValue StackPtr;
  // The next loop assumes that the locations are in the same order as the
  // Outs declaration.
3459 assert(isSortedByValueNo(ArgLocs) &&
3460 "Argument Location list must be sorted before lowering");
  // Walk the register/memloc assignments, inserting copies/loads. In the case
  // of tail call optimization, arguments are handled later.
3464 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
       ++I, ++OutIndex) {
3467 assert(OutIndex < Outs.size() && "Invalid Out index");
3468 // Skip inalloca arguments, they have already been written.
3469 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
    if (Flags.isInAlloca())
      continue;
3473 CCValAssign &VA = ArgLocs[I];
3474 EVT RegVT = VA.getLocVT();
3475 SDValue Arg = OutVals[OutIndex];
3476 bool isByVal = Flags.isByVal();
3478 // Promote the value if needed.
3479 switch (VA.getLocInfo()) {
3480 default: llvm_unreachable("Unknown loc info!");
3481 case CCValAssign::Full: break;
    case CCValAssign::SExt:
      Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::ZExt:
      Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::AExt:
3489 if (Arg.getValueType().isVector() &&
3490 Arg.getValueType().getVectorElementType() == MVT::i1)
3491 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3492 else if (RegVT.is128BitVector()) {
3493 // Special case: passing MMX values in XMM registers.
3494 Arg = DAG.getBitcast(MVT::i64, Arg);
3495 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
        Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
      } else
        Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
      break;
    case CCValAssign::BCvt:
      Arg = DAG.getBitcast(RegVT, Arg);
      break;
3503 case CCValAssign::Indirect: {
3504 // Store the argument.
3505 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3506 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3507 Chain = DAG.getStore(
3508 Chain, dl, Arg, SpillSlot,
          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
      Arg = SpillSlot;
      break;
    }
    }

    if (VA.needsCustom()) {
3516 assert(VA.getValVT() == MVT::v64i1 &&
3517 "Currently the only custom case is when we split v64i1 to 2 regs");
3518 // Split v64i1 value into two registers
      Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
                         Subtarget);
3521 } else if (VA.isRegLoc()) {
3522 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3523 if (isVarArg && IsWin64) {
3524 // Win64 ABI requires argument XMM reg to be copied to the corresponding
3525 // shadow reg if callee is a varargs function.
3526 unsigned ShadowReg = 0;
3527 switch (VA.getLocReg()) {
3528 case X86::XMM0: ShadowReg = X86::RCX; break;
3529 case X86::XMM1: ShadowReg = X86::RDX; break;
3530 case X86::XMM2: ShadowReg = X86::R8; break;
3531 case X86::XMM3: ShadowReg = X86::R9; break;
        }
        if (ShadowReg)
          RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
      }
    } else if (!IsSibcall && (!isTailCall || isByVal)) {
3537 assert(VA.isMemLoc());
3538 if (!StackPtr.getNode())
3539 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3540 getPointerTy(DAG.getDataLayout()));
3541 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
                                             dl, DAG, VA, Flags));
    }
  }
3546 if (!MemOpChains.empty())
3547 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3549 if (Subtarget.isPICStyleGOT()) {
    // ELF / PIC requires GOT in the EBX register before function calls via
    // the PLT GOT pointer.
    if (!isTailCall) {
      RegsToPass.push_back(std::make_pair(
          unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
                                          getPointerTy(DAG.getDataLayout()))));
    } else {
3557 // If we are tail calling and generating PIC/GOT style code load the
3558 // address of the callee into ECX. The value in ecx is used as target of
3559 // the tail jump. This is done to circumvent the ebx/callee-saved problem
3560 // for tail calls on PIC/GOT architectures. Normally we would just put the
3561 // address of GOT into ebx and then call target@PLT. But for tail calls
    // ebx would be restored (since ebx is callee saved) before jumping to the
    // callee.
    //
    // Note: The actual moving to ECX is done further down.
3566 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3567 if (G && !G->getGlobal()->hasLocalLinkage() &&
3568 G->getGlobal()->hasDefaultVisibility())
3569 Callee = LowerGlobalAddress(Callee, DAG);
      else if (isa<ExternalSymbolSDNode>(Callee))
        Callee = LowerExternalSymbol(Callee, DAG);
    }
  }
3575 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3576 // From AMD64 ABI document:
3577 // For calls that may call functions that use varargs or stdargs
3578 // (prototype-less calls or calls to functions containing ellipsis (...) in
3579 // the declaration) %al is used as hidden argument to specify the number
    // of SSE registers used. The contents of %al do not need to match exactly
    // the number of registers, but must be an upper bound on the number of
    // SSE registers used and is in the range 0 - 8 inclusive.
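    // For instance (illustrative): a prototype-less call passing one double
    // in XMM0 may set %al to 1; any upper bound from 1 to 8 is valid, and 8
    // is always a safe (if pessimistic) choice.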
3584 // Count the number of XMM registers allocated.
3585 static const MCPhysReg XMMArgRegs[] = {
3586 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3587 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3589 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3590 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3591 && "SSE registers cannot be used when SSE is disabled");
    RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
                                        DAG.getConstant(NumXMMRegs, dl,
                                                        MVT::i8)));
  }
3598 if (isVarArg && IsMustTail) {
3599 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3600 for (const auto &F : Forwards) {
3601 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
      RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
    }
  }
3606 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3607 // don't need this because the eligibility check rejects calls that require
3608 // shuffling arguments passed in memory.
3609 if (!IsSibcall && isTailCall) {
3610 // Force all the incoming stack arguments to be loaded from the stack
3611 // before any new outgoing arguments are stored to the stack, because the
3612 // outgoing stack slots may alias the incoming argument stack slots, and
3613 // the alias isn't otherwise explicit. This is slightly more conservative
3614 // than necessary, because it means that each store effectively depends
3615 // on every argument instead of just those arguments it would clobber.
3616 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
    SmallVector<SDValue, 8> MemOpChains2;
    SDValue FIN;
    int FI;
    for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
         ++I, ++OutsIndex) {
3623 CCValAssign &VA = ArgLocs[I];
3625 if (VA.isRegLoc()) {
3626 if (VA.needsCustom()) {
3627 assert((CallConv == CallingConv::X86_RegCall) &&
3628 "Expecting custom case only in regcall calling convention");
3629 // This means that we are in special case where one argument was
          // passed through two register locations - skip the next location.
          ++I;
        }

        continue;
      }
3637 assert(VA.isMemLoc());
3638 SDValue Arg = OutVals[OutsIndex];
3639 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3640 // Skip inalloca arguments. They don't require any work.
3641 if (Flags.isInAlloca())
3643 // Create frame index.
3644 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3645 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3646 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3647 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3649 if (Flags.isByVal()) {
3650 // Copy relative to framepointer.
3651 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3652 if (!StackPtr.getNode())
3653 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3654 getPointerTy(DAG.getDataLayout()));
        Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
                             StackPtr, Source);
        MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
                                                         ArgChain,
                                                         Flags, DAG, dl));
      } else {
3662 // Store relative to framepointer.
3663 MemOpChains2.push_back(DAG.getStore(
3664 ArgChain, dl, Arg, FIN,
            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
      }
    }
3669 if (!MemOpChains2.empty())
3670 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3672 // Store the return address to the appropriate stack slot.
3673 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3674 getPointerTy(DAG.getDataLayout()),
                                     RegInfo->getSlotSize(), FPDiff, dl);
  }
3678 // Build a sequence of copy-to-reg nodes chained together with token chain
3679 // and flag operands which copy the outgoing args into registers.
  SDValue InFlag;
  for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3682 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3683 RegsToPass[i].second, InFlag);
    InFlag = Chain.getValue(1);
  }
3687 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3688 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3689 // In the 64-bit large code model, we have to make all calls
3690 // through a register, since the call instruction's 32-bit
    // pc-relative offset may not be large enough to hold the whole
    // address.
  } else if (Callee->getOpcode() == ISD::GlobalAddress) {
    // If the callee is a GlobalAddress node (quite common, every direct call
    // is) turn it into a TargetGlobalAddress node so that legalize doesn't
    // hack it.
    GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
    // We should use an extra load for direct calls to dllimported functions
    // in non-JIT mode.
    const GlobalValue *GV = G->getGlobal();
3702 if (!GV->hasDLLImportStorageClass()) {
3703 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3705 Callee = DAG.getTargetGlobalAddress(
3706 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3708 if (OpFlags == X86II::MO_GOTPCREL) {
3710 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3711 getPointerTy(DAG.getDataLayout()), Callee);
3712 // Add extra indirection
3713 Callee = DAG.getLoad(
3714 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
            MachinePointerInfo::getGOT(DAG.getMachineFunction()));
      }
    }
3718 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3719 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3720 unsigned char OpFlags =
3721 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3723 Callee = DAG.getTargetExternalSymbol(
3724 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3725 } else if (Subtarget.isTarget64BitILP32() &&
3726 Callee->getValueType(0) == MVT::i32) {
3727 // Zero-extend the 32-bit Callee address into a 64-bit according to x32 ABI
    Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
  }
3731 // Returns a chain & a flag for retval copy to use.
3732 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3733 SmallVector<SDValue, 8> Ops;
3735 if (!IsSibcall && isTailCall) {
3736 Chain = DAG.getCALLSEQ_END(Chain,
3737 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3738 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
    InFlag = Chain.getValue(1);
  }
3742 Ops.push_back(Chain);
3743 Ops.push_back(Callee);
  if (isTailCall)
    Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
  // Add argument registers to the end of the list so that they are known to
  // be live into the call.
3750 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3751 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3752 RegsToPass[i].second.getValueType()));
3754 // Add a register mask operand representing the call-preserved registers.
  // If HasNCSR is asserted (i.e. the NoCallerSavedRegisters attribute is
  // present), then we use the X86_INTR calling convention because it has the
  // same CSR mask (same preserved registers).
3758 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3759 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3760 assert(Mask && "Missing call preserved mask for calling convention");
3762 // If this is an invoke in a 32-bit function using a funclet-based
3763 // personality, assume the function clobbers all registers. If an exception
3764 // is thrown, the runtime will not restore CSRs.
3765 // FIXME: Model this more precisely so that we can register allocate across
3766 // the normal edge and spill and fill across the exceptional edge.
3767 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3768 const Function *CallerFn = MF.getFunction();
3769 EHPersonality Pers =
3770 CallerFn->hasPersonalityFn()
3771 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3772 : EHPersonality::Unknown;
3773 if (isFuncletEHPersonality(Pers))
      Mask = RegInfo->getNoPreservedMask();
  }
3777 // Define a new register mask from the existing mask.
3778 uint32_t *RegMask = nullptr;
3780 // In some calling conventions we need to remove the used physical registers
3781 // from the reg mask.
3782 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3783 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3785 // Allocate a new Reg Mask and copy Mask.
3786 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3787 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3788 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
    // Make sure all sub registers of the argument registers are reset
    // in the RegMask.
3792 for (auto const &RegPair : RegsToPass)
3793 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3794 SubRegs.isValid(); ++SubRegs)
3795 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
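    // Illustrative bit math: register number R lives at bit (R % 32) of word
    // (R / 32), so clearing an argument register such as ESI also clears the
    // bits for SI and SIL as the sub-register iterator walks them.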
3797 // Create the RegMask Operand according to our updated mask.
    Ops.push_back(DAG.getRegisterMask(RegMask));
  } else
    // Create the RegMask Operand according to the static mask.
    Ops.push_back(DAG.getRegisterMask(Mask));
3804 if (InFlag.getNode())
    Ops.push_back(InFlag);

  if (isTailCall) {
3809 //// If this is the first return lowered for this function, add the regs
3810 //// to the liveout set for the function.
3811 // This isn't right, although it's probably harmless on x86; liveouts
3812 // should be computed from returns not tail calls. Consider a void
3813 // function making a tail call to a function returning int.
3814 MF.getFrameInfo().setHasTailCall();
    return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
  }
3818 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3819 InFlag = Chain.getValue(1);
3821 // Create the CALLSEQ_END node.
3822 unsigned NumBytesForCalleeToPop;
3823 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3824 DAG.getTarget().Options.GuaranteedTailCallOpt))
3825 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3826 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3827 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3828 SR == StackStructReturn)
3829 // If this is a call to a struct-return function, the callee
3830 // pops the hidden struct pointer, so we have to push it back.
3831 // This is common for Darwin/X86, Linux & Mingw32 targets.
3832 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3833 NumBytesForCalleeToPop = 4;
3835 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3837 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3838 // No need to reset the stack after the call if the call doesn't return. To
3839 // make the MI verify, we'll pretend the callee does it for us.
    NumBytesForCalleeToPop = NumBytes;
  }
  // Returns a flag for retval copy to use.
  if (!IsSibcall) {
    Chain = DAG.getCALLSEQ_END(Chain,
                               DAG.getIntPtrConstant(NumBytesToPop, dl, true),
                               DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
                                                     true),
                               InFlag, dl);
    InFlag = Chain.getValue(1);
  }
  // Handle result values, copying them out of physregs into vregs that we
  // return.
  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
                         InVals, RegMask);
}
3859 //===----------------------------------------------------------------------===//
3860 // Fast Calling Convention (tail call) implementation
3861 //===----------------------------------------------------------------------===//
//  Like the StdCall convention, the callee cleans up the arguments, except
//  that ECX is reserved for storing the address of the tail-called function.
//  Only 2 registers are free for argument passing (inreg). Tail call
//  optimization is performed provided:
3867 // * tailcallopt is enabled
3868 // * caller/callee are fastcc
3869 // On X86_64 architecture with GOT-style position independent code only local
3870 // (within module) calls are supported at the moment.
3871 // To keep the stack aligned according to platform abi the function
3872 // GetAlignedArgumentStackSize ensures that argument delta is always multiples
3873 // of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3874 // If a tail called function callee has more arguments than the caller the
3875 // caller needs to make sure that there is room to move the RETADDR to. This is
3876 // achieved by reserving an area the size of the argument delta right after the
3877 // original RETADDR, but before the saved framepointer or the spilled registers
3878 // e.g. caller(arg1, arg2) calls callee(arg1, arg2,arg3,arg4)
/// Align the stack size, e.g. to 16n + 12 for a 16-byte alignment
/// requirement with a 4-byte return-address slot.
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3894 SelectionDAG& DAG) const {
3895 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3896 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3897 unsigned StackAlignment = TFI.getStackAlignment();
3898 uint64_t AlignMask = StackAlignment - 1;
3899 int64_t Offset = StackSize;
3900 unsigned SlotSize = RegInfo->getSlotSize();
  if ((Offset & AlignMask) <= (StackAlignment - SlotSize)) {
    // Number smaller than 12 so just add the difference.
    Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
  } else {
    // Mask out lower bits, add stackalignment once plus the 12 bytes.
    Offset = ((~AlignMask) & Offset) + StackAlignment +
             (StackAlignment - SlotSize);
  }
  return Offset;
}
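// Worked example (illustrative, 64-bit: StackAlignment = 16, SlotSize = 8):
// StackSize 20 becomes 20 + (8 - 4) = 24, and StackSize 28 becomes
// (28 & ~15) + 16 + 8 = 40; both end up congruent to 8 mod 16, so the stack
// is 16-byte aligned again once the 8-byte return address is pushed.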
3912 /// Return true if the given stack call argument is already available in the
3913 /// same position (relatively) of the caller's incoming argument stack.
static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
                                const MachineRegisterInfo *MRI,
                                const X86InstrInfo *TII,
                                const CCValAssign &VA) {
  unsigned Bytes = Arg.getValueSizeInBits() / 8;
  int FI = INT_MAX;
3921 // Look through nodes that don't alter the bits of the incoming value.
3922 unsigned Op = Arg.getOpcode();
3923 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
    Arg = Arg.getOperand(0);
  }
3927 if (Op == ISD::TRUNCATE) {
3928 const SDValue &TruncInput = Arg.getOperand(0);
3929 if (TruncInput.getOpcode() == ISD::AssertZext &&
3930 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3931 Arg.getValueType()) {
      Arg = TruncInput.getOperand(0);
    }
  }
3940 if (Arg.getOpcode() == ISD::CopyFromReg) {
    unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!TargetRegisterInfo::isVirtualRegister(VR))
      return false;
    MachineInstr *Def = MRI->getVRegDef(VR);
    if (!Def)
      return false;
    if (!Flags.isByVal()) {
      if (!TII->isLoadFromStackSlot(*Def, FI))
        return false;
    } else {
      unsigned Opcode = Def->getOpcode();
3952 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3953 Opcode == X86::LEA64_32r) &&
3954 Def->getOperand(1).isFI()) {
3955 FI = Def->getOperand(1).getIndex();
        Bytes = Flags.getByValSize();
      } else
        return false;
    }
3960 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3961 if (Flags.isByVal())
3962 // ByVal argument is passed in as a pointer but it's now being
3963 // dereferenced. e.g.
3964 // define @foo(%struct.X* %A) {
      //   tail call @bar(%struct.X* byval %A)
      // }
      return false;
3968 SDValue Ptr = Ld->getBasePtr();
    FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
    if (!FINode)
      return false;
3972 FI = FINode->getIndex();
3973 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3974 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3975 FI = FINode->getIndex();
    Bytes = Flags.getByValSize();
  } else
    return false;
3980 assert(FI != INT_MAX);
  if (!MFI.isFixedObjectIndex(FI))
    return false;

  if (Offset != MFI.getObjectOffset(FI))
    return false;
3987 // If this is not byval, check that the argument stack object is immutable.
3988 // inalloca and argument copy elision can create mutable argument stack
  // objects. Byval objects can be mutated, but a byval call intends to pass
  // the mutated memory.
  if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
    return false;
3994 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
3995 // If the argument location is wider than the argument type, check that any
3996 // extension flags match.
3997 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
        Flags.isSExt() != MFI.isObjectSExt(FI)) {
      return false;
    }
  }

  return Bytes == MFI.getObjectSize(FI);
}
4006 /// Check whether the call is eligible for tail call optimization. Targets
4007 /// that want to do tail call optimization should implement this function.
4008 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4009 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4010 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4011 const SmallVectorImpl<ISD::OutputArg> &Outs,
4012 const SmallVectorImpl<SDValue> &OutVals,
4013 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
  if (!mayTailCallThisCC(CalleeCC))
    return false;
4017 // If -tailcallopt is specified, make fastcc functions tail-callable.
4018 MachineFunction &MF = DAG.getMachineFunction();
4019 const Function *CallerF = MF.getFunction();
4021 // If the function return type is x86_fp80 and the callee return type is not,
4022 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4023 // perform a tailcall optimization here.
  if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
    return false;
4027 CallingConv::ID CallerCC = CallerF->getCallingConv();
4028 bool CCMatch = CallerCC == CalleeCC;
4029 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4030 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4032 // Win64 functions have extra shadow space for argument homing. Don't do the
  // sibcall if the caller and callee have mismatched expectations for this
  // space.
  if (IsCalleeWin64 != IsCallerWin64)
    return false;
4038 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
    if (canGuaranteeTCO(CalleeCC) && CCMatch)
      return true;
    return false;
  }
4044 // Look for obvious safe cases to perform tail call optimization that do not
4045 // require ABI changes. This is what gcc calls sibcall.
4047 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4048 // emit a special epilogue.
4049 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  if (RegInfo->needsStackRealignment(MF))
    return false;
4053 // Also avoid sibcall optimization if either caller or callee uses struct
4054 // return semantics.
  if (isCalleeStructRet || isCallerStructRet)
    return false;
  // Do not sibcall optimize vararg calls unless all arguments are passed via
  // registers.
4060 LLVMContext &C = *DAG.getContext();
4061 if (isVarArg && !Outs.empty()) {
4062 // Optimizing for varargs on Win64 is unlikely to be safe without
4063 // additional testing.
    if (IsCalleeWin64 || IsCallerWin64)
      return false;
4067 SmallVector<CCValAssign, 16> ArgLocs;
4068 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4070 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4071 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
      if (!ArgLocs[i].isRegLoc())
        return false;
  }
4076 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4077 // stack. Therefore, if it's not used by the call it is not safe to optimize
4078 // this into a sibcall.
4079 bool Unused = false;
  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
    if (!Ins[i].Used) {
      Unused = true;
      break;
    }
  }
  if (Unused) {
4087 SmallVector<CCValAssign, 16> RVLocs;
4088 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4089 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4090 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4091 CCValAssign &VA = RVLocs[i];
      if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
        return false;
    }
  }
4097 // Check that the call results are passed in the same way.
  if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
                                  RetCC_X86, RetCC_X86))
    return false;
4101 // The callee has to preserve all registers the caller needs to preserve.
4102 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4103 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
  if (!CCMatch) {
    const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
    if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
      return false;
  }
4110 unsigned StackArgsSize = 0;
  // If the callee takes no arguments then go on to check the results of the
  // call.
4114 if (!Outs.empty()) {
4115 // Check if stack adjustment is needed. For now, do not do this if any
4116 // argument is passed on the stack.
4117 SmallVector<CCValAssign, 16> ArgLocs;
4118 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
    // Allocate shadow area for Win64.
    if (IsCalleeWin64)
      CCInfo.AllocateStack(32, 8);
4124 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4125 StackArgsSize = CCInfo.getNextStackOffset();
4127 if (CCInfo.getNextStackOffset()) {
4128 // Check if the arguments are already laid out in the right way as
4129 // the caller's fixed stack objects.
4130 MachineFrameInfo &MFI = MF.getFrameInfo();
4131 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4132 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4133 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4134 CCValAssign &VA = ArgLocs[i];
4135 SDValue Arg = OutVals[i];
4136 ISD::ArgFlagsTy Flags = Outs[i].Flags;
        if (VA.getLocInfo() == CCValAssign::Indirect)
          return false;
        if (!VA.isRegLoc()) {
          if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
                                   MFI, MRI, TII, VA))
            return false;
        }
      }
    }
4147 bool PositionIndependent = isPositionIndependent();
4148 // If the tailcall address may be in a register, then make sure it's
4149 // possible to register allocate for it. In 32-bit, the call address can
4150 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4151 // callee-saved registers are restored. These happen to be the same
4152 // registers used to pass 'inreg' arguments so watch out for those.
4153 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4154 !isa<ExternalSymbolSDNode>(Callee)) ||
4155 PositionIndependent)) {
4156 unsigned NumInRegs = 0;
      // In PIC we need an extra register to formulate the address computation
      // for the callee.
4159 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4161 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
        CCValAssign &VA = ArgLocs[i];
        if (!VA.isRegLoc())
          continue;
        unsigned Reg = VA.getLocReg();
        switch (Reg) {
        default: break;
        case X86::EAX: case X86::EDX: case X86::ECX:
          if (++NumInRegs == MaxInRegs)
            return false;
          break;
        }
      }
    }
4176 const MachineRegisterInfo &MRI = MF.getRegInfo();
    if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
      return false;
  }
4181 bool CalleeWillPop =
4182 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4183 MF.getTarget().Options.GuaranteedTailCallOpt);
4185 if (unsigned BytesToPop =
4186 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4187 // If we have bytes to pop, the callee must pop them.
4188 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
    if (!CalleePopMatches)
      return false;
  } else if (CalleeWillPop && StackArgsSize > 0) {
    // If we don't have bytes to pop, make sure the callee doesn't pop any.
    return false;
  }

  return true;
}

FastISel *
X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4201 const TargetLibraryInfo *libInfo) const {
  return X86::createFastISel(funcInfo, libInfo);
}
4205 //===----------------------------------------------------------------------===//
4206 // Other Lowering Hooks
4207 //===----------------------------------------------------------------------===//
4209 static bool MayFoldLoad(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
}
4213 static bool MayFoldIntoStore(SDValue Op) {
  return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
}
4217 static bool MayFoldIntoZeroExtend(SDValue Op) {
4218 if (Op.hasOneUse()) {
4219 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
    return (ISD::ZERO_EXTEND == Opcode);
  }
  return false;
}

static bool isTargetShuffle(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
4228 case X86ISD::BLENDI:
4229 case X86ISD::PSHUFB:
4230 case X86ISD::PSHUFD:
4231 case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::SHUFP:
4234 case X86ISD::INSERTPS:
4235 case X86ISD::EXTRQI:
4236 case X86ISD::INSERTQI:
4237 case X86ISD::PALIGNR:
4238 case X86ISD::VSHLDQ:
4239 case X86ISD::VSRLDQ:
4240 case X86ISD::MOVLHPS:
4241 case X86ISD::MOVLHPD:
4242 case X86ISD::MOVHLPS:
4243 case X86ISD::MOVLPS:
4244 case X86ISD::MOVLPD:
4245 case X86ISD::MOVSHDUP:
4246 case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
4250 case X86ISD::UNPCKL:
4251 case X86ISD::UNPCKH:
4252 case X86ISD::VBROADCAST:
4253 case X86ISD::VPERMILPI:
4254 case X86ISD::VPERMILPV:
4255 case X86ISD::VPERM2X128:
4256 case X86ISD::VPERMIL2:
4257 case X86ISD::VPERMI:
4258 case X86ISD::VPPERM:
4259 case X86ISD::VPERMV:
4260 case X86ISD::VPERMV3:
4261 case X86ISD::VPERMIV3:
  case X86ISD::VZEXT_MOVL:
    return true;
  }
}

static bool isTargetShuffleVariableMask(unsigned Opcode) {
  switch (Opcode) {
  default: return false;
  // Target Shuffles.
  case X86ISD::PSHUFB:
4272 case X86ISD::VPERMILPV:
4273 case X86ISD::VPERMIL2:
4274 case X86ISD::VPPERM:
4275 case X86ISD::VPERMV:
4276 case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
    return true;
  // 'Faux' Target Shuffles.
  case ISD::AND:
  case X86ISD::ANDNP:
    return true;
  }
}
4286 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4287 MachineFunction &MF = DAG.getMachineFunction();
4288 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4289 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4290 int ReturnAddrIndex = FuncInfo->getRAIndex();
4292 if (ReturnAddrIndex == 0) {
4293 // Set up a frame object for the return address.
4294 unsigned SlotSize = RegInfo->getSlotSize();
    ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
                                                          -(int64_t)SlotSize,
                                                          false);
    FuncInfo->setRAIndex(ReturnAddrIndex);
  }

  return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
}
4304 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4305 bool hasSymbolicDisplacement) {
4306 // Offset should fit into 32 bit immediate field.
  if (!isInt<32>(Offset))
    return false;
  // If we don't have a symbolic displacement - we don't have any extra
  // restrictions.
  if (!hasSymbolicDisplacement)
    return true;
4315 // FIXME: Some tweaks might be needed for medium code model.
  if (M != CodeModel::Small && M != CodeModel::Kernel)
    return false;
4319 // For small code model we assume that latest object is 16MB before end of 31
4320 // bits boundary. We may also accept pretty large negative constants knowing
4321 // that all objects are in the positive half of address space.
  if (M == CodeModel::Small && Offset < 16*1024*1024)
    return true;
  // For the kernel code model we know that all objects reside in the negative
  // half of the 32-bit address space. We may not accept negative offsets,
  // since they may be just off and we may accept pretty large positive ones.
  if (M == CodeModel::Kernel && Offset >= 0)
    return true;

  return false;
}
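// Worked example (illustrative): in the small code model, sym+0x100000 (1MiB)
// is accepted while sym+0x1000000 (16MiB) is rejected, since the latest
// object is assumed to end 16MB before the 2GB boundary.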
4334 /// Determines whether the callee is required to pop its own arguments.
4335 /// Callee pop is necessary to support tail calls.
4336 bool X86::isCalleePop(CallingConv::ID CallingConv,
4337 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4338 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4339 // can guarantee TCO.
  if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
    return true;
  switch (CallingConv) {
  default:
    return false;
  case CallingConv::X86_StdCall:
  case CallingConv::X86_FastCall:
  case CallingConv::X86_ThisCall:
  case CallingConv::X86_VectorCall:
    return true;
  }
}
4354 /// \brief Return true if the condition is an unsigned comparison operation.
static bool isX86CCUnsigned(unsigned X86CC) {
  switch (X86CC) {
  default:
    llvm_unreachable("Invalid integer condition!");
  case X86::COND_E:
  case X86::COND_G:
  case X86::COND_GE:
  case X86::COND_L:
  case X86::COND_LE:
  case X86::COND_NE:
    return false;
  case X86::COND_B:
  case X86::COND_A:
  case X86::COND_BE:
  case X86::COND_AE:
    return true;
  }
}
4374 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4375 switch (SetCCOpcode) {
4376 default: llvm_unreachable("Invalid integer condition!");
4377 case ISD::SETEQ: return X86::COND_E;
4378 case ISD::SETGT: return X86::COND_G;
4379 case ISD::SETGE: return X86::COND_GE;
4380 case ISD::SETLT: return X86::COND_L;
4381 case ISD::SETLE: return X86::COND_LE;
4382 case ISD::SETNE: return X86::COND_NE;
4383 case ISD::SETULT: return X86::COND_B;
4384 case ISD::SETUGT: return X86::COND_A;
4385 case ISD::SETULE: return X86::COND_BE;
4386 case ISD::SETUGE: return X86::COND_AE;
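// For example, the unsigned (setult x, y) maps to COND_B, which tests CF,
// while the signed (setlt x, y) maps to COND_L, which tests SF != OF.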
4390 /// Do a one-to-one translation of a ISD::CondCode to the X86-specific
4391 /// condition code, returning the condition code and the LHS/RHS of the
4392 /// comparison to make.
4393 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4394 bool isFP, SDValue &LHS, SDValue &RHS,
4395 SelectionDAG &DAG) {
4397 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4398 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4399 // X > -1 -> X == 0, jump !sign.
4400 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4401 return X86::COND_NS;
4403 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4404 // X < 0 -> X == 0, jump on sign.
4407 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4408 // X < 1 -> X <= 0
4409 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4410 return X86::COND_LE;
4414 return TranslateIntegerX86CC(SetCCOpcode);
4417 // First determine if it is required or profitable to flip the operands.
4419 // If LHS is a foldable load, but RHS is not, flip the condition.
4420 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4421 !ISD::isNON_EXTLoad(RHS.getNode())) {
4422 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4423 std::swap(LHS, RHS);
4426 switch (SetCCOpcode) {
4432 std::swap(LHS, RHS);
4436 // On a floating point condition, the flags are set as follows:
4437 // ZF | PF | CF | op
4438 // 0 | 0 | 0 | X > Y
4439 // 0 | 0 | 1 | X < Y
4440 // 1 | 0 | 0 | X == Y
4441 // 1 | 1 | 1 | unordered
4442 switch (SetCCOpcode) {
4443 default: llvm_unreachable("Condcode should be pre-legalized away");
4445 case ISD::SETEQ: return X86::COND_E;
4446 case ISD::SETOLT: // flipped
4448 case ISD::SETGT: return X86::COND_A;
4449 case ISD::SETOLE: // flipped
4451 case ISD::SETGE: return X86::COND_AE;
4452 case ISD::SETUGT: // flipped
4454 case ISD::SETLT: return X86::COND_B;
4455 case ISD::SETUGE: // flipped
4457 case ISD::SETLE: return X86::COND_BE;
4459 case ISD::SETNE: return X86::COND_NE;
4460 case ISD::SETUO: return X86::COND_P;
4461 case ISD::SETO: return X86::COND_NP;
4463 case ISD::SETUNE: return X86::COND_INVALID;
4467 /// Is there a floating point cmov for the specific X86 condition code?
4468 /// The current x86 ISA includes the following FP cmov instructions:
4469 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4470 static bool hasFPCMov(unsigned X86CC) {
4487 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4489 unsigned Intrinsic) const {
4491 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4495 Info.opc = ISD::INTRINSIC_W_CHAIN;
4496 Info.readMem = false;
4497 Info.writeMem = false;
4501 switch (IntrData->Type) {
4502 case EXPAND_FROM_MEM: {
4503 Info.ptrVal = I.getArgOperand(0);
4504 Info.memVT = MVT::getVT(I.getType());
4506 Info.readMem = true;
4509 case COMPRESS_TO_MEM: {
4510 Info.ptrVal = I.getArgOperand(0);
4511 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4513 Info.writeMem = true;
4516 case TRUNCATE_TO_MEM_VI8:
4517 case TRUNCATE_TO_MEM_VI16:
4518 case TRUNCATE_TO_MEM_VI32: {
4519 Info.ptrVal = I.getArgOperand(0);
4520 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4521 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4522 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4524 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4525 ScalarVT = MVT::i16;
4526 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4527 ScalarVT = MVT::i32;
4529 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4531 Info.writeMem = true;
4541 /// Returns true if the target can instruction select the
4542 /// specified FP immediate natively. If false, the legalizer will
4543 /// materialize the FP immediate as a load from a constant pool.
4544 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4545 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4546 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4552 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4553 ISD::LoadExtType ExtTy,
4555 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4556 // relocations must target a movq or addq instruction: don't let the load shrink.
4557 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4558 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4559 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4560 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4564 /// \brief Returns true if it is beneficial to convert a load of a constant
4565 /// to just the constant itself.
4566 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4568 assert(Ty->isIntegerTy());
4570 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4571 if (BitSize == 0 || BitSize > 64)
4576 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4577 unsigned Index) const {
4578 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4581 return (Index == 0 || Index == ResVT.getVectorNumElements());
4584 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4585 // Speculate cttz only if we can directly use TZCNT.
4586 return Subtarget.hasBMI();
4589 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4590 // Speculate ctlz only if we can directly use LZCNT.
4591 return Subtarget.hasLZCNT();
4594 bool X86TargetLowering::isCtlzFast() const {
4595 return Subtarget.hasFastLZCNT();
4598 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4599 const Instruction &AndI) const {
4603 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4604 if (!Subtarget.hasBMI())
4607 // There are only 32-bit and 64-bit forms for 'andn'.
4608 EVT VT = Y.getValueType();
4609 if (VT != MVT::i32 && VT != MVT::i64)
4615 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4616 MVT VT = MVT::getIntegerVT(NumBits);
4617 if (isTypeLegal(VT))
4620 // PMOVMSKB can handle this.
4621 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4624 // VPMOVMSKB can handle this.
4625 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4628 // TODO: Allow 64-bit type for 32-bit target.
4629 // TODO: 512-bit types should be allowed, but make sure that those
4630 // cases are handled in combineVectorSizedSetCCEquality().
4632 return MVT::INVALID_SIMPLE_VALUE_TYPE;
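// For example, on an AVX2 target a 256-bit memcmp-style equality test can be
// lowered as VPCMPEQB + VPMOVMSKB and a scalar compare of the 32-bit mask.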
4635 /// Return true if Val is the undef sentinel value or equal to the specified value.
4636 static bool isUndefOrEqual(int Val, int CmpVal) {
4637 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4640 /// Return true if Val is either the undef or zero sentinel value.
4641 static bool isUndefOrZero(int Val) {
4642 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4645 /// Return true if every element in Mask, beginning
4646 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4647 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4648 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4649 if (Mask[i] != SM_SentinelUndef)
4654 /// Return true if Val is undef or if its value falls within the
4655 /// specified half-open range [Low, Hi).
4656 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4657 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
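// For example, isUndefOrInRange(SM_SentinelUndef, 0, 4) and
// isUndefOrInRange(3, 0, 4) return true, but isUndefOrInRange(4, 0, 4)
// returns false since Hi is exclusive.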
4660 /// Return true if every element in Mask is undef or if its value
4661 /// falls within the specified range [Low, Hi).
4662 static bool isUndefOrInRange(ArrayRef<int> Mask,
4665 if (!isUndefOrInRange(M, Low, Hi))
4670 /// Return true if Val is undef, zero or if its value falls within the
4671 /// specified range [Low, Hi).
4672 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4673 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4676 /// Return true if every element in Mask is undef, zero or if its value
4677 /// falls within the specified range [Low, Hi).
4678 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4680 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4685 /// Return true if every element in Mask, beginning
4686 /// from position Pos and ending in Pos+Size, falls within the specified
4687 /// sequential range [Low, Low+Size), or is undef.
4688 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4689 unsigned Pos, unsigned Size, int Low) {
4690 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4691 if (!isUndefOrEqual(Mask[i], Low))
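// For example, isSequentialOrUndefInRange({4, -1, 6, 7}, 0, 4, 4) returns
// true: the mask matches the sequence 4,5,6,7 with element 1 undef.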
4696 /// Return true if every element in Mask, beginning
4697 /// from position Pos and ending in Pos+Size, falls within the specified
4698 /// sequential range [Low, Low+Size), or is undef or zero.
4699 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4700 unsigned Size, int Low) {
4701 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4702 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4707 /// Return true if every element in Mask, beginning
4708 /// from position Pos and ending in Pos+Size is undef or is zero.
4709 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4711 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4712 if (!isUndefOrZero(Mask[i]))
4717 /// \brief Helper function to test whether a shuffle mask could be
4718 /// simplified by widening the elements being shuffled.
4720 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4721 /// leaves it in an unspecified state.
4723 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4724 /// shuffle masks. The latter have the special property of a '-2' representing
4725 /// a zeroed lane of a vector.
4726 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4727 SmallVectorImpl<int> &WidenedMask) {
4728 WidenedMask.assign(Mask.size() / 2, 0);
4729 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4731 int M1 = Mask[i + 1];
4733 // If both elements are undef, it's trivial.
4734 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4735 WidenedMask[i / 2] = SM_SentinelUndef;
4739 // Check for an undef mask and a mask value properly aligned to fit with
4740 // a pair of values. If we find such a case, use the non-undef mask's value.
4741 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4742 WidenedMask[i / 2] = M1 / 2;
4745 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4746 WidenedMask[i / 2] = M0 / 2;
4750 // When zeroing, we need to spread the zeroing across both lanes to widen.
4751 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4752 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4753 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4754 WidenedMask[i / 2] = SM_SentinelZero;
4760 // Finally check if the two mask values are adjacent and aligned with
4761 // a pair.
4762 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4763 WidenedMask[i / 2] = M0 / 2;
4767 // Otherwise we can't safely widen the elements used in this shuffle.
4770 assert(WidenedMask.size() == Mask.size() / 2 &&
4771 "Incorrect size of mask after widening the elements!");
4776 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4777 /// mask index with the scaled sequential indices for an equivalent narrowed
4778 /// mask. This is the reverse process to canWidenShuffleElements, but can
4779 /// always succeed.
4780 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4781 SmallVectorImpl<int> &ScaledMask) {
4782 assert(0 < Scale && "Unexpected scaling factor");
4783 int NumElts = Mask.size();
4784 ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
4786 for (int i = 0; i != NumElts; ++i) {
4789 // Repeat sentinel values in every mask element.
4791 for (int s = 0; s != Scale; ++s)
4792 ScaledMask[(Scale * i) + s] = M;
4796 // Scale mask element and increment across each mask element.
4797 for (int s = 0; s != Scale; ++s)
4798 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
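// For example, scaleShuffleMask(2, {1, -1, 0, 3}) produces
// {2, 3, -1, -1, 0, 1, 6, 7}.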
4802 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4803 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4804 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4805 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4806 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4809 // The index should be aligned on a vecWidth-bit boundary.
4810 uint64_t Index = N->getConstantOperandVal(1);
4811 MVT VT = N->getSimpleValueType(0);
4812 unsigned ElSize = VT.getScalarSizeInBits();
4813 return (Index * ElSize) % vecWidth == 0;
4816 /// Return true if the specified INSERT_SUBVECTOR
4817 /// operand specifies a subvector insert that is suitable for the insertion
4818 /// of 128- or 256-bit subvectors.
4819 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4820 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4821 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4824 // The index should be aligned on a vecWidth-bit boundary.
4825 uint64_t Index = N->getConstantOperandVal(2);
4826 MVT VT = N->getSimpleValueType(0);
4827 unsigned ElSize = VT.getScalarSizeInBits();
4828 return (Index * ElSize) % vecWidth == 0;
4831 bool X86::isVINSERT128Index(SDNode *N) {
4832 return isVINSERTIndex(N, 128);
4835 bool X86::isVINSERT256Index(SDNode *N) {
4836 return isVINSERTIndex(N, 256);
4839 bool X86::isVEXTRACT128Index(SDNode *N) {
4840 return isVEXTRACTIndex(N, 128);
4843 bool X86::isVEXTRACT256Index(SDNode *N) {
4844 return isVEXTRACTIndex(N, 256);
4847 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4848 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4849 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4850 "Illegal extract subvector for VEXTRACT");
4852 uint64_t Index = N->getConstantOperandVal(1);
4853 MVT VecVT = N->getOperand(0).getSimpleValueType();
4854 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4855 return Index / NumElemsPerChunk;
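// For example, extracting from element index 4 of a v8i32 source with
// vecWidth == 128 gives NumElemsPerChunk == 4, so the immediate is
// 4 / 4 == 1, selecting the upper 128-bit half.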
4858 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4859 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4860 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4861 "Illegal insert subvector for VINSERT");
4863 uint64_t Index = N->getConstantOperandVal(2);
4864 MVT VecVT = N->getSimpleValueType(0);
4865 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4866 return Index / NumElemsPerChunk;
4869 /// Return the appropriate immediate to extract the specified
4870 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4871 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4872 return getExtractVEXTRACTImmediate(N, 128);
4875 /// Return the appropriate immediate to extract the specified
4876 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4877 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4878 return getExtractVEXTRACTImmediate(N, 256);
4881 /// Return the appropriate immediate to insert at the specified
4882 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4883 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4884 return getInsertVINSERTImmediate(N, 128);
4887 /// Return the appropriate immediate to insert at the specified
4888 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4889 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4890 return getInsertVINSERTImmediate(N, 256);
4893 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4894 bool X86::isZeroNode(SDValue Elt) {
4895 return isNullConstant(Elt) || isNullFPConstant(Elt);
4898 // Build a vector of constants.
4899 // Use an UNDEF node if MaskElt == -1.
4900 // Split 64-bit constants into 32-bit halves in 32-bit mode.
4901 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4902 const SDLoc &dl, bool IsMask = false) {
4904 SmallVector<SDValue, 32> Ops;
4907 MVT ConstVecVT = VT;
4908 unsigned NumElts = VT.getVectorNumElements();
4909 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4910 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4911 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4915 MVT EltVT = ConstVecVT.getVectorElementType();
4916 for (unsigned i = 0; i < NumElts; ++i) {
4917 bool IsUndef = Values[i] < 0 && IsMask;
4918 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4919 DAG.getConstant(Values[i], dl, EltVT);
4920 Ops.push_back(OpNode);
4922 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4923 DAG.getConstant(0, dl, EltVT));
4925 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4927 ConstsNode = DAG.getBitcast(VT, ConstsNode);
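// For example, with IsMask == true the values {0, -1, 2, 3} become the
// v4i32 build_vector <0, undef, 2, 3>; when i64 is illegal, a v2i64 request
// is built as a v4i32 of 32-bit halves and bitcast back to v2i64.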
4931 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4932 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4933 assert(Bits.size() == Undefs.getBitWidth() &&
4934 "Unequal constant and undef arrays");
4935 SmallVector<SDValue, 32> Ops;
4938 MVT ConstVecVT = VT;
4939 unsigned NumElts = VT.getVectorNumElements();
4940 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4941 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4942 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4946 MVT EltVT = ConstVecVT.getVectorElementType();
4947 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4949 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4952 const APInt &V = Bits[i];
4953 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4955 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4956 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4957 } else if (EltVT == MVT::f32) {
4958 APFloat FV(APFloat::IEEEsingle(), V);
4959 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4960 } else if (EltVT == MVT::f64) {
4961 APFloat FV(APFloat::IEEEdouble(), V);
4962 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4964 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4968 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4969 return DAG.getBitcast(VT, ConstsNode);
4972 /// Returns a vector of specified type with all zero elements.
4973 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4974 SelectionDAG &DAG, const SDLoc &dl) {
4975 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4976 VT.getVectorElementType() == MVT::i1) &&
4977 "Unexpected vector type");
4979 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4980 // type. This ensures they get CSE'd. But if the integer type is not
4981 // available, use a floating-point +0.0 instead.
4983 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4984 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4985 } else if (VT.getVectorElementType() == MVT::i1) {
4986 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4987 "Unexpected vector type");
4988 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4989 "Unexpected vector type");
4990 Vec = DAG.getConstant(0, dl, VT);
4992 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4993 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
4995 return DAG.getBitcast(VT, Vec);
4998 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
4999 const SDLoc &dl, unsigned vectorWidth) {
5000 EVT VT = Vec.getValueType();
5001 EVT ElVT = VT.getVectorElementType();
5002 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5003 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5004 VT.getVectorNumElements()/Factor);
5006 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5007 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5008 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5010 // This is the index of the first element of the vectorWidth-bit chunk
5011 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5012 IdxVal &= ~(ElemsPerChunk - 1);
5014 // If the input is a buildvector just emit a smaller one.
5015 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5016 return DAG.getBuildVector(
5017 ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5019 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5020 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
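// For example, extracting IdxVal == 6 from a v8i32 with vectorWidth == 128:
// ElemsPerChunk == 4, so IdxVal is rounded down to 4 and the upper v4i32
// half is extracted.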
5023 /// Generate a DAG to grab 128 bits from a vector > 128 bits. This
5024 /// sets things up to match an AVX VEXTRACTF128 / VEXTRACTI128
5025 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5026 /// instructions or a simple subregister reference. Idx is an index in the
5027 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5028 /// lowering EXTRACT_VECTOR_ELT operations easier.
5029 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5030 SelectionDAG &DAG, const SDLoc &dl) {
5031 assert((Vec.getValueType().is256BitVector() ||
5032 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5033 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5036 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5037 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5038 SelectionDAG &DAG, const SDLoc &dl) {
5039 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5040 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5043 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5044 SelectionDAG &DAG, const SDLoc &dl,
5045 unsigned vectorWidth) {
5046 assert((vectorWidth == 128 || vectorWidth == 256) &&
5047 "Unsupported vector width");
5048 // Inserting an UNDEF subvector simply yields Result.
5051 EVT VT = Vec.getValueType();
5052 EVT ElVT = VT.getVectorElementType();
5053 EVT ResultVT = Result.getValueType();
5055 // Insert the relevant vectorWidth bits.
5056 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5057 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5059 // This is the index of the first element of the vectorWidth-bit chunk
5060 // we want. Since ElemsPerChunk is a power of 2 just need to clear bits.
5061 IdxVal &= ~(ElemsPerChunk - 1);
5063 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5064 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5067 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5068 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5069 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5070 /// simple superregister reference. Idx is an index in the 128 bits
5071 /// we want. It need not be aligned to a 128-bit boundary. That makes
5072 /// lowering INSERT_VECTOR_ELT operations easier.
5073 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5074 SelectionDAG &DAG, const SDLoc &dl) {
5075 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5076 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5079 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5080 SelectionDAG &DAG, const SDLoc &dl) {
5081 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5082 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5085 // Return true if the instruction zeroes the unused upper part of the
5086 // destination and accepts a mask.
5087 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5091 case X86ISD::PCMPEQM:
5092 case X86ISD::PCMPGTM:
5099 /// Insert an i1-subvector into an i1-vector.
5100 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5101 const X86Subtarget &Subtarget) {
5104 SDValue Vec = Op.getOperand(0);
5105 SDValue SubVec = Op.getOperand(1);
5106 SDValue Idx = Op.getOperand(2);
5108 if (!isa<ConstantSDNode>(Idx))
5111 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5112 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5115 MVT OpVT = Op.getSimpleValueType();
5116 MVT SubVecVT = SubVec.getSimpleValueType();
5117 unsigned NumElems = OpVT.getVectorNumElements();
5118 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5120 assert(IdxVal + SubVecNumElems <= NumElems &&
5121 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5122 "Unexpected index value in INSERT_SUBVECTOR");
5124 // There are 3 possible cases:
5125 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5126 // 2. Subvector should be inserted in the upper part
5127 // (IdxVal + SubVecNumElems == NumElems)
5128 // 3. Subvector should be inserted in the middle (for example v2i1
5129 // to v16i1, index 2)
5131 // If this node widens - by concatenating zeroes - the type of the result
5132 // of a node whose instruction zeroes all upper (irrelevant) bits of the
5133 // output register, mark this node as legal to enable replacing them with
5134 // the v8i1 version of the previous instruction during instruction selection.
5135 // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
5136 // while zeroing all the remaining upper 60 bits of the register. If the
5137 // result of such an instruction is inserted into an all-zeros vector, we can
5138 // safely remove the INSERT_SUBVECTOR (in instruction selection) as the cmp instr
5139 // already zeroed the rest of the register.
5140 if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5141 (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5142 (SubVec.getOpcode() == ISD::AND &&
5143 (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5144 isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5147 // Extend to a natively supported kshift.
5148 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5149 MVT WideOpVT = OpVT;
5150 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5153 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5154 SDValue Undef = DAG.getUNDEF(WideOpVT);
5155 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5156 Undef, SubVec, ZeroIdx);
5158 // Extract the sub-vector if required.
5159 auto ExtractSubVec = [&](SDValue V) {
5160 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5164 if (Vec.isUndef()) {
5166 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5167 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5170 return ExtractSubVec(WideSubVec);
5173 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5174 NumElems = WideOpVT.getVectorNumElements();
5175 unsigned ShiftLeft = NumElems - SubVecNumElems;
5176 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5177 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5178 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5179 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5180 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5181 return ExtractSubVec(Vec);
5185 // Zero lower bits of the Vec
5186 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5187 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5188 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5189 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5190 // Merge them together, SubVec should be zero extended.
5191 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5192 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5194 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5195 return ExtractSubVec(Vec);
5198 // Simple case when we put subvector in the upper part
5199 if (IdxVal + SubVecNumElems == NumElems) {
5200 // Zero upper bits of the Vec
5201 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5202 DAG.getConstant(IdxVal, dl, MVT::i8));
5203 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5204 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5205 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5206 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5207 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5208 return ExtractSubVec(Vec);
5210 // Subvector should be inserted in the middle - use a shuffle.
5211 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5213 SmallVector<int, 64> Mask;
5214 for (unsigned i = 0; i < NumElems; ++i)
5215 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5217 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5220 /// Concat two 128-bit vectors into a 256-bit vector using VINSERTF128
5221 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5222 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5223 /// large BUILD_VECTORS.
5224 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5225 unsigned NumElems, SelectionDAG &DAG,
5227 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5228 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5231 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5232 unsigned NumElems, SelectionDAG &DAG,
5234 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5235 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5238 /// Returns a vector of specified type with all bits set.
5239 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5240 /// Then bitcast to their original type, ensuring they get CSE'd.
5241 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5242 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5243 "Expected a 128/256/512-bit vector type");
5245 APInt Ones = APInt::getAllOnesValue(32);
5246 unsigned NumElts = VT.getSizeInBits() / 32;
5247 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5248 return DAG.getBitcast(VT, Vec);
5251 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5252 SelectionDAG &DAG) {
5253 EVT InVT = In.getValueType();
5254 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5256 if (VT.is128BitVector() && InVT.is128BitVector())
5257 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5258 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5260 // For 256-bit vectors, we only need the lower (128-bit) input half.
5261 // For 512-bit vectors, we only need the lower input half or quarter.
5262 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5263 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5264 In = extractSubVector(In, 0, DAG, DL,
5265 std::max(128, (int)VT.getSizeInBits() / Scale));
5268 return DAG.getNode(Opc, DL, VT, In);
5271 /// Generate unpacklo/unpackhi shuffle mask.
5272 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5274 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5275 int NumElts = VT.getVectorNumElements();
5276 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5278 for (int i = 0; i < NumElts; ++i) {
5279 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5280 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5281 Pos += (Unary ? 0 : NumElts * (i % 2));
5282 Pos += (Lo ? 0 : NumEltsInLane / 2);
5283 Mask.push_back(Pos);
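// For example, for v8i16 with Lo == true and Unary == false this produces
// the unpcklwd mask {0, 8, 1, 9, 2, 10, 3, 11}; the Hi variant produces
// {4, 12, 5, 13, 6, 14, 7, 15}.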
5287 /// Returns a vector_shuffle node for an unpackl operation.
5288 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5289 SDValue V1, SDValue V2) {
5290 SmallVector<int, 8> Mask;
5291 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5292 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5295 /// Returns a vector_shuffle node for an unpackh operation.
5296 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5297 SDValue V1, SDValue V2) {
5298 SmallVector<int, 8> Mask;
5299 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5300 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5303 /// Return a vector_shuffle of the specified vector and a zero or undef vector.
5304 /// This produces a shuffle where the low element of V2 is swizzled into the
5305 /// zero/undef vector, landing at element Idx.
5306 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5307 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5309 const X86Subtarget &Subtarget,
5310 SelectionDAG &DAG) {
5311 MVT VT = V2.getSimpleValueType();
5313 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5314 int NumElems = VT.getVectorNumElements();
5315 SmallVector<int, 16> MaskVec(NumElems);
5316 for (int i = 0; i != NumElems; ++i)
5317 // If this is the insertion idx, put the low elt of V2 here.
5318 MaskVec[i] = (i == Idx) ? NumElems : i;
5319 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5322 static SDValue peekThroughBitcasts(SDValue V) {
5323 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5324 V = V.getOperand(0);
5328 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5329 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5330 V.getOperand(0).hasOneUse())
5331 V = V.getOperand(0);
5335 static const Constant *getTargetConstantFromNode(SDValue Op) {
5336 Op = peekThroughBitcasts(Op);
5338 auto *Load = dyn_cast<LoadSDNode>(Op);
5342 SDValue Ptr = Load->getBasePtr();
5343 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5344 Ptr->getOpcode() == X86ISD::WrapperRIP)
5345 Ptr = Ptr->getOperand(0);
5347 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5348 if (!CNode || CNode->isMachineConstantPoolEntry())
5351 return dyn_cast<Constant>(CNode->getConstVal());
5354 // Extract raw constant bits from constant pools.
5355 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5357 SmallVectorImpl<APInt> &EltBits,
5358 bool AllowWholeUndefs = true,
5359 bool AllowPartialUndefs = true) {
5360 assert(EltBits.empty() && "Expected an empty EltBits vector");
5362 Op = peekThroughBitcasts(Op);
5364 EVT VT = Op.getValueType();
5365 unsigned SizeInBits = VT.getSizeInBits();
5366 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5367 unsigned NumElts = SizeInBits / EltSizeInBits;
5369 // Bitcast a source array of element bits to the target size.
5370 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5371 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5372 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5373 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5374 "Constant bit sizes don't match");
5376 // Don't split if we don't allow undef bits.
5377 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5378 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5381 // If we're already the right size, don't bother bitcasting.
5382 if (NumSrcElts == NumElts) {
5383 UndefElts = UndefSrcElts;
5384 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5388 // Extract all the undef/constant element data and pack into single bitsets.
5389 APInt UndefBits(SizeInBits, 0);
5390 APInt MaskBits(SizeInBits, 0);
5392 for (unsigned i = 0; i != NumSrcElts; ++i) {
5393 unsigned BitOffset = i * SrcEltSizeInBits;
5394 if (UndefSrcElts[i])
5395 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5396 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5399 // Split the undef/constant single bitset data into the target elements.
5400 UndefElts = APInt(NumElts, 0);
5401 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5403 for (unsigned i = 0; i != NumElts; ++i) {
5404 unsigned BitOffset = i * EltSizeInBits;
5405 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5407 // Only treat an element as UNDEF if all bits are UNDEF.
5408 if (UndefEltBits.isAllOnesValue()) {
5409 if (!AllowWholeUndefs)
5411 UndefElts.setBit(i);
5415 // If only some bits are UNDEF then treat them as zero (or bail if not
5416 // allowed by AllowPartialUndefs).
5417 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5420 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5421 EltBits[i] = Bits.getZExtValue();
5426 // Collect constant bits and insert into mask/undef bit masks.
5427 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5428 unsigned UndefBitIndex) {
5431 if (isa<UndefValue>(Cst)) {
5432 Undefs.setBit(UndefBitIndex);
5435 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5436 Mask = CInt->getValue();
5439 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5440 Mask = CFP->getValueAPF().bitcastToAPInt();
5446 // Extract constant bits from build vector.
5447 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5448 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5449 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5451 APInt UndefSrcElts(NumSrcElts, 0);
5452 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5453 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5454 const SDValue &Src = Op.getOperand(i);
5455 if (Src.isUndef()) {
5456 UndefSrcElts.setBit(i);
5459 auto *Cst = cast<ConstantSDNode>(Src);
5460 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5462 return CastBitData(UndefSrcElts, SrcEltBits);
5465 // Extract constant bits from constant pool vector.
5466 if (auto *Cst = getTargetConstantFromNode(Op)) {
5467 Type *CstTy = Cst->getType();
5468 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5471 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5472 unsigned NumSrcElts = CstTy->getVectorNumElements();
5474 APInt UndefSrcElts(NumSrcElts, 0);
5475 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5476 for (unsigned i = 0; i != NumSrcElts; ++i)
5477 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5481 return CastBitData(UndefSrcElts, SrcEltBits);
5484 // Extract constant bits from a broadcasted constant pool scalar.
5485 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5486 EltSizeInBits <= VT.getScalarSizeInBits()) {
5487 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5488 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5489 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5491 APInt UndefSrcElts(NumSrcElts, 0);
5492 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5493 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5494 if (UndefSrcElts[0])
5495 UndefSrcElts.setBits(0, NumSrcElts);
5496 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5497 return CastBitData(UndefSrcElts, SrcEltBits);
5502 // Extract a rematerialized scalar constant insertion.
5503 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5504 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5505 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5506 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5507 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5509 APInt UndefSrcElts(NumSrcElts, 0);
5510 SmallVector<APInt, 64> SrcEltBits;
5511 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5512 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5513 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5514 return CastBitData(UndefSrcElts, SrcEltBits);
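// For example, requesting 16-bit elements from a v4i32 constant splits each
// i32 into two i16 halves via CastBitData, while requesting 64-bit elements
// fuses adjacent i32 pairs into a single element.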
5520 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5521 unsigned MaskEltSizeInBits,
5522 SmallVectorImpl<uint64_t> &RawMask) {
5524 SmallVector<APInt, 64> EltBits;
5526 // Extract the raw target constant bits.
5527 // FIXME: We currently don't support UNDEF bits or mask entries.
5528 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5529 EltBits, /* AllowWholeUndefs */ false,
5530 /* AllowPartialUndefs */ false))
5533 // Insert the extracted elements into the mask.
5534 for (APInt Elt : EltBits)
5535 RawMask.push_back(Elt.getZExtValue());
5540 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5541 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5542 /// operands in \p Ops, and returns true.
5543 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5544 /// IsUnary for shuffles which use a single input multiple times, and in those
5545 /// cases it will adjust the mask to only have indices within that single input.
5546 /// It is an error to call this with non-empty Mask/Ops vectors.
5547 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5548 SmallVectorImpl<SDValue> &Ops,
5549 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5550 unsigned NumElems = VT.getVectorNumElements();
5553 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5554 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5557 bool IsFakeUnary = false;
5558 switch(N->getOpcode()) {
5559 case X86ISD::BLENDI:
5560 ImmN = N->getOperand(N->getNumOperands()-1);
5561 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5562 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5565 ImmN = N->getOperand(N->getNumOperands()-1);
5566 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5567 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5569 case X86ISD::INSERTPS:
5570 ImmN = N->getOperand(N->getNumOperands()-1);
5571 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5572 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5574 case X86ISD::EXTRQI:
5575 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5576 isa<ConstantSDNode>(N->getOperand(2))) {
5577 int BitLen = N->getConstantOperandVal(1);
5578 int BitIdx = N->getConstantOperandVal(2);
5579 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5583 case X86ISD::INSERTQI:
5584 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5585 isa<ConstantSDNode>(N->getOperand(3))) {
5586 int BitLen = N->getConstantOperandVal(2);
5587 int BitIdx = N->getConstantOperandVal(3);
5588 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5589 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5592 case X86ISD::UNPCKH:
5593 DecodeUNPCKHMask(VT, Mask);
5594 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5596 case X86ISD::UNPCKL:
5597 DecodeUNPCKLMask(VT, Mask);
5598 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5600 case X86ISD::MOVHLPS:
5601 DecodeMOVHLPSMask(NumElems, Mask);
5602 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5604 case X86ISD::MOVLHPS:
5605 DecodeMOVLHPSMask(NumElems, Mask);
5606 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5608 case X86ISD::PALIGNR:
5609 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5610 ImmN = N->getOperand(N->getNumOperands()-1);
5611 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5612 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5613 Ops.push_back(N->getOperand(1));
5614 Ops.push_back(N->getOperand(0));
5616 case X86ISD::VSHLDQ:
5617 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5618 ImmN = N->getOperand(N->getNumOperands() - 1);
5619 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5622 case X86ISD::VSRLDQ:
5623 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5624 ImmN = N->getOperand(N->getNumOperands() - 1);
5625 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5628 case X86ISD::PSHUFD:
5629 case X86ISD::VPERMILPI:
5630 ImmN = N->getOperand(N->getNumOperands()-1);
5631 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5634 case X86ISD::PSHUFHW:
5635 ImmN = N->getOperand(N->getNumOperands()-1);
5636 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5639 case X86ISD::PSHUFLW:
5640 ImmN = N->getOperand(N->getNumOperands()-1);
5641 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5644 case X86ISD::VZEXT_MOVL:
5645 DecodeZeroMoveLowMask(VT, Mask);
5648 case X86ISD::VBROADCAST: {
5649 SDValue N0 = N->getOperand(0);
5650 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5651 // add the pre-extracted value to the Ops vector.
5652 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5653 N0.getOperand(0).getValueType() == VT &&
5654 N0.getConstantOperandVal(1) == 0)
5655 Ops.push_back(N0.getOperand(0));
5657 // We only decode broadcasts of same-sized vectors, unless the broadcast
5658 // came from an extract from the original width. If we found one, we
5659 // pushed it onto the Ops vector above.
5660 if (N0.getValueType() == VT || !Ops.empty()) {
5661 DecodeVectorBroadcast(VT, Mask);
5667 case X86ISD::VPERMILPV: {
5669 SDValue MaskNode = N->getOperand(1);
5670 unsigned MaskEltSize = VT.getScalarSizeInBits();
5671 SmallVector<uint64_t, 32> RawMask;
5672 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5673 DecodeVPERMILPMask(VT, RawMask, Mask);
5676 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5677 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5682 case X86ISD::PSHUFB: {
5684 SDValue MaskNode = N->getOperand(1);
5685 SmallVector<uint64_t, 32> RawMask;
5686 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5687 DecodePSHUFBMask(RawMask, Mask);
5690 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5691 DecodePSHUFBMask(C, Mask);
5696 case X86ISD::VPERMI:
5697 ImmN = N->getOperand(N->getNumOperands()-1);
5698 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5703 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5705 case X86ISD::VPERM2X128:
5706 ImmN = N->getOperand(N->getNumOperands()-1);
5707 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5708 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5710 case X86ISD::MOVSLDUP:
5711 DecodeMOVSLDUPMask(VT, Mask);
5714 case X86ISD::MOVSHDUP:
5715 DecodeMOVSHDUPMask(VT, Mask);
5718 case X86ISD::MOVDDUP:
5719 DecodeMOVDDUPMask(VT, Mask);
5722 case X86ISD::MOVLHPD:
5723 case X86ISD::MOVLPD:
5724 case X86ISD::MOVLPS:
5725 // Not yet implemented
5727 case X86ISD::VPERMIL2: {
5728 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5729 unsigned MaskEltSize = VT.getScalarSizeInBits();
5730 SDValue MaskNode = N->getOperand(2);
5731 SDValue CtrlNode = N->getOperand(3);
5732 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5733 unsigned CtrlImm = CtrlOp->getZExtValue();
5734 SmallVector<uint64_t, 32> RawMask;
5735 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5736 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5739 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5740 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5746 case X86ISD::VPPERM: {
5747 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5748 SDValue MaskNode = N->getOperand(2);
5749 SmallVector<uint64_t, 32> RawMask;
5750 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5751 DecodeVPPERMMask(RawMask, Mask);
5754 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5755 DecodeVPPERMMask(C, Mask);
5760 case X86ISD::VPERMV: {
5762 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5763 Ops.push_back(N->getOperand(1));
5764 SDValue MaskNode = N->getOperand(0);
5765 SmallVector<uint64_t, 32> RawMask;
5766 unsigned MaskEltSize = VT.getScalarSizeInBits();
5767 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5768 DecodeVPERMVMask(RawMask, Mask);
5771 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5772 DecodeVPERMVMask(C, MaskEltSize, Mask);
5777 case X86ISD::VPERMV3: {
5778 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5779 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5780 Ops.push_back(N->getOperand(0));
5781 Ops.push_back(N->getOperand(2));
5782 SDValue MaskNode = N->getOperand(1);
5783 unsigned MaskEltSize = VT.getScalarSizeInBits();
5784 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5785 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5790 case X86ISD::VPERMIV3: {
5791 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5792 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5793 Ops.push_back(N->getOperand(1));
5794 Ops.push_back(N->getOperand(2));
5795 SDValue MaskNode = N->getOperand(0);
5796 unsigned MaskEltSize = VT.getScalarSizeInBits();
5797 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5798 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5803 default: llvm_unreachable("unknown target shuffle node");
5806 // Empty mask indicates the decode failed.
5810 // Check if we're getting a shuffle mask with zero'd elements.
5811 if (!AllowSentinelZero)
5812 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5815 // If we have a fake unary shuffle, the shuffle mask is spread across two
5816 // inputs that are actually the same node. Re-map the mask to always point
5817 // into the first input.
5820 if (M >= (int)Mask.size())
5823 // If we didn't already add operands in the opcode-specific code, default to
5824 // adding 1 or 2 operands starting at 0.
5826 Ops.push_back(N->getOperand(0));
5827 if (!IsUnary || IsFakeUnary)
5828 Ops.push_back(N->getOperand(1));
5834 /// Check a target shuffle mask's inputs to see if we can set any values to
5835 /// SM_SentinelZero - this is for elements that are known to be zero
5836 /// (not just zeroable) from their inputs.
5837 /// Returns true if the target shuffle mask was decoded.
5838 static bool setTargetShuffleZeroElements(SDValue N,
5839 SmallVectorImpl<int> &Mask,
5840 SmallVectorImpl<SDValue> &Ops) {
5842 if (!isTargetShuffle(N.getOpcode()))
5845 MVT VT = N.getSimpleValueType();
5846 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5849 SDValue V1 = Ops[0];
5850 SDValue V2 = IsUnary ? V1 : Ops[1];
5852 V1 = peekThroughBitcasts(V1);
5853 V2 = peekThroughBitcasts(V2);
5855 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5856 "Illegal split of shuffle value type");
5857 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5859 // Extract known constant input data.
5860 APInt UndefSrcElts[2];
5861 SmallVector<APInt, 32> SrcEltBits[2];
5862 bool IsSrcConstant[2] = {
5863 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5864 SrcEltBits[0], true, false),
5865 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5866 SrcEltBits[1], true, false)};
5868 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5871 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5875 // Determine shuffle input and normalize the mask.
5876 unsigned SrcIdx = M / Size;
5877 SDValue V = M < Size ? V1 : V2;
5880 // We are referencing an UNDEF input.
5882 Mask[i] = SM_SentinelUndef;
5886 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5887 // TODO: We currently only set UNDEF for integer types - floats use the same
5888 // registers as vectors and many of the scalar folded loads rely on the
5889 // SCALAR_TO_VECTOR pattern.
5890 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5891 (Size % V.getValueType().getVectorNumElements()) == 0) {
5892 int Scale = Size / V.getValueType().getVectorNumElements();
5893 int Idx = M / Scale;
5894 if (Idx != 0 && !VT.isFloatingPoint())
5895 Mask[i] = SM_SentinelUndef;
5896 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5897 Mask[i] = SM_SentinelZero;
5901 // Attempt to extract from the source's constant bits.
5902 if (IsSrcConstant[SrcIdx]) {
5903 if (UndefSrcElts[SrcIdx][M])
5904 Mask[i] = SM_SentinelUndef;
5905 else if (SrcEltBits[SrcIdx][M] == 0)
5906 Mask[i] = SM_SentinelZero;
5910 assert(VT.getVectorNumElements() == Mask.size() &&
5911 "Different mask size from vector size!");
5915 // Attempt to decode ops that could be represented as a shuffle mask.
5916 // The decoded shuffle mask may contain a different number of elements than the
5917 // destination value type.
5918 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5919 SmallVectorImpl<SDValue> &Ops,
5920 SelectionDAG &DAG) {
5924 MVT VT = N.getSimpleValueType();
5925 unsigned NumElts = VT.getVectorNumElements();
5926 unsigned NumSizeInBits = VT.getSizeInBits();
5927 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5928 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5929 "Expected byte aligned value types");
5931 unsigned Opcode = N.getOpcode();
5934 case X86ISD::ANDNP: {
5935 // Attempt to decode as a per-byte mask.
5937 SmallVector<APInt, 32> EltBits;
5938 SDValue N0 = N.getOperand(0);
5939 SDValue N1 = N.getOperand(1);
5940 bool IsAndN = (X86ISD::ANDNP == Opcode);
5941 uint64_t ZeroMask = IsAndN ? 255 : 0;
5942 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5944 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5946 Mask.push_back(SM_SentinelUndef);
5949 uint64_t ByteBits = EltBits[i].getZExtValue();
5950 if (ByteBits != 0 && ByteBits != 255)
5952 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5954 Ops.push_back(IsAndN ? N1 : N0);
5957 case ISD::SCALAR_TO_VECTOR: {
5958 // Match against a scalar_to_vector of an extract from a vector;
5959 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5960 SDValue N0 = N.getOperand(0);
5963 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5964 N0.getOperand(0).getValueType() == VT) {
5966 } else if (N0.getOpcode() == ISD::AssertZext &&
5967 N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
5968 cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
5969 SrcExtract = N0.getOperand(0);
5970 assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5971 } else if (N0.getOpcode() == ISD::AssertZext &&
5972 N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
5973 cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
5974 SrcExtract = N0.getOperand(0);
5975 assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5978 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5981 SDValue SrcVec = SrcExtract.getOperand(0);
5982 EVT SrcVT = SrcVec.getValueType();
5983 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5984 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5986 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5987 if (NumSrcElts <= SrcIdx)
5990 Ops.push_back(SrcVec);
5991 Mask.push_back(SrcIdx);
5992 Mask.append(NumZeros, SM_SentinelZero);
5993 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
5996 case X86ISD::PINSRB:
5997 case X86ISD::PINSRW: {
5998 SDValue InVec = N.getOperand(0);
5999 SDValue InScl = N.getOperand(1);
6000 uint64_t InIdx = N.getConstantOperandVal(2);
6001 assert(InIdx < NumElts && "Illegal insertion index");
6003 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6004 if (X86::isZeroNode(InScl)) {
6005 Ops.push_back(InVec);
6006 for (unsigned i = 0; i != NumElts; ++i)
6007 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6011 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
6012 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6014 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6015 if (InScl.getOpcode() != ISD::AssertZext ||
6016 InScl.getOperand(0).getOpcode() != ExOp)
6019 SDValue ExVec = InScl.getOperand(0).getOperand(0);
6020 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
6021 assert(ExIdx < NumElts && "Illegal extraction index");
6022 Ops.push_back(InVec);
6023 Ops.push_back(ExVec);
6024 for (unsigned i = 0; i != NumElts; ++i)
6025 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
6028 case X86ISD::PACKSS: {
6029 // If we know input saturation won't happen, we can treat this
6030 // as a truncation shuffle.
6031 if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
6032 DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
6035 Ops.push_back(N.getOperand(0));
6036 Ops.push_back(N.getOperand(1));
6037 for (unsigned i = 0; i != NumElts; ++i)
6038 Mask.push_back(i * 2);
6042 case X86ISD::VSRLI: {
6043 uint64_t ShiftVal = N.getConstantOperandVal(1);
6044 // Out of range bit shifts are guaranteed to be zero.
6045 if (NumBitsPerElt <= ShiftVal) {
6046 Mask.append(NumElts, SM_SentinelZero);
6050 // We can only decode 'whole byte' bit shifts as shuffles.
6051 if ((ShiftVal % 8) != 0)
6054 uint64_t ByteShift = ShiftVal / 8;
6055 unsigned NumBytes = NumSizeInBits / 8;
6056 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6057 Ops.push_back(N.getOperand(0));
6059 // Clear mask to all zeros and insert the shifted byte indices.
6060 Mask.append(NumBytes, SM_SentinelZero);
6062 if (X86ISD::VSHLI == Opcode) {
6063 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6064 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6065 Mask[i + j] = i + j - ByteShift;
6067 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6068 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6069 Mask[i + j - ByteShift] = i + j;
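// For example, a v2i64 VSHLI by 8 bits (ByteShift == 1) yields the byte
// shuffle mask {Z, 0, 1, 2, 3, 4, 5, 6, Z, 8, 9, 10, 11, 12, 13, 14},
// where Z is SM_SentinelZero.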
6073 case ISD::ZERO_EXTEND_VECTOR_INREG:
6074 case X86ISD::VZEXT: {
6075 // TODO - add support for VPMOVZX with smaller input vector types.
6076 SDValue Src = N.getOperand(0);
6077 MVT SrcVT = Src.getSimpleValueType();
6078 if (NumSizeInBits != SrcVT.getSizeInBits())
6080 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6089 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
6090 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6091 SmallVectorImpl<int> &Mask) {
6092 int MaskWidth = Mask.size();
6093 SmallVector<SDValue, 16> UsedInputs;
6094 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6095 int lo = UsedInputs.size() * MaskWidth;
6096 int hi = lo + MaskWidth;
6097 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6098 UsedInputs.push_back(Inputs[i]);
6105 Inputs = UsedInputs;
6108 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6109 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6110 /// remaining input indices in case we now have a unary shuffle, and adjusts
6111 /// the inputs accordingly.
6112 /// Returns true if the target shuffle mask was decoded.
6113 static bool resolveTargetShuffleInputs(SDValue Op,
6114 SmallVectorImpl<SDValue> &Inputs,
6115 SmallVectorImpl<int> &Mask,
6116 SelectionDAG &DAG) {
6117 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6118 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6121 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6125 /// Returns the scalar element that will make up the ith
6126 /// element of the result of the vector shuffle.
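/// For example (illustrative): requesting element 2 of
///   (vector_shuffle<4,1,6,3> A, B)
/// recurses into B, since mask element 6 selects element 6 - 4 = 2 of the
/// second source.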
6127 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6128                                    unsigned Depth) {
6129   if (Depth == 6)
6130     return SDValue();  // Limit search depth.
6132 SDValue V = SDValue(N, 0);
6133 EVT VT = V.getValueType();
6134 unsigned Opcode = V.getOpcode();
6136 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6137 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6138 int Elt = SV->getMaskElt(Index);
6141 return DAG.getUNDEF(VT.getVectorElementType());
6143 unsigned NumElems = VT.getVectorNumElements();
6144 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6145 : SV->getOperand(1);
6146 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6149 // Recurse into target specific vector shuffles to find scalars.
6150 if (isTargetShuffle(Opcode)) {
6151 MVT ShufVT = V.getSimpleValueType();
6152 MVT ShufSVT = ShufVT.getVectorElementType();
6153 int NumElems = (int)ShufVT.getVectorNumElements();
6154 SmallVector<int, 16> ShuffleMask;
6155 SmallVector<SDValue, 16> ShuffleOps;
6158 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6161 int Elt = ShuffleMask[Index];
6162 if (Elt == SM_SentinelZero)
6163 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6164 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6165 if (Elt == SM_SentinelUndef)
6166 return DAG.getUNDEF(ShufSVT);
6168 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6169 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6170 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6174 // Actual nodes that may contain scalar elements
6175 if (Opcode == ISD::BITCAST) {
6176 V = V.getOperand(0);
6177 EVT SrcVT = V.getValueType();
6178 unsigned NumElems = VT.getVectorNumElements();
6180 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6184 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6185 return (Index == 0) ? V.getOperand(0)
6186 : DAG.getUNDEF(VT.getVectorElementType());
6188 if (V.getOpcode() == ISD::BUILD_VECTOR)
6189 return V.getOperand(Index);
6194 /// Custom lower build_vector of v16i8.
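/// Roughly (illustrative): with SSE4.1, each non-zero byte is inserted
/// directly with PINSRB; without it, adjacent byte pairs are merged into
/// i16 values (low byte | high byte << 8) and inserted with PINSRW.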
6195 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6196 unsigned NumNonZero, unsigned NumZero,
6198 const X86Subtarget &Subtarget) {
6199 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6206 // SSE4.1 - use PINSRB to insert each byte directly.
6207 if (Subtarget.hasSSE41()) {
6208 for (unsigned i = 0; i < 16; ++i) {
6209 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6211       // If the build vector contains zeros or our first insertion is not the
6212       // first index, then insert into a zero vector to break any register
6213       // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6216 if (NumZero || 0 != i)
6217 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6219 assert(0 == i && "Expected insertion into zero-index");
6220 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6221 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6222 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6223 V = DAG.getBitcast(MVT::v16i8, V);
6227 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6228 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6235 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
6236 for (unsigned i = 0; i < 16; ++i) {
6237 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6238 if (ThisIsNonZero && First) {
6240 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6242 V = DAG.getUNDEF(MVT::v8i16);
6247 // FIXME: Investigate extending to i32 instead of just i16.
6248 // FIXME: Investigate combining the first 4 bytes as a i32 instead.
6249 SDValue ThisElt, LastElt;
6250 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6251 if (LastIsNonZero) {
6253 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6255 if (ThisIsNonZero) {
6256 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6257 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6258 DAG.getConstant(8, dl, MVT::i8));
6260 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6266 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6267 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6268 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6269 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6270 V = DAG.getBitcast(MVT::v8i16, V);
6272 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6273 DAG.getIntPtrConstant(i / 2, dl));
6279 return DAG.getBitcast(MVT::v16i8, V);
6282 /// Custom lower build_vector of v8i16.
6283 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6284 unsigned NumNonZero, unsigned NumZero,
6286 const X86Subtarget &Subtarget) {
6287 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6293 for (unsigned i = 0; i < 8; ++i) {
6294 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6296     // If the build vector contains zeros or our first insertion is not the
6297     // first index, then insert into a zero vector to break any register
6298     // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6301 if (NumZero || 0 != i)
6302 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6304 assert(0 == i && "Expected insertion into zero-index");
6305 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6306 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6307 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6308 V = DAG.getBitcast(MVT::v8i16, V);
6312 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6313 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6320 /// Custom lower build_vector of v4i32 or v4f32.
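/// For example (illustrative): the build_vector
///   <(extractelt V, 0), zero, (extractelt V, 2), zero>
/// becomes the blend-with-zero shuffle
///   (vector_shuffle V, zero, <0, 5, 2, 7>).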
6321 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6322 const X86Subtarget &Subtarget) {
6323 // Find all zeroable elements.
6324 std::bitset<4> Zeroable;
6325 for (int i=0; i < 4; ++i) {
6326 SDValue Elt = Op->getOperand(i);
6327 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6329 assert(Zeroable.size() - Zeroable.count() > 1 &&
6330 "We expect at least two non-zero elements!");
6332 // We only know how to deal with build_vector nodes where elements are either
6333 // zeroable or extract_vector_elt with constant index.
6334 SDValue FirstNonZero;
6335 unsigned FirstNonZeroIdx;
6336 for (unsigned i=0; i < 4; ++i) {
6339 SDValue Elt = Op->getOperand(i);
6340 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6341 !isa<ConstantSDNode>(Elt.getOperand(1)))
6343 // Make sure that this node is extracting from a 128-bit vector.
6344 MVT VT = Elt.getOperand(0).getSimpleValueType();
6345 if (!VT.is128BitVector())
6347 if (!FirstNonZero.getNode()) {
6349 FirstNonZeroIdx = i;
6353 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6354 SDValue V1 = FirstNonZero.getOperand(0);
6355 MVT VT = V1.getSimpleValueType();
6357 // See if this build_vector can be lowered as a blend with zero.
6358   SDValue Elt;
6359   unsigned EltMaskIdx, EltIdx;
6360   int Mask[4];
6361 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6362 if (Zeroable[EltIdx]) {
6363 // The zero vector will be on the right hand side.
6364 Mask[EltIdx] = EltIdx+4;
6368 Elt = Op->getOperand(EltIdx);
6369 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6370 EltMaskIdx = Elt.getConstantOperandVal(1);
6371 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6373 Mask[EltIdx] = EltIdx;
6377 // Let the shuffle legalizer deal with blend operations.
6378 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6379 if (V1.getSimpleValueType() != VT)
6380 V1 = DAG.getBitcast(VT, V1);
6381 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6384 // See if we can lower this build_vector to a INSERTPS.
6385 if (!Subtarget.hasSSE41())
6388 SDValue V2 = Elt.getOperand(0);
6389 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6392 bool CanFold = true;
6393 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6397 SDValue Current = Op->getOperand(i);
6398 SDValue SrcVector = Current->getOperand(0);
6401 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6407 assert(V1.getNode() && "Expected at least two non-zero elements!");
6408 if (V1.getSimpleValueType() != MVT::v4f32)
6409 V1 = DAG.getBitcast(MVT::v4f32, V1);
6410 if (V2.getSimpleValueType() != MVT::v4f32)
6411 V2 = DAG.getBitcast(MVT::v4f32, V2);
6413 // Ok, we can emit an INSERTPS instruction.
6414 unsigned ZMask = Zeroable.to_ulong();
6416 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6417 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6419 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6420 DAG.getIntPtrConstant(InsertPSMask, DL));
6421 return DAG.getBitcast(VT, Result);
6424 /// Return a vector logical shift node.
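/// The shift is performed on the v16i8 byte representation; e.g.
/// (illustrative) a 64-bit left shift of a v2i64 value becomes
/// (VSHLDQ (bitcast to v16i8), 8), i.e. a whole-vector shift by 8 bytes.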
6425 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6426 SelectionDAG &DAG, const TargetLowering &TLI,
6428 assert(VT.is128BitVector() && "Unknown type for VShift");
6429 MVT ShVT = MVT::v16i8;
6430 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6431 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6432 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6433 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6434 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6435 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6438 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6439 SelectionDAG &DAG) {
6441 // Check if the scalar load can be widened into a vector load. And if
6442   // the address is "base + cst", see if the cst can be "absorbed" into
6443 // the shuffle mask.
6444 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6445 SDValue Ptr = LD->getBasePtr();
6446 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6448 EVT PVT = LD->getValueType(0);
6449 if (PVT != MVT::i32 && PVT != MVT::f32)
6454 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6455 FI = FINode->getIndex();
6457 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6458 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6459 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6460 Offset = Ptr.getConstantOperandVal(1);
6461 Ptr = Ptr.getOperand(0);
6466     // FIXME: 256-bit vector instructions don't require strict alignment;
6467     // improve this code to support it better.
6468 unsigned RequiredAlign = VT.getSizeInBits()/8;
6469 SDValue Chain = LD->getChain();
6470 // Make sure the stack object alignment is at least 16 or 32.
6471 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6472 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6473 if (MFI.isFixedObjectIndex(FI)) {
6474       // Can't change the alignment. FIXME: It's possible to compute
6475       // the exact stack offset and reference FI + adjust offset instead,
6476       // if someone *really* cares about this; that's the way to implement it.
6479 MFI.setObjectAlignment(FI, RequiredAlign);
6483     // (Offset % 16 or 32) must be a multiple of 4. The address is then
6484     // Ptr + (Offset & ~15).
6487 if ((Offset % RequiredAlign) & 3)
6489 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6492 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6493 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6496 int EltNo = (Offset - StartOffset) >> 2;
6497 unsigned NumElems = VT.getVectorNumElements();
6499 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6500 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6501 LD->getPointerInfo().getWithOffset(StartOffset));
6503 SmallVector<int, 8> Mask(NumElems, EltNo);
6505 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6511 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6512 /// elements can be replaced by a single large load which has the same value as
6513 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6515 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
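/// A ZERO element between consecutive loads forces the shuffle path instead;
/// e.g. (illustrative) <load i32 *a, zero, load i32 *a+8, load i32 *a+12>
/// becomes a full-width load followed by a shuffle that re-zeros element 1.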
6516 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6517 const SDLoc &DL, SelectionDAG &DAG,
6518 const X86Subtarget &Subtarget,
6519 bool isAfterLegalize) {
6520 unsigned NumElems = Elts.size();
6522 int LastLoadedElt = -1;
6523 SmallBitVector LoadMask(NumElems, false);
6524 SmallBitVector ZeroMask(NumElems, false);
6525 SmallBitVector UndefMask(NumElems, false);
6527 // For each element in the initializer, see if we've found a load, zero or an
6529 for (unsigned i = 0; i < NumElems; ++i) {
6530 SDValue Elt = peekThroughBitcasts(Elts[i]);
6535 UndefMask[i] = true;
6536 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6538 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6541 // Each loaded element must be the correct fractional portion of the
6542 // requested vector load.
6543 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6548 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6549 "Incomplete element masks");
6551 // Handle Special Cases - all undef or undef/zero.
6552 if (UndefMask.count() == NumElems)
6553 return DAG.getUNDEF(VT);
6555 // FIXME: Should we return this as a BUILD_VECTOR instead?
6556 if ((ZeroMask | UndefMask).count() == NumElems)
6557 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6558 : DAG.getConstantFP(0.0, DL, VT);
6560 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6561 int FirstLoadedElt = LoadMask.find_first();
6562 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6563 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6564 EVT LDBaseVT = EltBase.getValueType();
6566   // Consecutive loads can contain UNDEFs but not ZERO elements.
6567   // Consecutive loads with UNDEF and ZERO elements require
6568   // an additional shuffle stage to clear the ZERO elements.
6569 bool IsConsecutiveLoad = true;
6570 bool IsConsecutiveLoadWithZeros = true;
6571 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6573 SDValue Elt = peekThroughBitcasts(Elts[i]);
6574 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6575 if (!DAG.areNonVolatileConsecutiveLoads(
6576 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6577 i - FirstLoadedElt)) {
6578 IsConsecutiveLoad = false;
6579 IsConsecutiveLoadWithZeros = false;
6582 } else if (ZeroMask[i]) {
6583 IsConsecutiveLoad = false;
6587 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6588 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6589 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6590 "Cannot merge volatile loads.");
6592 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6593 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6594 DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6598 // LOAD - all consecutive load/undefs (must start/end with a load).
6599 // If we have found an entire vector of loads and undefs, then return a large
6600 // load of the entire vector width starting at the base pointer.
6601 // If the vector contains zeros, then attempt to shuffle those elements.
6602 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6603 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6604 assert(LDBase && "Did not find base load for merging consecutive loads");
6605 EVT EltVT = LDBase->getValueType(0);
6606 // Ensure that the input vector size for the merged loads matches the
6607 // cumulative size of the input elements.
6608 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6611 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6614 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6615 // will lower to regular temporal loads and use the cache.
6616 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6617 VT.is256BitVector() && !Subtarget.hasInt256())
6620 if (IsConsecutiveLoad)
6621 return CreateLoad(VT, LDBase);
6623 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6624 // vector and a zero vector to clear out the zero elements.
6625 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6626 SmallVector<int, 4> ClearMask(NumElems, -1);
6627 for (unsigned i = 0; i < NumElems; ++i) {
6629 ClearMask[i] = i + NumElems;
6630 else if (LoadMask[i])
6633 SDValue V = CreateLoad(VT, LDBase);
6634 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6635 : DAG.getConstantFP(0.0, DL, VT);
6636 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6640   unsigned LoadSize =
6641       (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6643 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6644 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6645 (LoadSize == 32 || LoadSize == 64) &&
6646 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6647 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6648 : MVT::getIntegerVT(LoadSize);
6649 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6650 if (TLI.isTypeLegal(VecVT)) {
6651 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6652 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6654 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6655 LDBase->getPointerInfo(),
6656 LDBase->getAlignment(),
6657 false/*isVolatile*/, true/*ReadMem*/,
6659 DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6660 return DAG.getBitcast(VT, ResNode);
6667 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6668 unsigned SplatBitSize, LLVMContext &C) {
6669 unsigned ScalarSize = VT.getScalarSizeInBits();
6670 unsigned NumElm = SplatBitSize / ScalarSize;
6672 SmallVector<Constant *, 32> ConstantVec;
6673 for (unsigned i = 0; i < NumElm; i++) {
6674 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6676 if (VT.isFloatingPoint()) {
6677 if (ScalarSize == 32) {
6678 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6680 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6681 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6684 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6685 ConstantVec.push_back(Const);
6687 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6690 static bool isUseOfShuffle(SDNode *N) {
6691 for (auto *U : N->uses()) {
6692 if (isTargetShuffle(U->getOpcode()))
6694 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6695 return isUseOfShuffle(U);
6700 /// Attempt to use the vbroadcast instruction to generate a splat value
6701 /// from a splat BUILD_VECTOR which uses:
6702 /// a. A single scalar load, or a constant.
6703 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6705 /// The VBROADCAST node is returned when a pattern is found,
6706 /// or SDValue() otherwise.
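/// For example (illustrative): on AVX2, a v8i32 build_vector splatting the
/// constant 42 becomes (VBROADCAST (load <constant pool>)), trading 32 bytes
/// of constant pool data for a 4-byte scalar plus one broadcast.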
6707 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6708 const X86Subtarget &Subtarget,
6709 SelectionDAG &DAG) {
6710 // VBROADCAST requires AVX.
6711 // TODO: Splats could be generated for non-AVX CPUs using SSE
6712 // instructions, but there's less potential gain for only 128-bit vectors.
6713 if (!Subtarget.hasAVX())
6716 MVT VT = BVOp->getSimpleValueType(0);
6719 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6720 "Unsupported vector type for broadcast.");
6722 BitVector UndefElements;
6723 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6725 // We need a splat of a single value to use broadcast, and it doesn't
6726 // make any sense if the value is only in one element of the vector.
6727 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6728 APInt SplatValue, Undef;
6729 unsigned SplatBitSize;
6731 // Check if this is a repeated constant pattern suitable for broadcasting.
6732 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6733 SplatBitSize > VT.getScalarSizeInBits() &&
6734 SplatBitSize < VT.getSizeInBits()) {
6735 // Avoid replacing with broadcast when it's a use of a shuffle
6736 // instruction to preserve the present custom lowering of shuffles.
6737 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6739       // Replace BUILD_VECTOR with a broadcast of the repeated constants.
6740 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6741 LLVMContext *Ctx = DAG.getContext();
6742 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6743 if (Subtarget.hasAVX()) {
6744 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6745 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6746 // Splatted value can fit in one INTEGER constant in constant pool.
6747 // Load the constant and broadcast it.
6748 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6749 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6750 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6751 SDValue CP = DAG.getConstantPool(C, PVT);
6752 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6754 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6756 CVT, dl, DAG.getEntryNode(), CP,
6757 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6759 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6760 MVT::getVectorVT(CVT, Repeat), Ld);
6761 return DAG.getBitcast(VT, Brdcst);
6762 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6763 // Splatted value can fit in one FLOAT constant in constant pool.
6764 // Load the constant and broadcast it.
6765           // AVX has support for 32- and 64-bit broadcasts for floats only.
6766           // There is no 64-bit integer broadcast on a 32-bit subtarget.
6767 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6768 // Lower the splat via APFloat directly, to avoid any conversion.
6771 ? ConstantFP::get(*Ctx,
6772 APFloat(APFloat::IEEEsingle(), SplatValue))
6773 : ConstantFP::get(*Ctx,
6774 APFloat(APFloat::IEEEdouble(), SplatValue));
6775 SDValue CP = DAG.getConstantPool(C, PVT);
6776 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6778 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6780 CVT, dl, DAG.getEntryNode(), CP,
6781 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6783 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6784 MVT::getVectorVT(CVT, Repeat), Ld);
6785 return DAG.getBitcast(VT, Brdcst);
6786 } else if (SplatBitSize > 64) {
6787 // Load the vector of constants and broadcast it.
6788 MVT CVT = VT.getScalarType();
6789 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6791 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6792 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6793 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6795 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6796 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6798 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6799 return DAG.getBitcast(VT, Brdcst);
6806 bool ConstSplatVal =
6807 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6809 // Make sure that all of the users of a non-constant load are from the
6810 // BUILD_VECTOR node.
6811 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6814 unsigned ScalarSize = Ld.getValueSizeInBits();
6815 bool IsGE256 = (VT.getSizeInBits() >= 256);
6817 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6818 // instruction to save 8 or more bytes of constant pool data.
6819 // TODO: If multiple splats are generated to load the same constant,
6820 // it may be detrimental to overall size. There needs to be a way to detect
6821 // that condition to know if this is truly a size win.
6822 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6824 // Handle broadcasting a single constant scalar from the constant pool
6826 // On Sandybridge (no AVX2), it is still better to load a constant vector
6827 // from the constant pool and not to broadcast it from a scalar.
6828 // But override that restriction when optimizing for size.
6829 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6830 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6831 EVT CVT = Ld.getValueType();
6832 assert(!CVT.isVector() && "Must not broadcast a vector type");
6834 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6835 // For size optimization, also splat v2f64 and v2i64, and for size opt
6836 // with AVX2, also splat i8 and i16.
6837 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6838 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6839 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6840 const Constant *C = nullptr;
6841 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6842 C = CI->getConstantIntValue();
6843 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6844 C = CF->getConstantFPValue();
6846 assert(C && "Invalid constant type");
6848 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6850 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6851 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6853 CVT, dl, DAG.getEntryNode(), CP,
6854 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6857 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6861 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6863 // Handle AVX2 in-register broadcasts.
6864 if (!IsLoad && Subtarget.hasInt256() &&
6865 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6866 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6868 // The scalar source must be a normal load.
6872 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6873 (Subtarget.hasVLX() && ScalarSize == 64))
6874 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6876   // The integer check is needed for the 64-bit into 128-bit case so it doesn't
6877   // match double, since there is no vbroadcastsd for xmm registers.
6878 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6879 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6880 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6883 // Unsupported broadcast.
6887 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6888 /// underlying vector and index.
6890 /// Modifies \p ExtractedFromVec to the real vector and returns the real
6891 /// index.
6892 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6894 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6895 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6898   // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
6899   // lowered this:
6900   //   (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6901   // to:
6902   //   (extract_vector_elt (vector_shuffle<2,u,u,u>
6903   //                            (extract_subvector (v8f32 %vreg0), Constant<4>),
6904   //                            undef),
6905   //                       Constant<2>)
6906 // In this case the vector is the extract_subvector expression and the index
6907 // is 2, as specified by the shuffle.
6908 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6909 SDValue ShuffleVec = SVOp->getOperand(0);
6910 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6911 assert(ShuffleVecVT.getVectorElementType() ==
6912 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6914 int ShuffleIdx = SVOp->getMaskElt(Idx);
6915 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6916 ExtractedFromVec = ShuffleVec;
6922 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6923 MVT VT = Op.getSimpleValueType();
6925 // Skip if insert_vec_elt is not supported.
6926 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6927 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6931 unsigned NumElems = Op.getNumOperands();
6935 SmallVector<unsigned, 4> InsertIndices;
6936 SmallVector<int, 8> Mask(NumElems, -1);
6938 for (unsigned i = 0; i != NumElems; ++i) {
6939 unsigned Opc = Op.getOperand(i).getOpcode();
6941 if (Opc == ISD::UNDEF)
6944 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6945       // Quit if more than 1 element needs inserting.
6946 if (InsertIndices.size() > 1)
6949 InsertIndices.push_back(i);
6953 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6954 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6956 // Quit if non-constant index.
6957 if (!isa<ConstantSDNode>(ExtIdx))
6959 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6961 // Quit if extracted from vector of different type.
6962 if (ExtractedFromVec.getValueType() != VT)
6965 if (!VecIn1.getNode())
6966 VecIn1 = ExtractedFromVec;
6967 else if (VecIn1 != ExtractedFromVec) {
6968 if (!VecIn2.getNode())
6969 VecIn2 = ExtractedFromVec;
6970 else if (VecIn2 != ExtractedFromVec)
6971 // Quit if more than 2 vectors to shuffle
6975 if (ExtractedFromVec == VecIn1)
6977 else if (ExtractedFromVec == VecIn2)
6978 Mask[i] = Idx + NumElems;
6981 if (!VecIn1.getNode())
6984 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6985 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6987 for (unsigned Idx : InsertIndices)
6988 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6989 DAG.getIntPtrConstant(Idx, DL));
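/// Pack the constant i1 elements of a build_vector into an integer
/// immediate, with element i becoming bit i; e.g. (illustrative)
/// <i1 1, i1 0, i1 1, i1 1> packs to the immediate 0b1101.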
6994 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
6995 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
6996 Op.getScalarValueSizeInBits() == 1 &&
6997 "Can not convert non-constant vector");
6998 uint64_t Immediate = 0;
6999 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7000 SDValue In = Op.getOperand(idx);
7002 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7005 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7006 return DAG.getConstant(Immediate, dl, VT);
7008 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7010 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7012 MVT VT = Op.getSimpleValueType();
7013 assert((VT.getVectorElementType() == MVT::i1) &&
7014 "Unexpected type in LowerBUILD_VECTORvXi1!");
7017 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7018 return DAG.getTargetConstant(0, dl, VT);
7020 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7021 return DAG.getTargetConstant(1, dl, VT);
7023 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7024 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7025 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7026 return DAG.getBitcast(VT, Imm);
7027 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7028 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7029 DAG.getIntPtrConstant(0, dl));
7032 // Vector has one or more non-const elements
7033 uint64_t Immediate = 0;
7034 SmallVector<unsigned, 16> NonConstIdx;
7035 bool IsSplat = true;
7036 bool HasConstElts = false;
7038 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7039 SDValue In = Op.getOperand(idx);
7042 if (!isa<ConstantSDNode>(In))
7043 NonConstIdx.push_back(idx);
7045 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7046 HasConstElts = true;
7050 else if (In != Op.getOperand(SplatIdx))
7054   // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7056 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7057 DAG.getConstant(1, dl, VT),
7058 DAG.getConstant(0, dl, VT));
7060   // Insert elements one by one.
7064 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7065 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7067 else if (HasConstElts)
7068 Imm = DAG.getConstant(0, dl, VT);
7070 Imm = DAG.getUNDEF(VT);
7071 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7072 DstVec = DAG.getBitcast(VT, Imm);
7074 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7075 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7076 DAG.getIntPtrConstant(0, dl));
7079 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7080 unsigned InsertIdx = NonConstIdx[i];
7081 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7082 Op.getOperand(InsertIdx),
7083 DAG.getIntPtrConstant(InsertIdx, dl));
7088 /// \brief Return true if \p N implements a horizontal binop and return the
7089 /// operands for the horizontal binop into V0 and V1.
7091 /// This is a helper function of LowerToHorizontalOp().
7092 /// This function checks that the build_vector \p N in input implements a
7093 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7094 /// operation to match.
7095 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7096 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7097 /// is equal to ISD::SUB, then this function checks if this is a horizontal
7098 /// arithmetic sub.
7100 /// This function only analyzes elements of \p N whose indices are
7101 /// in range [BaseIdx, LastIdx).
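/// For example (illustrative), the v4i32 build_vector
///   <(add (extractelt A,0), (extractelt A,1)),
///    (add (extractelt A,2), (extractelt A,3)),
///    (add (extractelt B,0), (extractelt B,1)),
///    (add (extractelt B,2), (extractelt B,3))>
/// matches a horizontal add with V0 = A and V1 = B.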
7102 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7104 unsigned BaseIdx, unsigned LastIdx,
7105 SDValue &V0, SDValue &V1) {
7106 EVT VT = N->getValueType(0);
7108 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7109 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7110 "Invalid Vector in input!");
7112 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7113 bool CanFold = true;
7114 unsigned ExpectedVExtractIdx = BaseIdx;
7115 unsigned NumElts = LastIdx - BaseIdx;
7116 V0 = DAG.getUNDEF(VT);
7117 V1 = DAG.getUNDEF(VT);
7119 // Check if N implements a horizontal binop.
7120 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7121 SDValue Op = N->getOperand(i + BaseIdx);
7124 if (Op->isUndef()) {
7125 // Update the expected vector extract index.
7126 if (i * 2 == NumElts)
7127 ExpectedVExtractIdx = BaseIdx;
7128 ExpectedVExtractIdx += 2;
7132 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7137 SDValue Op0 = Op.getOperand(0);
7138 SDValue Op1 = Op.getOperand(1);
7140 // Try to match the following pattern:
7141 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7142 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7143 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7144 Op0.getOperand(0) == Op1.getOperand(0) &&
7145 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7146 isa<ConstantSDNode>(Op1.getOperand(1)));
7150 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7151 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7153 if (i * 2 < NumElts) {
7155 V0 = Op0.getOperand(0);
7156 if (V0.getValueType() != VT)
7161 V1 = Op0.getOperand(0);
7162 if (V1.getValueType() != VT)
7165 if (i * 2 == NumElts)
7166 ExpectedVExtractIdx = BaseIdx;
7169 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7170 if (I0 == ExpectedVExtractIdx)
7171 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7172 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7173 // Try to match the following dag sequence:
7174 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7175 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7179 ExpectedVExtractIdx += 2;
7185 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7186 /// a concat_vector.
7188 /// This is a helper function of LowerToHorizontalOp().
7189 /// This function expects two 256-bit vectors called V0 and V1.
7190 /// At first, each vector is split into two separate 128-bit vectors.
7191 /// Then, the resulting 128-bit vectors are used to implement two
7192 /// horizontal binary operations.
7194 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7196 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7197 /// the two new horizontal binops.
7198 /// When Mode is set, the first horizontal binop dag node would take as input
7199 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7200 /// horizontal binop dag node would take as input the lower 128-bit of V1
7201 /// and the upper 128-bit of V1.
7203 /// HADD V0_LO, V0_HI
7204 /// HADD V1_LO, V1_HI
7206 /// Otherwise, the first horizontal binop dag node takes as input the lower
7207 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7208 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7210 /// HADD V0_LO, V1_LO
7211 /// HADD V0_HI, V1_HI
7213 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7214 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7215 /// the upper 128-bits of the result.
7216 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7217 const SDLoc &DL, SelectionDAG &DAG,
7218 unsigned X86Opcode, bool Mode,
7219 bool isUndefLO, bool isUndefHI) {
7220 MVT VT = V0.getSimpleValueType();
7221 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7222 "Invalid nodes in input!");
7224 unsigned NumElts = VT.getVectorNumElements();
7225 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7226 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7227 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7228 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7229 MVT NewVT = V0_LO.getSimpleValueType();
7231 SDValue LO = DAG.getUNDEF(NewVT);
7232 SDValue HI = DAG.getUNDEF(NewVT);
7234   if (Mode) {
7235     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7236 if (!isUndefLO && !V0->isUndef())
7237 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7238 if (!isUndefHI && !V1->isUndef())
7239 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7240   } else {
7241     // Don't emit a horizontal binop if the result is expected to be UNDEF.
7242 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7243 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7245 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7246 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7247   }
7249   return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7252 /// Returns true iff \p BV builds a vector with the result equivalent to
7253 /// the result of an ADDSUB operation.
7254 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7255 /// are written to the parameters \p Opnd0 and \p Opnd1.
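/// For example (illustrative), the v4f32 build_vector
///   <(fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3)>
/// matches, since ADDSUB subtracts in even lanes and adds in odd lanes;
/// here Opnd0 = A and Opnd1 = B.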
7256 static bool isAddSub(const BuildVectorSDNode *BV,
7257 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7258 SDValue &Opnd0, SDValue &Opnd1) {
7260 MVT VT = BV->getSimpleValueType(0);
7261 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7262 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7263 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7266 unsigned NumElts = VT.getVectorNumElements();
7267 SDValue InVec0 = DAG.getUNDEF(VT);
7268 SDValue InVec1 = DAG.getUNDEF(VT);
7270 // Odd-numbered elements in the input build vector are obtained from
7271 // adding two integer/float elements.
7272 // Even-numbered elements in the input build vector are obtained from
7273 // subtracting two integer/float elements.
7274 unsigned ExpectedOpcode = ISD::FSUB;
7275 unsigned NextExpectedOpcode = ISD::FADD;
7276 bool AddFound = false;
7277 bool SubFound = false;
7279 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7280 SDValue Op = BV->getOperand(i);
7282 // Skip 'undef' values.
7283 unsigned Opcode = Op.getOpcode();
7284 if (Opcode == ISD::UNDEF) {
7285 std::swap(ExpectedOpcode, NextExpectedOpcode);
7289 // Early exit if we found an unexpected opcode.
7290 if (Opcode != ExpectedOpcode)
7293 SDValue Op0 = Op.getOperand(0);
7294 SDValue Op1 = Op.getOperand(1);
7296 // Try to match the following pattern:
7297 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7298 // Early exit if we cannot match that sequence.
7299 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7300 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7301 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7302 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7303 Op0.getOperand(1) != Op1.getOperand(1))
7306 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7310 // We found a valid add/sub node. Update the information accordingly.
7316 // Update InVec0 and InVec1.
7317 if (InVec0.isUndef()) {
7318 InVec0 = Op0.getOperand(0);
7319 if (InVec0.getSimpleValueType() != VT)
7322 if (InVec1.isUndef()) {
7323 InVec1 = Op1.getOperand(0);
7324 if (InVec1.getSimpleValueType() != VT)
7328 // Make sure that operands in input to each add/sub node always
7329 // come from a same pair of vectors.
7330 if (InVec0 != Op0.getOperand(0)) {
7331 if (ExpectedOpcode == ISD::FSUB)
7334 // FADD is commutable. Try to commute the operands
7335 // and then test again.
7336 std::swap(Op0, Op1);
7337 if (InVec0 != Op0.getOperand(0))
7341 if (InVec1 != Op1.getOperand(0))
7344 // Update the pair of expected opcodes.
7345 std::swap(ExpectedOpcode, NextExpectedOpcode);
7348 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7349 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7357 /// Returns true if it is possible to fold a MUL and an idiom that has already
7358 /// been recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7359 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7360 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7362 /// Prior to calling this function it should be known that there is some
7363 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7364 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7365 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7366 /// of \p Opnd0 uses is expected to be equal to 2.
7367 /// For example, this function may be called for the following IR:
7368 /// %AB = fmul fast <2 x double> %A, %B
7369 /// %Sub = fsub fast <2 x double> %AB, %C
7370 /// %Add = fadd fast <2 x double> %AB, %C
7371 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7372 /// <2 x i32> <i32 0, i32 3>
7373 /// There is a def for %Addsub here, which potentially can be replaced by
7374 /// X86ISD::ADDSUB operation:
7375 /// %Addsub = X86ISD::ADDSUB %AB, %C
7376 /// and such ADDSUB can further be replaced with FMADDSUB:
7377 /// %Addsub = FMADDSUB %A, %B, %C.
7379 /// The main reason why this method is called before the replacement of the
7380 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7381 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
7382 /// FMADDSUB is.
7383 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7384 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7385 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7386 !Subtarget.hasAnyFMA())
7389 // FIXME: These checks must match the similar ones in
7390 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7391 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7392 // or MUL + ADDSUB to FMADDSUB.
7393 const TargetOptions &Options = DAG.getTarget().Options;
7395 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7400 Opnd1 = Opnd0.getOperand(1);
7401 Opnd0 = Opnd0.getOperand(0);
7406 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7407 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node.
7408 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7409 const X86Subtarget &Subtarget,
7410 SelectionDAG &DAG) {
7411 SDValue Opnd0, Opnd1;
7412 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7415 MVT VT = BV->getSimpleValueType(0);
7418 // Try to generate X86ISD::FMADDSUB node here.
7419   SDValue Opnd2;
7420   if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7421 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7423 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7424 // the ADDSUB idiom has been successfully recognized. There are no known
7425 // X86 targets with 512-bit ADDSUB instructions!
7426 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom
7428 if (VT.is512BitVector())
7431 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7434 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7435 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7436 const X86Subtarget &Subtarget,
7437 SelectionDAG &DAG) {
7438 MVT VT = BV->getSimpleValueType(0);
7439 unsigned NumElts = VT.getVectorNumElements();
7440 unsigned NumUndefsLO = 0;
7441 unsigned NumUndefsHI = 0;
7442 unsigned Half = NumElts/2;
7444 // Count the number of UNDEF operands in the build_vector in input.
7445 for (unsigned i = 0, e = Half; i != e; ++i)
7446 if (BV->getOperand(i)->isUndef())
7449 for (unsigned i = Half, e = NumElts; i != e; ++i)
7450 if (BV->getOperand(i)->isUndef())
7453 // Early exit if this is either a build_vector of all UNDEFs or all the
7454 // operands but one are UNDEF.
7455 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7459 SDValue InVec0, InVec1;
7460 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7461 // Try to match an SSE3 float HADD/HSUB.
7462 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7463 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7465 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7466 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7467 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7468 // Try to match an SSSE3 integer HADD/HSUB.
7469 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7470 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7472 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7473 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7476 if (!Subtarget.hasAVX())
7479 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7480 // Try to match an AVX horizontal add/sub of packed single/double
7481 // precision floating point values from 256-bit vectors.
7482 SDValue InVec2, InVec3;
7483 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7484 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7485 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7486 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7487 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7489 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7490 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7491 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7492 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7493 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7494 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7495 // Try to match an AVX2 horizontal add/sub of signed integers.
7496 SDValue InVec2, InVec3;
7497     unsigned X86Opcode;
7498     bool CanFold = true;
7500 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7501 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7502 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7503 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7504 X86Opcode = X86ISD::HADD;
7505 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7506 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7507 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7508 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7509 X86Opcode = X86ISD::HSUB;
7514 // Fold this build_vector into a single horizontal add/sub.
7515 // Do this only if the target has AVX2.
7516 if (Subtarget.hasAVX2())
7517 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7519 // Do not try to expand this build_vector into a pair of horizontal
7520 // add/sub if we can emit a pair of scalar add/sub.
7521 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7524 // Convert this build_vector into a pair of horizontal binop followed by
7526 bool isUndefLO = NumUndefsLO == Half;
7527 bool isUndefHI = NumUndefsHI == Half;
7528 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7529 isUndefLO, isUndefHI);
7533 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7534 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7535     unsigned X86Opcode;
7536     if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7537 X86Opcode = X86ISD::HADD;
7538 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7539 X86Opcode = X86ISD::HSUB;
7540 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7541 X86Opcode = X86ISD::FHADD;
7542 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7543 X86Opcode = X86ISD::FHSUB;
7547 // Don't try to expand this build_vector into a pair of horizontal add/sub
7548 // if we can simply emit a pair of scalar add/sub.
7549 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7552 // Convert this build_vector into two horizontal add/sub followed by
7554 bool isUndefLO = NumUndefsLO == Half;
7555 bool isUndefHI = NumUndefsHI == Half;
7556 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7557 isUndefLO, isUndefHI);
7563 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7564 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7565 /// just apply the bit to the vectors.
7566 /// NOTE: It's not in our interest to start making a general-purpose
7567 /// vectorizer from this, but enough scalar bit operations are created from
7568 /// the later legalization + scalarization stages to need basic support.
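/// For example (illustrative), the build_vector
///   <(xor a, 1), (xor b, 2), (xor c, 4), (xor d, 8)>
/// becomes (xor (build_vector a, b, c, d), (build_vector 1, 2, 4, 8)).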
7569 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7570 SelectionDAG &DAG) {
7572 MVT VT = Op->getSimpleValueType(0);
7573 unsigned NumElems = VT.getVectorNumElements();
7574 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7576 // Check that all elements have the same opcode.
7577 // TODO: Should we allow UNDEFS and if so how many?
7578 unsigned Opcode = Op->getOperand(0).getOpcode();
7579 for (unsigned i = 1; i < NumElems; ++i)
7580 if (Opcode != Op->getOperand(i).getOpcode())
7583 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7590 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7595 SmallVector<SDValue, 4> LHSElts, RHSElts;
7596 for (SDValue Elt : Op->ops()) {
7597 SDValue LHS = Elt.getOperand(0);
7598 SDValue RHS = Elt.getOperand(1);
7600 // We expect the canonicalized RHS operand to be the constant.
7601 if (!isa<ConstantSDNode>(RHS))
7603 LHSElts.push_back(LHS);
7604 RHSElts.push_back(RHS);
7607 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7608 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7609 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7612 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7613 /// functionality to do this, so it's all zeros, all ones, or some derivation
7614 /// that is cheap to calculate.
7615 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7616 const X86Subtarget &Subtarget) {
7618 MVT VT = Op.getSimpleValueType();
7620 // Vectors containing all zeros can be matched by pxor and xorps.
7621 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7622 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7623 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7624 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7627 return getZeroVector(VT, Subtarget, DAG, DL);
7630 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7631 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7632 // vpcmpeqd on 256-bit vectors.
7633 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7634 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7635 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7638 return getOnesVector(VT, DAG, DL);
7645 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7648 MVT VT = Op.getSimpleValueType();
7649 MVT ExtVT = VT.getVectorElementType();
7650 unsigned NumElems = Op.getNumOperands();
7652 // Generate vectors for predicate vectors.
7653 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7654 return LowerBUILD_VECTORvXi1(Op, DAG);
7656 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7657 return VectorConstant;
7659 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7660 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7662 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7663 return HorizontalOp;
7664 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7666 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7669 unsigned EVTBits = ExtVT.getSizeInBits();
7671 unsigned NumZero = 0;
7672 unsigned NumNonZero = 0;
7673 uint64_t NonZeros = 0;
7674 bool IsAllConstants = true;
7675 SmallSet<SDValue, 8> Values;
7676 for (unsigned i = 0; i < NumElems; ++i) {
7677 SDValue Elt = Op.getOperand(i);
7681 if (Elt.getOpcode() != ISD::Constant &&
7682 Elt.getOpcode() != ISD::ConstantFP)
7683 IsAllConstants = false;
7684 if (X86::isZeroNode(Elt))
7687 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7688 NonZeros |= ((uint64_t)1 << i);
7693 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7694 if (NumNonZero == 0)
7695 return DAG.getUNDEF(VT);
  // Special case for single non-zero, non-undef, element.
  if (NumNonZero == 1) {
    unsigned Idx = countTrailingZeros(NonZeros);
    SDValue Item = Op.getOperand(Idx);

    // If this is an insertion of an i64 value on x86-32, and if the top bits
    // of the value are obviously zero, truncate the value to i32 and do the
    // insertion that way. Only do this if the value is non-constant or if the
    // value is a constant being inserted into element 0. It is cheaper to do
    // a constant pool load than it is to do a movd + shuffle.
    if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
        (!IsAllConstants || Idx == 0)) {
      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
        // Handle SSE only.
        assert(VT == MVT::v2i64 && "Expected an SSE value type!");
        MVT VecVT = MVT::v4i32;

        // Truncate the value (which may itself be a constant) to i32, and
        // convert it to a vector with movd (S2V+shuffle to zero extend).
        Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
        return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
                                      Item, Idx * 2, true, Subtarget, DAG));
      }
    }

    // If we have a constant or non-constant insertion into the low element of
    // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
    // the rest of the elements. This will be matched as movd/movq/movss/movsd
    // depending on what the source datatype is.
    if (Idx == 0) {
      if (NumZero == 0)
        return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

      if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
          (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
        assert((VT.is128BitVector() || VT.is256BitVector() ||
                VT.is512BitVector()) &&
               "Expected an SSE value type!");
        Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
        // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
        return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      }

      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
      if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
        Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
        if (VT.getSizeInBits() >= 256) {
          MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
          if (Subtarget.hasAVX()) {
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
            // insert into the 256-bit vector.
            Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
            SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
          Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
          Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
        }
        return DAG.getBitcast(VT, Item);
      }
    }

    // Is it a vector logical left shift?
    if (NumElems == 2 && Idx == 1 &&
        X86::isZeroNode(Op.getOperand(0)) &&
        !X86::isZeroNode(Op.getOperand(1))) {
      unsigned NumBits = VT.getSizeInBits();
      return getVShift(true, VT,
                       DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   VT, Op.getOperand(1)),
                       NumBits/2, DAG, *this, dl);
    }

    if (IsAllConstants) // Otherwise, it's better to do a constpool load.
      return SDValue();

    // Otherwise, if this is a vector with i32 or f32 elements, and the
    // element is a non-constant being inserted into an element other than
    // the low one, we can't use a constant pool load. Instead, use
    // SCALAR_TO_VECTOR (aka movd/movss) to move this into the low element,
    // then shuffle it into place.
    if (EVTBits == 32) {
      Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
      return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget,
                                         DAG);
    }
  }
  // Splat is obviously ok. Let legalizer expand it to a shuffle.
  if (Values.size() == 1) {
    if (EVTBits == 32) {
      // Instead of a shuffle like this:
      // shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // Check if it's possible to issue this instead.
      // shuffle (vload ptr)), undef, <1, 1, 1, 1>
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue Item = Op.getOperand(Idx);
      if (Op.getNode()->isOnlyUserOf(Item.getNode()))
        return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
    }
    return SDValue();
  }
  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();

  // See if we can use a vector load to get all of the elements.
  if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
  }
  // For AVX-length vectors, build the individual 128-bit pieces and use
  // shuffles to put them in place.
  if (VT.is256BitVector() || VT.is512BitVector()) {
    SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);

    EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);

    // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
    SDValue Upper = DAG.getBuildVector(
        HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));

    // Recreate the wider vector with the lower and upper part.
    if (VT.is256BitVector())
      return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
    return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
  }
  // Let legalizer expand 2-wide build_vectors.
  if (EVTBits == 64) {
    if (NumNonZero == 1) {
      // One half is zero or undef.
      unsigned Idx = countTrailingZeros(NonZeros);
      SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
                               Op.getOperand(Idx));
      return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
    }
    return SDValue();
  }
  // If element VT is < 32 bits, convert it to inserts into a zero vector.
  if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;

  // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
  if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;

  // If element VT is == 32 bits, turn it into a number of shuffles.
  if (NumElems == 4 && NumZero > 0) {
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    }

    for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
        default: break;
        case 0:
          Ops[i] = Ops[i*2];  // Must be a zero vector.
          break;
        case 1:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]);
          break;
        case 2:
          Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
        case 3:
          Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]);
          break;
      }
    }

    bool Reverse1 = (NonZeros & 0x3) == 2;
    bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {
      Reverse1 ? 1 : 0,
      Reverse1 ? 0 : 1,
      static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
      static_cast<int>(Reverse2 ? NumElems   : NumElems+1)
    };
    return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
  }
  if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector from mostly shuffle plus few inserting.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;

    // For SSE 4.1, use insertps to put the high elements into the low element.
    if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);

      for (unsigned i = 1; i < NumElems; ++i) {
        if (Op.getOperand(i).isUndef()) continue;
        Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                             Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
      }
      return Result;
    }

    // Otherwise, expand into a number of unpckl*, start by extending each of
    // our (non-undef) elements to the full vector width with the element in
    // the bottom slot of the vector (which generates no code for SSE).
    SmallVector<SDValue, 8> Ops(NumElems);
    for (unsigned i = 0; i < NumElems; ++i) {
      if (!Op.getOperand(i).isUndef())
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
      else
        Ops[i] = DAG.getUNDEF(VT);
    }

    // Next, we iteratively mix elements, e.g. for v4f32:
    //   Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
    //         : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
    //   Step 2: unpcklpd X, Y ==>    <3, 2, 1, 0>
    for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
      // Generate scaled UNPCKL shuffle mask.
      SmallVector<int, 16> Mask;
      for(unsigned i = 0; i != Scale; ++i)
        Mask.push_back(i);
      for (unsigned i = 0; i != Scale; ++i)
        Mask.push_back(NumElems+i);
      Mask.append(NumElems - Mask.size(), SM_SentinelUndef);

      for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
        Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
    }
    return Ops[0];
  }
  return SDValue();
}
// 256-bit AVX can use the vinsertf128 instruction
// to create 256-bit vectors from two other 128-bit ones.
static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();

  assert((ResVT.is256BitVector() ||
          ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  if (ResVT.is256BitVector())
    return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);

  if (Op.getNumOperands() == 4) {
    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SDValue V3 = Op.getOperand(2);
    SDValue V4 = Op.getOperand(3);
    return concat256BitVectors(
        concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
  return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
}
// Return true if all the operands of the given CONCAT_VECTORS node are zeros
// except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
static bool isExpandWithZeros(const SDValue &Op) {
  assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
         "Expand with zeros only possible in CONCAT_VECTORS nodes!");

  for (unsigned i = 1; i < Op.getNumOperands(); i++)
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
      return false;

  return true;
}

// Returns true if the given node is a type promotion (by concatenating i1
// zeros) of the result of a node that already zeros all upper bits of
// k-register.
static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
  unsigned Opc = Op.getOpcode();

  assert(Opc == ISD::CONCAT_VECTORS &&
         Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected node to check for type promotion!");

  // As long as we are concatenating zeros to the upper part of a previous node
  // result, climb up the tree until a node with different opcode is
  // encountered.
  while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
    if (Opc == ISD::INSERT_SUBVECTOR) {
      if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
          Op.getConstantOperandVal(2) == 0)
        Op = Op.getOperand(1);
      else
        return SDValue();
    } else { // Opc == ISD::CONCAT_VECTORS
      if (isExpandWithZeros(Op))
        Op = Op.getOperand(0);
      else
        return SDValue();
    }
    Opc = Op.getOpcode();
  }

  // Check if the first inserted node zeroes the upper bits, or an 'and' result
  // of a node that zeros the upper bits (its masked version).
  if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
      (Op.getOpcode() == ISD::AND &&
       (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
    return Op;
  }

  return SDValue();
}
static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
  unsigned NumOfOperands = Op.getNumOperands();

  assert(isPowerOf2_32(NumOfOperands) &&
         "Unexpected number of operands in CONCAT_VECTORS");

  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with instruction that zeroes all upper (irrelevant) bits of the
  // output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
  if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
    SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
    SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
                       ZeroC);
  }

  SDValue Undef = DAG.getUNDEF(ResVT);
  if (NumOfOperands > 2) {
    // Specialize the cases when all, or all but one, of the operands are undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return Undef;
    if (NumOfDefinedOps == 1) {
      unsigned SubVecNumElts =
        Op.getOperand(OpIdx).getValueType().getVectorNumElements();
      SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
      return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
                         Op.getOperand(OpIdx), IdxVal);
    }

    MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
                                  ResVT.getVectorNumElements()/2);
    SmallVector<SDValue, 2> Ops;
    for (unsigned i = 0; i < NumOfOperands/2; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
    for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
      Ops.push_back(Op.getOperand(i));
    SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
  }

  SDValue V1 = Op.getOperand(0);
  SDValue V2 = Op.getOperand(1);
  unsigned NumElems = ResVT.getVectorNumElements();
  assert(V1.getValueType() == V2.getValueType() &&
         V1.getValueType().getVectorNumElements() == NumElems/2 &&
         "Unexpected operands in CONCAT_VECTORS");

  if (ResVT.getSizeInBits() >= 16)
    return Op; // The operation is legal with KUNPCK

  bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
  SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);

  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);

  V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  if (VT.getVectorElementType() == MVT::i1)
    return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);

  assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
         (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
          Op.getNumOperands() == 4)));

  // AVX can use the vinsertf128 instruction to create 256-bit vectors
  // from two other 128-bit ones.

  // 512-bit vector may contain 2 256-bit vectors or 4 128-bit vectors
  return LowerAVXCONCAT_VECTORS(Op, DAG);
}
//===----------------------------------------------------------------------===//
// Vector shuffle lowering
//
// This is an experimental code path for lowering vector shuffles on x86. It is
// designed to handle arbitrary vector shuffles and blends, gracefully
// degrading performance as necessary. It works hard to recognize idiomatic
// shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// operations.
//===----------------------------------------------------------------------===//
/// \brief Tiny helper function to identify a no-op mask.
///
/// This is a somewhat boring predicate function. It checks whether the mask
/// array input, which is assumed to be a single-input shuffle mask of the kind
/// used by the X86 shuffle instructions (not a fully general
/// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
/// in-place shuffle are 'no-op's.
static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }
  return true;
}
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}
/// \brief Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
/// lane-relative shuffle in each sub-lane. This trivially implies
/// that it is also not lane-crossing. It may however involve a blend from the
/// same lane of a second vector.
///
/// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
/// non-trivial to compute in the face of undef lanes. The representation is
/// suitable for use with existing 128-bit shuffles as entries from the second
/// vector have been remapped to [LaneSize, 2*LaneSize).
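///
/// For example (illustrative): the v8f32 mask <0,1,10,11,4,5,14,15> performs
/// the same pattern in both 128-bit lanes, so RepeatedMask becomes <0,1,6,7>;
/// entries 10 and 14 come from the matching lane of the second vector and are
/// remapped into the [4,8) range.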
static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                  ArrayRef<int> Mask,
                                  SmallVectorImpl<int> &RepeatedMask) {
  auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, -1);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
                                : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] < 0)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}

/// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
}

/// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
                                SmallVectorImpl<int> &RepeatedMask) {
  return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
}
/// Test whether a target shuffle mask is equivalent within each sub-lane.
/// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
                                        ArrayRef<int> Mask,
                                        SmallVectorImpl<int> &RepeatedMask) {
  int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
  RepeatedMask.assign(LaneSize, SM_SentinelUndef);
  int Size = Mask.size();
  for (int i = 0; i < Size; ++i) {
    assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
    if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
    if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;

    // Ok, handle the in-lane shuffles by detecting if and when they repeat.
    // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
    if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
      // This is the first non-undef entry in this slot of a 128-bit lane.
      RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }
  return true;
}
/// \brief Checks whether a shuffle mask is equivalent to an explicit list of
/// arguments.
///
/// This is a fast way to test a shuffle mask against a fixed pattern:
///
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
///
/// It returns true if the mask is exactly as wide as the argument list, and
/// each element of the mask is either -1 (signifying undef) or the value given
/// in the argument.
static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
                                ArrayRef<int> ExpectedMask) {
  if (Mask.size() != ExpectedMask.size())
    return false;

  int Size = Mask.size();

  // If the values are build vectors, we can look through them to find
  // equivalent inputs that make the shuffles equivalent.
  auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
  auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);

  for (int i = 0; i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
      auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
      auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }
  return true;
}
/// Checks whether a target shuffle mask is equivalent to an explicit pattern.
///
/// The masks must be exactly the same width.
///
/// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
/// value in ExpectedMask is always accepted. Otherwise the indices must match.
///
/// SM_SentinelZero is accepted as a valid negative index but must match in
/// both.
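///
/// For example (illustrative): Mask <-1, 1, SM_SentinelZero, 3> matches
/// ExpectedMask <0, 1, SM_SentinelZero, 3> because the undef element accepts
/// anything, but it does not match <0, 1, 2, 3>, where the zero sentinel has
/// no counterpart.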
static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
                                      ArrayRef<int> ExpectedMask) {
  int Size = Mask.size();
  if (Size != (int)ExpectedMask.size())
    return false;

  for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
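// For example (illustrative): Mask <0, 5, 2, 7> with bit 1 set in Zeroable
// yields the target mask <0, SM_SentinelZero, 2, 7>.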
static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
                                                    const APInt &Zeroable) {
  int NumElts = Mask.size();
  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");

  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
  for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
  if (VT != MVT::v8i32 && VT != MVT::v8f32)
    return false;

  SmallVector<int, 8> Unpcklwd;
  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
                          /* Unary = */ false);
  SmallVector<int, 8> Unpckhwd;
  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
                          /* Unary = */ false);
  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
                         isTargetShuffleEquivalent(Mask, Unpckhwd));
  return IsUnpackwdMask;
}
/// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
///
/// This helper function produces an 8-bit shuffle immediate corresponding to
/// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions for
/// instance.
///
/// NB: We rely heavily on "undef" masks preserving the input lane.
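///
/// For example (illustrative): the mask <3, 1, 2, 0> encodes as
/// 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27, i.e. two bits per result lane,
/// least-significant pair first.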
static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
  assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
  assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
  assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
  assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
  assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");

  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
  Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
  Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}

static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
                                          SelectionDAG &DAG) {
  return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
/// \brief Compute whether each element of a shuffle is zeroable.
///
/// A "zeroable" vector shuffle element is one which can be lowered to zero.
/// Either it is an undef element in the shuffle mask, the element of the input
/// referenced is undef, or the element of the input referenced is known to be
/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
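///
/// For example (illustrative): with V1 = BUILD_VECTOR <0, x, undef, y> and
/// the mask <0, 5, 2, 7>, elements 0 and 2 are zeroable (a known-zero input
/// element and an undef input element, respectively), regardless of V2.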
static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
                                            SDValue V1, SDValue V2) {
  APInt Zeroable(Mask.size(), 0);
  V1 = peekThroughBitcasts(V1);
  V2 = peekThroughBitcasts(V2);

  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());

  int VectorSizeInBits = V1.getValueSizeInBits();
  int ScalarSizeInBits = VectorSizeInBits / Mask.size();
  assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    int M = Mask[i];
    // Handle the easy cases.
    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
      Zeroable.setBit(i);
      continue;
    }

    // Determine shuffle input and normalize the mask.
    SDValue V = M < Size ? V1 : V2;
    M %= Size;

    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
    if (V.getOpcode() != ISD::BUILD_VECTOR)
      continue;

    // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
    // the (larger) source element must be UNDEF/ZERO.
    if ((Size % V.getNumOperands()) == 0) {
      int Scale = Size / V->getNumOperands();
      SDValue Op = V.getOperand(M / Scale);
      if (Op.isUndef() || X86::isZeroNode(Op))
        Zeroable.setBit(i);
      else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
        APInt Val = Cst->getAPIntValue();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
        APInt Val = Cst->getValueAPF().bitcastToAPInt();
        Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
        Val = Val.getLoBits(ScalarSizeInBits);
        if (Val == 0)
          Zeroable.setBit(i);
      }
      continue;
    }

    // If the BUILD_VECTOR has more elements then all the (smaller) source
    // elements must be UNDEF or ZERO.
    if ((V.getNumOperands() % Size) == 0) {
      int Scale = V->getNumOperands() / Size;
      bool AllZeroable = true;
      for (int j = 0; j < Scale; ++j) {
        SDValue Op = V.getOperand((M * Scale) + j);
        AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
      }
      if (AllZeroable)
        Zeroable.setBit(i);
      continue;
    }
  }

  return Zeroable;
}
// The shuffle result has the form:
//   0*a[0], 0*a[1], ..., 0*a[n], with n >= 0, the a[] elements in ascending
//   order, and "0*" denoting a (possibly empty) run of zero elements.
// Each element of Zeroable corresponds to one element of Mask, as computed
// by computeZeroableShuffleElements.
//
// The function checks whether the nonzero elements of the mask form such an
// increasing sub-mask and returns true if they do.
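// For example (illustrative): for a v4 shuffle, Mask <2, 3, 4, 5> with
// Zeroable = {1,1,0,0} has its nonzero elements <4, 5> starting at element 0
// of the second input and increasing by one, so it matches with
// IsZeroSideLeft = true (the zeros occupy the low elements).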
static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                     ArrayRef<int> Mask, const EVT &VectorType,
                                     bool &IsZeroSideLeft) {
  int NextElement = -1;
  // Check if the Mask's nonzero elements are in increasing order.
  for (int i = 0, e = Mask.size(); i < e; i++) {
    // Checks if the mask's zeros elements are built from only zeros.
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] < 0)
      return false;
    if (Zeroable[i])
      continue;
    // Find the lowest non zero element
    if (NextElement < 0) {
      NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
      IsZeroSideLeft = NextElement != 0;
    }
    // Exit if the mask's non zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
/// Try to lower a shuffle with a single PSHUFB of V1 or V2.
static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                            ArrayRef<int> Mask, SDValue V1,
                                            SDValue V2,
                                            const APInt &Zeroable,
                                            const X86Subtarget &Subtarget,
                                            SelectionDAG &DAG) {
  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  const int NumBytes = VT.getSizeInBits() / 8;
  const int NumEltBytes = VT.getScalarSizeInBits() / 8;

  assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
         (Subtarget.hasAVX2() && VT.is256BitVector()) ||
         (Subtarget.hasBWI() && VT.is512BitVector()));

  SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
  // Sign bit set in i8 mask means zero element.
  SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);

  SDValue V;
  for (int i = 0; i < NumBytes; ++i) {
    int M = Mask[i / NumEltBytes];
    if (M < 0) {
      PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
      continue;
    }
    if (Zeroable[i / NumEltBytes]) {
      PSHUFBMask[i] = ZeroMask;
      continue;
    }

    // We can only use a single input of V1 or V2.
    SDValue SrcV = (M >= Size ? V2 : V1);
    if (V && V != SrcV)
      return SDValue();
    V = SrcV;
    M %= Size;

    // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();

    M = M % LaneSize;
    M = M * NumEltBytes + (i % NumEltBytes);
    PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
  }
  assert(V && "Failed to find a source input");

  MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
  return DAG.getBitcast(
      VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
                      DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
}
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl);

// X86 has a dedicated shuffle that can be lowered to VEXPAND.
static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                                          const APInt &Zeroable,
                                          ArrayRef<int> Mask, SDValue &V1,
                                          SDValue &V2, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget) {
  bool IsLeftZeroSide = true;
  if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                IsLeftZeroSide))
    return SDValue();
  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
  SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
  unsigned NumElts = VT.getVectorNumElements();
  assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
         "Unexpected number of vector elements");
  SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
                              Subtarget, DAG, DL);
  SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
  SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
  return DAG.getSelect(DL, VT, VMask,
                       DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
                       ZeroVector);
}
static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
                                        unsigned &UnpackOpcode, bool IsUnary,
                                        ArrayRef<int> TargetMask, SDLoc &DL,
                                        SelectionDAG &DAG,
                                        const X86Subtarget &Subtarget) {
  int NumElts = VT.getVectorNumElements();

  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
  for (int i = 0; i != NumElts; i += 2) {
    int M1 = TargetMask[i + 0];
    int M2 = TargetMask[i + 1];
    Undef1 &= (SM_SentinelUndef == M1);
    Undef2 &= (SM_SentinelUndef == M2);
    Zero1 &= isUndefOrZero(M1);
    Zero2 &= isUndefOrZero(M2);
  }
  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
         "Zeroable shuffle detected");

  // Attempt to match the target mask against the unpack lo/hi mask patterns.
  SmallVector<int, 64> Unpckl, Unpckh;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
    UnpackOpcode = X86ISD::UNPCKL;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
    UnpackOpcode = X86ISD::UNPCKH;
    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
    return true;
  }

  // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
  if (IsUnary && (Zero1 || Zero2)) {
    // Don't bother if we can blend instead.
    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
      return false;

    bool MatchLo = true, MatchHi = true;
    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
      int M = TargetMask[i];

      // Ignore if the input is known to be zero or the index is undef.
      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
          (M == SM_SentinelUndef))
        continue;

      MatchLo &= (M == Unpckl[i]);
      MatchHi &= (M == Unpckh[i]);
    }

    if (MatchLo || MatchHi) {
      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
      return true;
    }
  }

  // If a binary shuffle, commute and try again.
  if (!IsUnary) {
    ShuffleVectorSDNode::commuteMask(Unpckl);
    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
      UnpackOpcode = X86ISD::UNPCKL;
      std::swap(V1, V2);
      return true;
    }

    ShuffleVectorSDNode::commuteMask(Unpckh);
    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
      UnpackOpcode = X86ISD::UNPCKH;
      std::swap(V1, V2);
      return true;
    }
  }

  return false;
}
// X86 has dedicated unpack instructions that can handle specific blend
// operations: UNPCKH and UNPCKL.
static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
                                           ArrayRef<int> Mask, SDValue V1,
                                           SDValue V2, SelectionDAG &DAG) {
  SmallVector<int, 8> Unpckl;
  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);

  SmallVector<int, 8> Unpckh;
  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);

  // Commute and try again.
  ShuffleVectorSDNode::commuteMask(Unpckl);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
    return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);

  ShuffleVectorSDNode::commuteMask(Unpckh);
  if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
    return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);

  return SDValue();
}
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
/// one of the inputs being zeroable.
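///
/// For example (illustrative): the v4i32 shuffle <4, 1, 6, 3> where V2 is
/// all-zeros is just V1 & <0, -1, 0, -1>, so it lowers to a single AND with
/// a constant mask.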
static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                           SDValue V2, ArrayRef<int> Mask,
                                           const APInt &Zeroable,
                                           SelectionDAG &DAG) {
  assert(!VT.isFloatingPoint() && "Floating point types are not supported");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!

  SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
  return DAG.getNode(ISD::AND, DL, VT, V, VMask);
}
/// \brief Try to emit a blend instruction for a shuffle using bit math.
///
/// This is used as a fallback approach when first class blend instructions are
/// unavailable. Currently it is only suitable for integer vectors, but could
/// be generalized for floating point vectors if desirable.
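///
/// In effect it computes (V1 & M) | (V2 & ~M) for a constant element mask M
/// built from the shuffle: each result element of M is all-ones when the
/// element comes from V1 and zero when it comes from V2.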
static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
                                            SelectionDAG &DAG) {
  assert(VT.isInteger() && "Only supports integer vector types!");
  MVT EltVT = VT.getVectorElementType();
  SDValue Zero = DAG.getConstant(0, DL, EltVT);
  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
  SmallVector<SDValue, 16> MaskOps;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
      return SDValue(); // Shuffled input!
    MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
  }

  SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
  V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
  // We have to cast V2 around.
  MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
  V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
                                      DAG.getBitcast(MaskVT, V1Mask),
                                      DAG.getBitcast(MaskVT, V2)));
  return DAG.getNode(ISD::OR, DL, VT, V1, V2);
}
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG);

static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
                                      MutableArrayRef<int> TargetMask,
                                      bool &ForceV1Zero, bool &ForceV2Zero,
                                      uint64_t &BlendMask) {
  bool V1IsZeroOrUndef =
      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
  bool V2IsZeroOrUndef =
      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());

  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");

  // Attempt to generate the binary blend mask. If an input is zero then
  // we can use any lane.
  // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
    int M = TargetMask[i];
    if (M == SM_SentinelUndef)
      continue;
    if (M == i)
      continue;
    if (M == i + Size) {
      BlendMask |= 1ull << i;
      continue;
    }
    if (M == SM_SentinelZero) {
      if (V1IsZeroOrUndef) {
        ForceV1Zero = true;
        TargetMask[i] = i;
        continue;
      }
      if (V2IsZeroOrUndef) {
        ForceV2Zero = true;
        BlendMask |= 1ull << i;
        TargetMask[i] = i + Size;
        continue;
      }
    }
    return false;
  }
  return true;
}
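// Scales a per-element blend mask to a wider element type by repeating each
// mask bit Scale times. For example (illustrative): BlendMask 0b0101 with
// Size = 4 and Scale = 2 becomes 0b00110011.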
uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
  uint64_t ScaledMask = 0;
  for (int i = 0; i != Size; ++i)
    if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
/// \brief Try to emit a blend instruction for a shuffle.
///
/// This doesn't do any checks for the availability of instructions for blending
/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
/// be matched in the backend with the type given. What it does check for is
/// that the shuffle mask is a blend, or convertible into a blend with zero.
static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Original,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);

  uint64_t BlendMask = 0;
  bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();

  // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);

  switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
                       DAG.getConstant(BlendMask, DL, MVT::i8));

  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
    // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
    // that instruction.
    if (Subtarget.hasAVX2()) {
      // Scale the blend by the number of 32-bit dwords per element.
      int Scale = VT.getScalarSizeInBits() / 32;
      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
      MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
      V1 = DAG.getBitcast(BlendVT, V1);
      V2 = DAG.getBitcast(BlendVT, V2);
      return DAG.getBitcast(
          VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
                          DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
    // v8i16s prior to blending.
    int Scale = 8 / VT.getVectorNumElements();
    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = DAG.getBitcast(MVT::v8i16, V2);
    return DAG.getBitcast(VT,
                          DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
                                      DAG.getConstant(BlendMask, DL, MVT::i8)));
  }

  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    SmallVector<int, 8> RepeatedMask;
    if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
      // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
      assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
      BlendMask = 0;
      for (int i = 0; i < 8; ++i)
        if (RepeatedMask[i] >= 8)
          BlendMask |= 1ull << i;
      return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
                         DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
           "256-bit byte-blends require AVX2 support!");

    if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
      SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
      return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
    }

    // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
    if (SDValue Masked =
            lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
      return Masked;

    // Scale the blend by the number of bytes per element.
    int Scale = VT.getScalarSizeInBits() / 8;

    // This form of blend is always done on bytes. Compute the byte vector
    // type.
    MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);

    // Compute the VSELECT mask. Note that VSELECT is really confusing in the
    // mix of LLVM's code generator and the x86 backend. We tell the code
    // generator that boolean values in the elements of an x86 vector register
    // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
    // mapping a select to operand #1, and 'false' mapping to operand #2. The
    // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
    // of the element (the remaining are ignored) and 0 in that high bit would
    // mean operand #1 while 1 in the high bit would mean operand #2. So while
    // the LLVM model for boolean values in vector elements gets the relevant
    // bit set, it is set backwards and over constrained relative to x86's
    // actual model.
    SmallVector<SDValue, 32> VSELECTMask;
    for (int i = 0, Size = Mask.size(); i < Size; ++i)
      for (int j = 0; j < Scale; ++j)
        VSELECTMask.push_back(
            Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
                        : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
                                          MVT::i8));

    V1 = DAG.getBitcast(BlendVT, V1);
    V2 = DAG.getBitcast(BlendVT, V2);
    return DAG.getBitcast(
        VT,
        DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
                      V1, V2));
  }
  case MVT::v16f32:
  case MVT::v8f64:
  case MVT::v8i64:
  case MVT::v16i32:
  case MVT::v32i16:
  case MVT::v64i8: {
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
    SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
    return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  }

  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
/// \brief Try to lower as a blend of elements from two inputs followed by
/// a single-input permutation.
///
/// This matches the pattern where we can blend elements from two inputs and
/// then reduce the shuffle to a single-input permutation.
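///
/// For example (illustrative): the v4 mask <1, 4, 3, 6> first blends to
/// <4, 1, 6, 3> (elements 0 and 2 taken from V2, elements 1 and 3 from V1)
/// and then applies the single-input permutation <1, 0, 3, 2>.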
static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
                                                   SDValue V1, SDValue V2,
                                                   ArrayRef<int> Mask,
                                                   SelectionDAG &DAG) {
  // We build up the blend mask while checking whether a blend is a viable way
  // to reduce the shuffle.
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  SmallVector<int, 32> PermuteMask(Mask.size(), -1);

  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");

    if (BlendMask[Mask[i] % Size] < 0)
      BlendMask[Mask[i] % Size] = Mask[i];
    else if (BlendMask[Mask[i] % Size] != Mask[i])
      return SDValue(); // Can't blend in the needed input!

    PermuteMask[i] = Mask[i] % Size;
  }

  SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
  return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
}
/// \brief Generic routine to decompose a shuffle and blend into independent
/// blends and permutes.
///
/// This matches the extremely common pattern for handling combined
/// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
                                                          MVT VT, SDValue V1,
                                                          SDValue V2,
                                                          ArrayRef<int> Mask,
                                                          SelectionDAG &DAG) {
  // Shuffle the input elements into the desired positions in V1 and V2 and
  // blend them together.
  SmallVector<int, 32> V1Mask(Mask.size(), -1);
  SmallVector<int, 32> V2Mask(Mask.size(), -1);
  SmallVector<int, 32> BlendMask(Mask.size(), -1);
  for (int i = 0, Size = Mask.size(); i < Size; ++i)
    if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
    } else if (Mask[i] >= Size) {
      V2Mask[i] = Mask[i] - Size;
      BlendMask[i] = i + Size;
    }

  // Try to lower with the simpler initial blend strategy unless one of the
  // input shuffles would be a no-op. We prefer to shuffle inputs as the
  // shuffle may be able to fold with a load or other benefit. However, when
  // we'll have to do 2x as many shuffles in order to achieve this, blending
  // first is a better strategy.
  if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
    if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;

  V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
  V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
  return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
/// \brief Try to lower a vector shuffle as a rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask) {
  int NumElts = Mask.size();

  // We need to detect various ways of spelling a rotation:
  //   [11, 12, 13, 14, 15,  0,  1,  2]
  //   [-1, 12, 13, 14, -1, -1,  1, -1]
  //   [-1, -1, -1, -1, -1, -1,  1,  2]
  //   [ 3,  4,  5,  6,  7,  8,  9, 10]
  //   [-1,  4,  5,  6, -1, -1,  9, -1]
  //   [-1,  4,  5,  6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;

    // Determine where a rotated vector would have started.
    int StartIdx = i - (M % NumElts);

    // The identity rotation isn't interesting, stop.
    if (StartIdx == 0)
      return -1;

    // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head.
    int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;

    if (Rotation == 0)
      Rotation = CandidateRotation;
    else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;

    // Compute which value this mask is pointing at.
    SDValue MaskV = M < NumElts ? V1 : V2;

    // Compute which of the two target values this index should be assigned
    // to. This reflects whether the high elements are remaining or the low
    // elements are remaining.
    SDValue &TargetV = StartIdx < 0 ? Hi : Lo;

    // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
    else if (TargetV != MaskV)
      // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
  }

  // Check that we successfully analyzed the mask, and normalize the results.
  assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
/// \brief Try to lower a vector shuffle as a byte rotation.
///
/// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
/// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
/// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
/// does not check for the profitability of lowering either as PALIGNR or
/// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
/// This matches shuffle vectors that look like:
///
///   v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
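///
/// For example (illustrative): the v8i16 mask above matches a rotation of 3
/// elements, and since PALIGNR works on bytes the resulting immediate is
/// 3 * (16 / 8) = 6.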
static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
                                          ArrayRef<int> Mask) {
  // Don't accept any shuffles with zero elements.
  if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
    return -1;

  // PALIGNR works on 128-bit lanes.
  SmallVector<int, 16> RepeatedMask;
  if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
    return -1;

  int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
  if (Rotation <= 0)
    return -1;

  // PALIGNR rotates bytes, so we need to scale the
  // rotation based on how many bytes are in the vector lane.
  int NumElts = RepeatedMask.size();
  int Scale = 16 / NumElts;
  return Rotation * Scale;
}
static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
                                              SDValue V1, SDValue V2,
                                              ArrayRef<int> Mask,
                                              const X86Subtarget &Subtarget,
                                              SelectionDAG &DAG) {
  assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");

  SDValue Lo = V1, Hi = V2;
  int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();

  // Cast the inputs to i8 vector of correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
  MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
  Lo = DAG.getBitcast(ByteVT, Lo);
  Hi = DAG.getBitcast(ByteVT, Hi);

  // SSSE3 targets can use the palignr instruction.
  if (Subtarget.hasSSSE3()) {
    assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
           "512-bit PALIGNR requires BWI instructions");
    return DAG.getBitcast(
        VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
                        DAG.getConstant(ByteRotation, DL, MVT::i8)));
  }

  assert(VT.is128BitVector() &&
         "Rotate-based lowering only supports 128-bit lowering!");
  assert(Mask.size() <= 16 &&
         "Can shuffle at most 16 bytes in a 128-bit vector!");
  assert(ByteVT == MVT::v16i8 &&
         "SSE2 rotate lowering only needed for v16i8!");

  // Default SSE2 implementation
  int LoByteShift = 16 - ByteRotation;
  int HiByteShift = ByteRotation;

  SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
                                DAG.getConstant(LoByteShift, DL, MVT::i8));
  SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
                                DAG.getConstant(HiByteShift, DL, MVT::i8));
  return DAG.getBitcast(VT,
                        DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
}
/// \brief Try to lower a vector shuffle as a dword/qword rotation.
///
/// AVX512 has VALIGND/VALIGNQ instructions that will do an arbitrary
/// rotation of the concatenation of two vectors; this routine will
/// try to generically lower a vector shuffle through such a pattern.
///
/// Essentially it concatenates V1 and V2, shifts right by some number of
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
                                          SDValue V1, SDValue V2,
                                          ArrayRef<int> Mask,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
  assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
         "Only 32-bit and 64-bit elements are supported!");

  // 128/256-bit vectors are only supported with VLX.
  assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
         && "VLX required for 128/256-bit vectors");

  SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();

  return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
                     DAG.getConstant(Rotation, DL, MVT::i8));
}
/// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
///
/// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
/// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
/// matches elements from one of the input vectors shuffled to the left or
/// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
///
/// PSHL : (little-endian) left bit shift.
/// [ zz, 0, zz,  2 ]
/// [ -1, 4, zz, -1 ]
/// PSRL : (little-endian) right bit shift.
/// [  1, zz,  3, zz]
/// [ -1, -1,  7, zz]
/// PSLLDQ : (little-endian) left byte shift
/// [ zz,  0,  1,  2,  3,  4,  5,  6]
/// [ zz, zz, -1, -1,  2,  3,  4, -1]
/// [ zz, zz, zz, zz, zz, zz, -1,  1]
/// PSRLDQ : (little-endian) right byte shift
/// [  5, 6,  7, zz, zz, zz, zz, zz]
/// [ -1, 5,  6,  7, zz, zz, zz, zz]
/// [  1, 2, -1, -1, -1, -1, zz, zz]
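///
/// For example (illustrative): for the v4i32 mask [ zz, 0, zz, 2 ], doubling
/// the element width to i64 turns it into a shift of each 64-bit element left
/// by 32 bits, so the match is VSHLI on v2i64 with ShiftAmt = 32.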
static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                     unsigned ScalarSizeInBits,
                                     ArrayRef<int> Mask, int MaskOffset,
                                     const APInt &Zeroable,
                                     const X86Subtarget &Subtarget) {
  int Size = Mask.size();
  unsigned SizeInBits = Size * ScalarSizeInBits;

  auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };

  auto MatchShift = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i != Size; i += Scale) {
      unsigned Pos = Left ? i + Shift : i;
      unsigned Low = Left ? i : i + Shift;
      unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }

    int ShiftEltBits = ScalarSizeInBits * Scale;
    bool ByteShift = ShiftEltBits > 64;
    Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
                  : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
    int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);

    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
    Scale = ByteShift ? Scale / 2 : Scale;

    // We need to round trip through the appropriate type for the shift.
    MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
    ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
                        : MVT::getVectorVT(ShiftSVT, Size / Scale);
    return (int)ShiftAmt;
  };

  // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
  // keep doubling the size of the integer elements up to that. We can
  // then shift the elements of the integer vector by whole multiples of
  // their width within the elements of the larger integer vector. Test each
  // multiple to see if we can find a match with the moved element indices
  // and that the shifted in elements are all zeroable.
  unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
  for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
    for (int Shift = 1; Shift != Scale; ++Shift)
      for (bool Left : {true, false})
        if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
                                         const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
  int Size = Mask.size();
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");

  MVT ShiftVT;
  SDValue V = V1;
  unsigned Opcode;

  // Try to match shuffle against V1 shift.
  int ShiftAmt = matchVectorShuffleAsShift(
      ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);

  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();

  assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
         "Illegal integer vector type");
  V = DAG.getBitcast(ShiftVT, V);
  V = DAG.getNode(Opcode, DL, ShiftVT, V,
                  DAG.getConstant(ShiftAmt, DL, MVT::i8));
  return DAG.getBitcast(VT, V);
}
// EXTRQ: Extract Len elements from lower half of source, starting at Idx.
// Remainder of lower half result is zero and upper half is all undef.
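// For example (illustrative): a v8i16 extraction of Len = 3 elements
// starting at Idx = 2 encodes as BitLen = 3 * 16 = 48 and BitIdx = 2 * 16 =
// 32, each masked to 6 bits as the instruction encoding requires.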
static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
                                      ArrayRef<int> Mask, uint64_t &BitLen,
                                      uint64_t &BitIdx, const APInt &Zeroable) {
  int Size = Mask.size();
  int HalfSize = Size / 2;
  assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");

  // Upper half must be undefined.
  if (!isUndefInRange(Mask, HalfSize, HalfSize))
    return false;

  // Determine the extraction length from the part of the
  // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
  assert(Len > 0 && "Zeroable shuffle mask");

  // Attempt to match first Len sequential elements from the lower half.
  SDValue Src;
  int Idx = -1;
  for (int i = 0; i != Len; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
    SDValue &V = (M < Size ? V1 : V2);
    M = M % Size;

    // The extracted elements must start at a valid index and all mask
    // elements must be in the lower half.
    if (i > M || M >= HalfSize)
      return false;

    if (Idx < 0 || (Src == V && Idx == (M - i))) {
      Src = V;
      Idx = M - i;
      continue;
    }
    return false;
  }

  if (!Src || Idx < 0)
    return false;

  assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
  BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
  BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
  V1 = Src;
  return true;
}
9410 // INSERTQ: Extract lowest Len elements from lower half of second source and
9411 // insert over first source, starting at Idx.
9412 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
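// As a sketch on v8i16: mask <0, 8, 9, 3, u, u, u, u> matches with Idx = 1
// and Len = 2, producing { A[0], B[0], B[1], A[3], undef.. }, i.e.
// BitIdx = 16 and BitLen = 32.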
9413 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9414 ArrayRef<int> Mask, uint64_t &BitLen,
9415 uint64_t &BitIdx) {
9416 int Size = Mask.size();
9417 int HalfSize = Size / 2;
9418 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9420 // Upper half must be undefined.
9421 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9422 return false;
9424 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9425 SDValue Base;
9427 // Attempt to match first source from mask before insertion point.
9428 if (isUndefInRange(Mask, 0, Idx)) {
9429 /* EMPTY */
9430 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9431 Base = V1;
9432 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9433 Base = V2;
9434 } else {
9435 continue;
9436 }
9438 // Extend the extraction length looking to match both the insertion of
9439 // the second source and the remaining elements of the first.
9440 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9441 SDValue Insert;
9442 int Len = Hi - Idx;
9444 // Match the extraction of the inserted source elements.
9445 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9446 Insert = V1;
9447 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9448 Insert = V2;
9449 } else {
9450 continue;
9451 }
9453 // Match the remaining elements of the lower half.
9454 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9455 /* EMPTY */
9456 } else if ((!Base || (Base == V1)) &&
9457 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9458 Base = V1;
9459 } else if ((!Base || (Base == V2)) &&
9460 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9461 Size + Hi)) {
9462 Base = V2;
9463 } else {
9464 continue;
9465 }
9467 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9468 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9469 V1 = Base;
9470 V2 = Insert;
9471 return true;
9472 }
9473 }
9475 return false;
9476 }
9478 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9479 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9480 SDValue V2, ArrayRef<int> Mask,
9481 const APInt &Zeroable,
9482 SelectionDAG &DAG) {
9483 uint64_t BitLen, BitIdx;
9484 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9485 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9486 DAG.getConstant(BitLen, DL, MVT::i8),
9487 DAG.getConstant(BitIdx, DL, MVT::i8));
9489 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9490 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9491 V2 ? V2 : DAG.getUNDEF(VT),
9492 DAG.getConstant(BitLen, DL, MVT::i8),
9493 DAG.getConstant(BitIdx, DL, MVT::i8));
9495 return SDValue();
9496 }
9498 /// \brief Lower a vector shuffle as a zero or any extension.
9499 ///
9500 /// Given a specific number of elements, element bit width, and extension
9501 /// stride, produce either a zero or any extension based on the available
9502 /// features of the subtarget. The extended elements are consecutive and
9503 /// can begin at an offset element index in the input; to avoid excess
9504 /// shuffling the offset must either be in the bottom lane or at the start
9505 /// of a higher lane. All extended elements must come from the same input
9506 /// vector.
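/// As a sketch: for v8i16 with Scale = 2 and Offset = 0, the mask being
/// matched looks like <0, zz, 1, zz, 2, zz, 3, zz> (zz = zeroable, or undef
/// for an any-extend); with SSE4.1 this should become a single
/// PMOVZXWD-style extension.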
9507 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9508 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9509 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9510 assert(Scale > 1 && "Need a scale to extend.");
9511 int EltBits = VT.getScalarSizeInBits();
9512 int NumElements = VT.getVectorNumElements();
9513 int NumEltsPerLane = 128 / EltBits;
9514 int OffsetLane = Offset / NumEltsPerLane;
9515 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9516 "Only 8, 16, and 32 bit elements can be extended.");
9517 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9518 assert(0 <= Offset && "Extension offset must be positive.");
9519 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9520 "Extension offset must be in the first lane or start an upper lane.");
9522 // Check that an index is in the same lane as the base offset.
9523 auto SafeOffset = [&](int Idx) {
9524 return OffsetLane == (Idx / NumEltsPerLane);
9525 };
9527 // Shift along an input so that the offset base moves to the first element.
9528 auto ShuffleOffset = [&](SDValue V) {
9529 if (!Offset)
9530 return V;
9532 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9533 for (int i = 0; i * Scale < NumElements; ++i) {
9534 int SrcIdx = i + Offset;
9535 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9536 }
9537 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9538 };
9540 // Found a valid zext mask! Try various lowering strategies based on the
9541 // input type and available ISA extensions.
9542 if (Subtarget.hasSSE41()) {
9543 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9544 // PUNPCK will catch this in a later shuffle match.
9545 if (Offset && Scale == 2 && VT.is128BitVector())
9546 return SDValue();
9547 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9548 NumElements / Scale);
9549 InputV = ShuffleOffset(InputV);
9550 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9551 return DAG.getBitcast(VT, InputV);
9552 }
9554 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9556 // For any extends we can cheat for larger element sizes and use shuffle
9557 // instructions that can fold with a load and/or copy.
9558 if (AnyExt && EltBits == 32) {
9559 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9560 -1};
9561 return DAG.getBitcast(
9562 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9563 DAG.getBitcast(MVT::v4i32, InputV),
9564 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9565 }
9566 if (AnyExt && EltBits == 16 && Scale > 2) {
9567 int PSHUFDMask[4] = {Offset / 2, -1,
9568 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9569 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9570 DAG.getBitcast(MVT::v4i32, InputV),
9571 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9572 int PSHUFWMask[4] = {1, -1, -1, -1};
9573 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9574 return DAG.getBitcast(
9575 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9576 DAG.getBitcast(MVT::v8i16, InputV),
9577 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9578 }
9580 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes
9581 // to 64-bit integers.
9582 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9583 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9584 assert(VT.is128BitVector() && "Unexpected vector width!");
9586 int LoIdx = Offset * EltBits;
9587 SDValue Lo = DAG.getBitcast(
9588 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9589 DAG.getConstant(EltBits, DL, MVT::i8),
9590 DAG.getConstant(LoIdx, DL, MVT::i8)));
9592 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9593 !SafeOffset(Offset + 1))
9594 return DAG.getBitcast(VT, Lo);
9596 int HiIdx = (Offset + 1) * EltBits;
9597 SDValue Hi = DAG.getBitcast(
9598 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9599 DAG.getConstant(EltBits, DL, MVT::i8),
9600 DAG.getConstant(HiIdx, DL, MVT::i8)));
9601 return DAG.getBitcast(VT,
9602 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9603 }
9605 // If this would require more than 2 unpack instructions to expand, use
9606 // pshufb when available. We can only use more than 2 unpack instructions
9607 // when zero extending i8 elements which also makes it easier to use pshufb.
9608 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9609 assert(NumElements == 16 && "Unexpected byte vector width!");
9610 SDValue PSHUFBMask[16];
9611 for (int i = 0; i < 16; ++i) {
9612 int Idx = Offset + (i / Scale);
9613 PSHUFBMask[i] = DAG.getConstant(
9614 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9615 }
9616 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9617 return DAG.getBitcast(
9618 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9619 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9620 }
9622 // If we are extending from an offset, ensure we start on a boundary that
9623 // we can unpack from.
9624 int AlignToUnpack = Offset % (NumElements / Scale);
9625 if (AlignToUnpack) {
9626 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9627 for (int i = AlignToUnpack; i < NumElements; ++i)
9628 ShMask[i - AlignToUnpack] = i;
9629 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9630 Offset -= AlignToUnpack;
9631 }
9633 // Otherwise emit a sequence of unpacks.
9634 do {
9635 unsigned UnpackLoHi = X86ISD::UNPCKL;
9636 if (Offset >= (NumElements / 2)) {
9637 UnpackLoHi = X86ISD::UNPCKH;
9638 Offset -= (NumElements / 2);
9639 }
9641 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9642 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9643 : getZeroVector(InputVT, Subtarget, DAG, DL);
9644 InputV = DAG.getBitcast(InputVT, InputV);
9645 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9646 Scale /= 2;
9647 EltBits *= 2;
9648 NumElements /= 2;
9649 } while (Scale > 1);
9650 return DAG.getBitcast(VT, InputV);
9651 }
9653 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9655 /// This routine will try to do everything in its power to cleverly lower
9656 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9657 /// check for the profitability of this lowering, it tries to aggressively
9658 /// match this pattern. It will use all of the micro-architectural details it
9659 /// can to emit an efficient lowering. It handles both blends with all-zero
9660 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9661 /// masking out later).
9663 /// The reason we have dedicated lowering for zext-style shuffles is that they
9664 /// are both incredibly common and often quite performance sensitive.
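/// As a sketch of the kind of pattern this matches: a v16i8 mask
/// <0, zz, 1, zz, 2, zz, ..., 7, zz> zero-extends the low 8 bytes to
/// 8 x i16 and, with SSE4.1, should become a single PMOVZXBW.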
9665 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9666 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9667 const APInt &Zeroable, const X86Subtarget &Subtarget,
9668 SelectionDAG &DAG) {
9669 int Bits = VT.getSizeInBits();
9670 int NumLanes = Bits / 128;
9671 int NumElements = VT.getVectorNumElements();
9672 int NumEltsPerLane = NumElements / NumLanes;
9673 assert(VT.getScalarSizeInBits() <= 32 &&
9674 "Exceeds 32-bit integer zero extension limit");
9675 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9677 // Define a helper function to check a particular ext-scale and lower to it if
9678 // valid.
9679 auto Lower = [&](int Scale) -> SDValue {
9680 SDValue InputV;
9681 bool AnyExt = true;
9682 int Offset = 0;
9683 int Matches = 0;
9684 for (int i = 0; i < NumElements; ++i) {
9685 int M = Mask[i];
9686 if (M < 0)
9687 continue; // Valid anywhere but doesn't tell us anything.
9688 if (i % Scale != 0) {
9689 // Each of the extended elements needs to be zeroable.
9690 if (!Zeroable[i])
9691 return SDValue();
9693 // We are no longer in the any-extend case.
9694 AnyExt = false;
9695 continue;
9696 }
9698 // Each of the base elements needs to be consecutive indices into the
9699 // same input vector.
9700 SDValue V = M < NumElements ? V1 : V2;
9701 M = M % NumElements;
9702 if (!InputV) {
9703 InputV = V;
9704 Offset = M - (i / Scale);
9705 } else if (InputV != V)
9706 return SDValue(); // Flip-flopping inputs.
9708 // Offset must start in the lowest 128-bit lane or at the start of an
9709 // upper lane.
9710 // FIXME: Is it ever worth allowing a negative base offset?
9711 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9712 (Offset % NumEltsPerLane) == 0))
9713 return SDValue();
9715 // If we are offsetting, all referenced entries must come from the same
9716 // lane.
9717 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9718 return SDValue();
9720 if ((M % NumElements) != (Offset + (i / Scale)))
9721 return SDValue(); // Non-consecutive strided elements.
9722 Matches++;
9723 }
9725 // If we fail to find an input, we have a zero-shuffle which should always
9726 // have already been handled.
9727 // FIXME: Maybe handle this here in case during blending we end up with one?
9728 if (!InputV)
9729 return SDValue();
9731 // If we are offsetting, don't extend if we only match a single input, we
9732 // can always do better by using a basic PSHUF or PUNPCK.
9733 if (Offset != 0 && Matches < 2)
9734 return SDValue();
9736 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9737 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9738 };
9740 // The widest scale possible for extending is to a 64-bit integer.
9741 assert(Bits % 64 == 0 &&
9742 "The number of bits in a vector must be divisible by 64 on x86!");
9743 int NumExtElements = Bits / 64;
9745 // Each iteration, try extending the elements half as much, but into twice as
9746 // many elements.
9747 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9748 assert(NumElements % NumExtElements == 0 &&
9749 "The input vector size must be divisible by the extended size.");
9750 if (SDValue V = Lower(NumElements / NumExtElements))
9751 return V;
9752 }
9754 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9755 if (Bits != 128)
9756 return SDValue();
9758 // Returns one of the source operands if the shuffle can be reduced to a
9759 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
9760 auto CanZExtLowHalf = [&]() {
9761 for (int i = NumElements / 2; i != NumElements; ++i)
9762 if (!Zeroable[i])
9763 return SDValue();
9764 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9765 return V1;
9766 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9767 return V2;
9768 return SDValue();
9769 };
9771 if (SDValue V = CanZExtLowHalf()) {
9772 V = DAG.getBitcast(MVT::v2i64, V);
9773 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9774 return DAG.getBitcast(VT, V);
9775 }
9777 // No viable ext lowering found.
9778 return SDValue();
9779 }
9781 /// \brief Try to get a scalar value for a specific element of a vector.
9783 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9784 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9785 SelectionDAG &DAG) {
9786 MVT VT = V.getSimpleValueType();
9787 MVT EltVT = VT.getVectorElementType();
9788 V = peekThroughBitcasts(V);
9790 // If the bitcasts shift the element size, we can't extract an equivalent
9791 // element from it.
9792 MVT NewVT = V.getSimpleValueType();
9793 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9794 return SDValue();
9796 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9797 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9798 // Ensure the scalar operand is the same size as the destination.
9799 // FIXME: Add support for scalar truncation where possible.
9800 SDValue S = V.getOperand(Idx);
9801 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9802 return DAG.getBitcast(EltVT, S);
9803 }
9805 return SDValue();
9806 }
9808 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9810 /// This is particularly important because the set of instructions varies
9811 /// significantly based on whether the operand is a load or not.
9812 static bool isShuffleFoldableLoad(SDValue V) {
9813 V = peekThroughBitcasts(V);
9814 return ISD::isNON_EXTLoad(V.getNode());
9815 }
9817 /// \brief Try to lower insertion of a single element into a zero vector.
9819 /// This is a common pattern for which we have especially efficient patterns
9820 /// to lower across all subtarget feature sets.
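/// A typical case, as a sketch: a v4f32 mask <4, 1, 2, 3> where V2's lane 0
/// comes from a scalar load can lower to MOVSS, folding the load and the
/// insertion into a single instruction.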
9821 static SDValue lowerVectorShuffleAsElementInsertion(
9822 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9823 const APInt &Zeroable, const X86Subtarget &Subtarget,
9824 SelectionDAG &DAG) {
9825 MVT ExtVT = VT;
9826 MVT EltVT = VT.getVectorElementType();
9828 int V2Index =
9829 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9830 Mask.begin();
9831 bool IsV1Zeroable = true;
9832 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9833 if (i != V2Index && !Zeroable[i]) {
9834 IsV1Zeroable = false;
9835 break;
9836 }
9838 // Check for a single input from a SCALAR_TO_VECTOR node.
9839 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9840 // all the smarts here sunk into that routine. However, the current
9841 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9842 // vector shuffle lowering is dead.
9843 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9844 DAG);
9845 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9846 // We need to zext the scalar if it is smaller than an i32.
9847 V2S = DAG.getBitcast(EltVT, V2S);
9848 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9849 // Using zext to expand a narrow element won't work for non-zero
9850 // insertions.
9851 if (!IsV1Zeroable)
9852 return SDValue();
9854 // Zero-extend directly to i32.
9855 ExtVT = MVT::v4i32;
9856 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9857 }
9858 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9859 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9860 EltVT == MVT::i16) {
9861 // Either not inserting from the low element of the input or the input
9862 // element size is too small to use VZEXT_MOVL to clear the high bits.
9863 return SDValue();
9864 }
9866 if (!IsV1Zeroable) {
9867 // If V1 can't be treated as a zero vector we have fewer options to lower
9868 // this. We can't support integer vectors or non-zero targets cheaply, and
9869 // the V1 elements can't be permuted in any way.
9870 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9871 if (!VT.isFloatingPoint() || V2Index != 0)
9872 return SDValue();
9873 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9874 V1Mask[V2Index] = -1;
9875 if (!isNoopShuffleMask(V1Mask))
9876 return SDValue();
9877 // This is essentially a special case blend operation, but if we have
9878 // general purpose blend operations, they are always faster. Bail and let
9879 // the rest of the lowering handle these as blends.
9880 if (Subtarget.hasSSE41())
9881 return SDValue();
9883 // Otherwise, use MOVSD or MOVSS.
9884 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9885 "Only two types of floating point element types to handle!");
9886 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9887 ExtVT, V1, V2);
9888 }
9890 // This lowering only works for the low element with floating point vectors.
9891 if (VT.isFloatingPoint() && V2Index != 0)
9892 return SDValue();
9894 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9895 if (ExtVT != VT)
9896 V2 = DAG.getBitcast(VT, V2);
9899 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9900 // the desired position. Otherwise it is more efficient to do a vector
9901 // shift left. We know that we can do a vector shift left because all
9902 // the inputs are zero.
9903 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9904 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9905 V2Shuffle[V2Index] = 0;
9906 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9907 } else {
9908 V2 = DAG.getBitcast(MVT::v16i8, V2);
9909 V2 = DAG.getNode(
9910 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9911 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9912 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9913 DAG.getDataLayout(), VT)));
9914 V2 = DAG.getBitcast(VT, V2);
9915 }
9916 return V2;
9917 }
9920 /// Try to lower broadcast of a single - truncated - integer element,
9921 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9923 /// This assumes we have AVX2.
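/// As a sketch: broadcasting the i8 element at index 1 of a scalar feeding
/// a v4i32 build_vector becomes roughly
/// (VBROADCAST (TRUNCATE (SRL scalar, 8))), where the SRL exposes the
/// requested byte before truncation.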
9924 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9925 SDValue V0, int BroadcastIdx,
9926 const X86Subtarget &Subtarget,
9927 SelectionDAG &DAG) {
9928 assert(Subtarget.hasAVX2() &&
9929 "We can only lower integer broadcasts with AVX2!");
9931 EVT EltVT = VT.getVectorElementType();
9932 EVT V0VT = V0.getValueType();
9934 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9935 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9937 EVT V0EltVT = V0VT.getVectorElementType();
9938 if (!V0EltVT.isInteger())
9939 return SDValue();
9941 const unsigned EltSize = EltVT.getSizeInBits();
9942 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9944 // This is only a truncation if the original element type is larger.
9945 if (V0EltSize <= EltSize)
9946 return SDValue();
9948 assert(((V0EltSize % EltSize) == 0) &&
9949 "Scalar type sizes must all be powers of 2 on x86!");
9951 const unsigned V0Opc = V0.getOpcode();
9952 const unsigned Scale = V0EltSize / EltSize;
9953 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9955 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9956 V0Opc != ISD::BUILD_VECTOR)
9957 return SDValue();
9959 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9961 // If we're extracting non-least-significant bits, shift so we can truncate.
9962 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9963 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9964 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9965 if (const int OffsetIdx = BroadcastIdx % Scale)
9966 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9967 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9969 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9970 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9971 }
9973 /// \brief Try to lower broadcast of a single element.
9975 /// For convenience, this code also bundles all of the subtarget feature set
9976 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9977 /// a convenient way to factor it out.
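/// E.g. a v4f32 mask <1, 1, 1, 1> is a broadcast of element 1: AVX can use
/// VBROADCASTSS from a load, while broadcasting from a register (and any
/// integer broadcast) needs AVX2; a rough summary of the checks below.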
9978 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9979 SDValue V1, SDValue V2,
9980 ArrayRef<int> Mask,
9981 const X86Subtarget &Subtarget,
9982 SelectionDAG &DAG) {
9983 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
9984 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
9985 (Subtarget.hasAVX2() && VT.isInteger())))
9986 return SDValue();
9988 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
9989 // we can only broadcast from a register with AVX2.
9990 unsigned NumElts = Mask.size();
9991 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
9992 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
9994 // Check that the mask is a broadcast.
9995 int BroadcastIdx = -1;
9996 for (int i = 0; i != (int)NumElts; ++i) {
9997 SmallVector<int, 8> BroadcastMask(NumElts, i);
9998 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
9999 BroadcastIdx = i;
10000 break;
10001 }
10002 }
10004 if (BroadcastIdx < 0)
10005 return SDValue();
10006 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10007 "a sorted mask where the broadcast "
10008 "comes from V1.");
10010 // Go up the chain of (vector) values to find a scalar load that we can
10011 // combine with the broadcast.
10012 SDValue V = V1;
10013 for (;;) {
10014 switch (V.getOpcode()) {
10015 case ISD::BITCAST: {
10016 SDValue VSrc = V.getOperand(0);
10017 MVT SrcVT = VSrc.getSimpleValueType();
10018 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
10019 break;
10020 V = VSrc;
10021 continue;
10022 }
10023 case ISD::CONCAT_VECTORS: {
10024 int OperandSize = Mask.size() / V.getNumOperands();
10025 V = V.getOperand(BroadcastIdx / OperandSize);
10026 BroadcastIdx %= OperandSize;
10027 continue;
10028 }
10029 case ISD::INSERT_SUBVECTOR: {
10030 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10031 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10032 if (!ConstantIdx)
10033 break;
10035 int BeginIdx = (int)ConstantIdx->getZExtValue();
10036 int EndIdx =
10037 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10038 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10039 BroadcastIdx -= BeginIdx;
10040 V = VInner;
10041 } else {
10042 V = VOuter;
10043 }
10044 continue;
10045 }
10046 }
10047 break;
10048 }
10050 // Check if this is a broadcast of a scalar. We special case lowering
10051 // for scalars so that we can more effectively fold with loads.
10052 // First, look through bitcast: if the original value has a larger element
10053 // type than the shuffle, the broadcast element is in essence truncated.
10054 // Make that explicit to ease folding.
10055 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10056 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10057 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10058 return TruncBroadcast;
10060 MVT BroadcastVT = VT;
10062 // Peek through any bitcast (only useful for loads).
10063 SDValue BC = peekThroughBitcasts(V);
10065 // Also check the simpler case, where we can directly reuse the scalar.
10066 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10067 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10068 V = V.getOperand(BroadcastIdx);
10070 // If we can't broadcast from a register, check that the input is a load.
10071 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10072 return SDValue();
10073 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10074 // 32-bit targets need to load i64 as an f64 and then bitcast the result.
10075 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10076 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10077 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
10078 }
10080 // If we are broadcasting a load that is only used by the shuffle
10081 // then we can reduce the vector load to the broadcasted scalar load.
10082 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10083 SDValue BaseAddr = Ld->getOperand(1);
10084 EVT SVT = BroadcastVT.getScalarType();
10085 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10086 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10087 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10088 DAG.getMachineFunction().getMachineMemOperand(
10089 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10090 DAG.makeEquivalentMemoryOrdering(Ld, V);
10091 } else if (!BroadcastFromReg) {
10092 // We can't broadcast from a vector register.
10093 return SDValue();
10094 } else if (BroadcastIdx != 0) {
10095 // We can only broadcast from the zero-element of a vector register,
10096 // but it can be advantageous to broadcast from the zero-element of a
10097 // subvector.
10098 if (!VT.is256BitVector() && !VT.is512BitVector())
10099 return SDValue();
10101 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10102 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10103 return SDValue();
10105 // Only broadcast the zero-element of a 128-bit subvector.
10106 unsigned EltSize = VT.getScalarSizeInBits();
10107 if (((BroadcastIdx * EltSize) % 128) != 0)
10108 return SDValue();
10110 // The shuffle input might have been a bitcast we looked through; look at
10111 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10112 // later bitcast it to BroadcastVT.
10113 MVT SrcVT = V.getSimpleValueType();
10114 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10115 "Unexpected vector element size");
10116 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10117 "Unexpected vector size");
10119 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10120 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10121 DAG.getIntPtrConstant(BroadcastIdx, DL));
10122 }
10124 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10125 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10126 DAG.getBitcast(MVT::f64, V));
10128 // Bitcast back to the same scalar type as BroadcastVT.
10129 MVT SrcVT = V.getSimpleValueType();
10130 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10131 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10132 "Unexpected vector element size");
10133 if (SrcVT.isVector()) {
10134 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10135 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10136 } else {
10137 SrcVT = BroadcastVT.getScalarType();
10138 }
10139 V = DAG.getBitcast(SrcVT, V);
10140 }
10142 // 32-bit targets need to load i64 as an f64 and then bitcast the result.
10143 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10144 V = DAG.getBitcast(MVT::f64, V);
10145 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10146 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10147 }
10149 // We only support broadcasting from 128-bit vectors to minimize the
10150 // number of patterns we need to deal with in isel. So extract down to
10151 // 128-bits.
10152 if (SrcVT.getSizeInBits() > 128)
10153 V = extract128BitVector(V, 0, DAG, DL);
10155 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10156 }
10158 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10159 // INSERTPS when the V1 elements are already in the correct locations
10160 // because otherwise we can just always use two SHUFPS instructions which
10161 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10162 // perform INSERTPS if a single V1 element is out of place and all V2
10163 // elements are zeroable.
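// As a sketch: a v4f32 mask <0, 5, 2, 3> inserts V2[1] into lane 1 of V1,
// giving InsertPSMask = (1 << 6) | (1 << 4) = 0x50 (source index, dest
// index, empty zero mask).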
10164 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10165 unsigned &InsertPSMask,
10166 const APInt &Zeroable,
10167 ArrayRef<int> Mask,
10168 SelectionDAG &DAG) {
10169 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10170 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10171 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10173 // Attempt to match INSERTPS with one element from VA or VB being
10174 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask
10175 // will be updated with the matched operands and mask.
10176 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10177 ArrayRef<int> CandidateMask) {
10178 unsigned ZMask = 0;
10179 int VADstIndex = -1;
10180 int VBDstIndex = -1;
10181 bool VAUsedInPlace = false;
10183 for (int i = 0; i < 4; ++i) {
10184 // Synthesize a zero mask from the zeroable elements (includes undefs).
10185 if (Zeroable[i]) {
10186 ZMask |= 1 << i;
10187 continue;
10188 }
10190 // Flag if we use any VA inputs in place.
10191 if (i == CandidateMask[i]) {
10192 VAUsedInPlace = true;
10193 continue;
10194 }
10196 // We can only insert a single non-zeroable element.
10197 if (VADstIndex >= 0 || VBDstIndex >= 0)
10198 return false;
10200 if (CandidateMask[i] < 4) {
10201 // VA input out of place for insertion.
10202 VADstIndex = i;
10203 } else {
10204 // VB input for insertion.
10205 VBDstIndex = i;
10206 }
10207 }
10209 // Don't bother if we have no (non-zeroable) element for insertion.
10210 if (VADstIndex < 0 && VBDstIndex < 0)
10211 return false;
10213 // Determine element insertion src/dst indices. The src index is from the
10214 // start of the inserted vector, not the start of the concatenated vector.
10215 unsigned VBSrcIndex = 0;
10216 if (VADstIndex >= 0) {
10217 // If we have a VA input out of place, we use VA as the V2 element
10218 // insertion and don't use the original V2 at all.
10219 VBSrcIndex = CandidateMask[VADstIndex];
10220 VBDstIndex = VADstIndex;
10221 VB = VA;
10222 } else {
10223 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10224 }
10226 // If no V1 inputs are used in place, then the result is created only from
10227 // the zero mask and the V2 insertion - so remove V1 dependency.
10228 if (!VAUsedInPlace)
10229 VA = DAG.getUNDEF(MVT::v4f32);
10231 // Update V1, V2 and InsertPSMask accordingly.
10232 V1 = VA;
10233 V2 = VB;
10235 // Insert the V2 element into the desired position.
10236 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10237 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10238 return true;
10239 };
10241 if (matchAsInsertPS(V1, V2, Mask))
10242 return true;
10244 // Commute and try again.
10245 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10246 ShuffleVectorSDNode::commuteMask(CommutedMask);
10247 if (matchAsInsertPS(V2, V1, CommutedMask))
10248 return true;
10250 return false;
10251 }
10253 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10254 SDValue V2, ArrayRef<int> Mask,
10255 const APInt &Zeroable,
10256 SelectionDAG &DAG) {
10257 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10258 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10260 // Attempt to match the insertps pattern.
10261 unsigned InsertPSMask;
10262 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10263 return SDValue();
10265 // Insert the V2 element into the desired position.
10266 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10267 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10268 }
10270 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10271 /// UNPCK instruction.
10273 /// This specifically targets cases where we end up with alternating between
10274 /// the two inputs, and so can permute them into something that feeds a single
10275 /// UNPCK instruction. Note that this routine only targets integer vectors
10276 /// because for floating point vectors we have a generalized SHUFPS lowering
10277 /// strategy that handles everything that doesn't *exactly* match an unpack,
10278 /// making this clever lowering unnecessary.
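/// A sketch of the idea: a v4i32 mask <0, 2, 4, 6> is not an unpack by
/// itself, but after permuting each input to <0, 2, u, u> the result is a
/// single UNPCKL of the two permuted vectors viewed as v2i64.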
10279 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10280 SDValue V1, SDValue V2,
10281 ArrayRef<int> Mask,
10282 SelectionDAG &DAG) {
10283 assert(!VT.isFloatingPoint() &&
10284 "This routine only supports integer vectors.");
10285 assert(VT.is128BitVector() &&
10286 "This routine only works on 128-bit vectors.");
10287 assert(!V2.isUndef() &&
10288 "This routine should only be used when blending two inputs.");
10289 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10291 int Size = Mask.size();
10293 int NumLoInputs =
10294 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10295 int NumHiInputs =
10296 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10298 bool UnpackLo = NumLoInputs >= NumHiInputs;
10300 auto TryUnpack = [&](int ScalarSize, int Scale) {
10301 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10302 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10304 for (int i = 0; i < Size; ++i) {
10305 if (Mask[i] < 0)
10306 continue;
10308 // Each element of the unpack contains Scale elements from this mask.
10309 int UnpackIdx = i / Scale;
10311 // We only handle the case where V1 feeds the first slots of the unpack.
10312 // We rely on canonicalization to ensure this is the case.
10313 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10314 return SDValue();
10316 // Setup the mask for this input. The indexing is tricky as we have to
10317 // handle the unpack stride.
10318 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10319 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10320 Mask[i] % Size;
10321 }
10323 // If we will have to shuffle both inputs to use the unpack, check whether
10324 // we can just unpack first and shuffle the result. If so, skip this unpack.
10325 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10326 !isNoopShuffleMask(V2Mask))
10327 return SDValue();
10329 // Shuffle the inputs into place.
10330 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10331 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10333 // Cast the inputs to the type we will use to unpack them.
10334 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10335 V1 = DAG.getBitcast(UnpackVT, V1);
10336 V2 = DAG.getBitcast(UnpackVT, V2);
10338 // Unpack the inputs and cast the result back to the desired type.
10339 return DAG.getBitcast(
10340 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10341 UnpackVT, V1, V2));
10342 };
10344 // We try each unpack from the largest to the smallest to try and find one
10345 // that fits this mask.
10346 int OrigScalarSize = VT.getScalarSizeInBits();
10347 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10348 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10349 return Unpack;
10351 // If none of the unpack-rooted lowerings worked (or were profitable) try an
10352 // initial unpack.
10353 if (NumLoInputs == 0 || NumHiInputs == 0) {
10354 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10355 "We have to have *some* inputs!");
10356 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10358 // FIXME: We could consider the total complexity of the permute of each
10359 // possible unpacking. Or at the least we should consider how many
10360 // half-crossings are created.
10361 // FIXME: We could consider commuting the unpacks.
10363 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10364 for (int i = 0; i < Size; ++i) {
10365 if (Mask[i] < 0)
10366 continue;
10368 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10370 PermMask[i] =
10371 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10372 }
10373 return DAG.getVectorShuffle(
10374 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10375 DL, VT, V1, V2),
10376 DAG.getUNDEF(VT), PermMask);
10377 }
10379 return SDValue();
10380 }
10382 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10384 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10385 /// support for floating point shuffles but not integer shuffles. These
10386 /// instructions will incur a domain crossing penalty on some chips though so
10387 /// it is better to avoid lowering through this for integer vectors where
10388 /// possible.
10389 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10390 const APInt &Zeroable,
10391 SDValue V1, SDValue V2,
10392 const X86Subtarget &Subtarget,
10393 SelectionDAG &DAG) {
10394 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10395 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10396 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10398 if (V2.isUndef()) {
10399 // Check for being able to broadcast a single element.
10400 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10401 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10402 return Broadcast;
10404 // Straight shuffle of a single input vector. Simulate this by using the
10405 // single input as both of the "inputs" to this instruction.
10406 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10408 if (Subtarget.hasAVX()) {
10409 // If we have AVX, we can use VPERMILPS which will allow folding a load
10410 // into the shuffle.
10411 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10412 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10413 }
10415 return DAG.getNode(
10416 X86ISD::SHUFP, DL, MVT::v2f64,
10417 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10418 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10419 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10420 }
10421 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10422 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10424 // If we have a single input, insert that into V1 if we can do so cheaply.
10425 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10426 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10427 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10428 return Insertion;
10429 // Try inverting the insertion since for v2 masks it is easy to do and we
10430 // can't reliably sort the mask one way or the other.
10431 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10432 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10433 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10434 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10435 return Insertion;
10436 }
10438 // Try to use one of the special instruction patterns to handle two common
10439 // blend patterns if a zero-blend above didn't work.
10440 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10441 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10442 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10443 // We can either use a special instruction to load over the low double or
10444 // to move just the low double.
10445 return DAG.getNode(
10446 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10447 DL, MVT::v2f64, V2,
10448 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10450 if (Subtarget.hasSSE41())
10451 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10452 Zeroable, Subtarget, DAG))
10453 return Blend;
10455 // Use dedicated unpack instructions for masks that match their pattern.
10456 if (SDValue V =
10457 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10458 return V;
10460 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10461 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10462 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10463 }
10465 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10467 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10468 /// the integer unit to minimize domain crossing penalties. However, for blends
10469 /// it falls back to the floating point shuffle operation with appropriate bit
10470 /// casting.
10471 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10472 const APInt &Zeroable,
10473 SDValue V1, SDValue V2,
10474 const X86Subtarget &Subtarget,
10475 SelectionDAG &DAG) {
10476 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10477 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10478 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10480 if (V2.isUndef()) {
10481 // Check for being able to broadcast a single element.
10482 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10483 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10484 return Broadcast;
10486 // Straight shuffle of a single input vector. For everything from SSE2
10487 // onward this has a single fast instruction with no scary immediates.
10488 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10489 V1 = DAG.getBitcast(MVT::v4i32, V1);
10490 int WidenedMask[4] = {
10491 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10492 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10493 return DAG.getBitcast(
10494 MVT::v2i64,
10495 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10496 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10497 }
10498 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10499 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10500 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10501 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10503 // If we have a blend of two same-type PACKUS operations and the blend aligns
10504 // with the low and high halves, we can just merge the PACKUS operations.
10505 // This is particularly important as it lets us merge shuffles that this
10506 // routine itself creates.
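// For example, as a sketch: if V1 = PACKUS(a, b) and V2 = PACKUS(c, d), the
// v2i64 mask <0, 3> selects the low half of V1 and the high half of V2 and
// can be rewritten as a single PACKUS(a, d).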
10507 auto GetPackNode = [](SDValue V) {
10508 V = peekThroughBitcasts(V);
10509 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10511 if (SDValue V1Pack = GetPackNode(V1))
10512 if (SDValue V2Pack = GetPackNode(V2)) {
10513 EVT PackVT = V1Pack.getValueType();
10514 if (PackVT == V2Pack.getValueType())
10515 return DAG.getBitcast(MVT::v2i64,
10516 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10517 Mask[0] == 0 ? V1Pack.getOperand(0)
10518 : V1Pack.getOperand(1),
10519 Mask[1] == 2 ? V2Pack.getOperand(0)
10520 : V2Pack.getOperand(1)));
10523 // Try to use shift instructions.
10524 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10525 Zeroable, Subtarget, DAG))
10526 return Shift;
10528 // When loading a scalar and then shuffling it into a vector we can often do
10529 // the insertion cheaply.
10530 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10531 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10532 return Insertion;
10533 // Try inverting the insertion since for v2 masks it is easy to do and we
10534 // can't reliably sort the mask one way or the other.
10535 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10536 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10537 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10538 return Insertion;
10540 // We have different paths for blend lowering, but they all must use the
10541 // *exact* same predicate.
10542 bool IsBlendSupported = Subtarget.hasSSE41();
10543 if (IsBlendSupported)
10544 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10545 Zeroable, Subtarget, DAG))
10546 return Blend;
10548 // Use dedicated unpack instructions for masks that match their pattern.
10549 if (SDValue V =
10550 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10551 return V;
10553 // Try to use byte rotation instructions.
10554 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10555 if (Subtarget.hasSSSE3())
10556 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10557 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10558 return Rotate;
10560 // If we have direct support for blends, we should lower by decomposing into
10561 // a permute. That will be faster than the domain cross.
10562 if (IsBlendSupported)
10563 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10564 Mask, DAG);
10566 // We implement this with SHUFPD which is pretty lame because it will likely
10567 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10568 // However, all the alternatives are still more cycles and newer chips don't
10569 // have this problem. It would be really nice if x86 had better shuffles here.
10570 V1 = DAG.getBitcast(MVT::v2f64, V1);
10571 V2 = DAG.getBitcast(MVT::v2f64, V2);
10572 return DAG.getBitcast(MVT::v2i64,
10573 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10574 }
10576 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10578 /// This is used to disable more specialized lowerings when the shufps lowering
10579 /// will happen to be efficient.
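/// E.g. the mask <0, 1, 4, 5> needs only SHUFPS $0x44 (low half from V1,
/// high half from V2), while <0, 4, 2, 6> mixes both inputs within each
/// half and fails this test; a rough illustration of the check below.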
10580 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10581 // This routine only handles 128-bit shufps.
10582 assert(Mask.size() == 4 && "Unsupported mask size!");
10583 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10584 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10585 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10586 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10588 // To lower with a single SHUFPS we need to have the low half and high half
10589 // each requiring a single input.
10590 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10591 return false;
10592 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10593 return false;
10595 return true;
10596 }
10598 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10600 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10601 /// It makes no assumptions about whether this is the *best* lowering, it simply
10602 /// uses it.
10603 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10604 ArrayRef<int> Mask, SDValue V1,
10605 SDValue V2, SelectionDAG &DAG) {
10606 SDValue LowV = V1, HighV = V2;
10607 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10609 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10611 if (NumV2Elements == 1) {
10612 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10614 // Compute the index adjacent to V2Index and in the same half by toggling
10615 // the low bit.
10616 int V2AdjIndex = V2Index ^ 1;
10618 if (Mask[V2AdjIndex] < 0) {
10619 // Handles all the cases where we have a single V2 element and an undef.
10620 // This will only ever happen in the high lanes because we commute the
10621 // vector otherwise.
10622 if (V2Index < 2)
10623 std::swap(LowV, HighV);
10624 NewMask[V2Index] -= 4;
10625 } else {
10626 // Handle the case where the V2 element ends up adjacent to a V1 element.
10627 // To make this work, blend them together as the first step.
10628 int V1Index = V2AdjIndex;
10629 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10630 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10631 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10633 // Now proceed to reconstruct the final blend as we have the necessary
10634 // high or low half formed.
10636 if (V2Index < 2) {
10637 LowV = V2;
10638 HighV = V1;
10639 } else
10640 HighV = V2;
10641 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10642 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10643 }
10644 } else if (NumV2Elements == 2) {
10645 if (Mask[0] < 4 && Mask[1] < 4) {
10646 // Handle the easy case where we have V1 in the low lanes and V2 in the
10647 // high lanes.
10648 NewMask[2] -= 4;
10649 NewMask[3] -= 4;
10650 } else if (Mask[2] < 4 && Mask[3] < 4) {
10651 // We also handle the reversed case because this utility may get called
10652 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10653 // arrange things in the right direction.
10654 NewMask[0] -= 4;
10655 NewMask[1] -= 4;
10656 HighV = V1;
10657 LowV = V2;
10658 } else {
10659 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10660 // trying to place elements directly, just blend them and set up the final
10661 // shuffle to place them.
10663 // The first two blend mask elements are for V1, the second two are for
10664 // V2.
10665 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10666 Mask[2] < 4 ? Mask[2] : Mask[3],
10667 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10668 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10669 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10670 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10672 // Now we do a normal shuffle of V1 by giving V1 as both operands to
10673 // the shuffle.
10674 LowV = HighV = V1;
10675 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10676 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10677 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10678 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10679 }
10680 }
10681 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10682 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10683 }
10685 /// \brief Lower 4-lane 32-bit floating point shuffles.
10687 /// Uses instructions exclusively from the floating point unit to minimize
10688 /// domain crossing penalties, as these are sufficient to implement all v4f32
10689 /// shuffles.
10690 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10691 const APInt &Zeroable,
10692 SDValue V1, SDValue V2,
10693 const X86Subtarget &Subtarget,
10694 SelectionDAG &DAG) {
10695 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10696 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10697 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10699 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10701 if (NumV2Elements == 0) {
10702 // Check for being able to broadcast a single element.
10703 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10704 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10705 return Broadcast;
10707 // Use even/odd duplicate instructions for masks that match their pattern.
10708 if (Subtarget.hasSSE3()) {
10709 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10710 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10711 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10712 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10713 }
10715 if (Subtarget.hasAVX()) {
10716 // If we have AVX, we can use VPERMILPS which will allow folding a load
10717 // into the shuffle.
10718 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10719 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10720 }
10722 // Otherwise, use a straight shuffle of a single input vector. We pass the
10723 // input vector to both operands to simulate this with a SHUFPS.
10724 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10725 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10726 }
10728 // There are special ways we can lower some single-element blends. However, we
10729 // have custom ways we can lower more complex single-element blends below that
10730 // we defer to if both this and BLENDPS fail to match, so restrict this to
10731 // when the V2 input is targeting element 0 of the mask -- that is the fast
10732 // case here.
10733 if (NumV2Elements == 1 && Mask[0] >= 4)
10734 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10735 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10736 return V;
10738 if (Subtarget.hasSSE41()) {
10739 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10740 Zeroable, Subtarget, DAG))
10741 return Blend;
10743 // Use INSERTPS if we can complete the shuffle efficiently.
10744 if (SDValue V =
10745 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10746 return V;
10748 if (!isSingleSHUFPSMask(Mask))
10749 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10750 DL, MVT::v4f32, V1, V2, Mask, DAG))
10751 return BlendPerm;
10752 }
10754 // Use low/high mov instructions.
10755 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10756 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10757 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10758 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10760 // Use dedicated unpack instructions for masks that match their pattern.
10761 if (SDValue V =
10762 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10763 return V;
10765 // Otherwise fall back to a SHUFPS lowering strategy.
10766 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10767 }
10769 /// \brief Lower 4-lane i32 vector shuffles.
10771 /// We try to handle these with integer-domain shuffles where we can, but for
10772 /// blends we use the floating point domain blend instructions.
10773 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10774 const APInt &Zeroable,
10775 SDValue V1, SDValue V2,
10776 const X86Subtarget &Subtarget,
10777 SelectionDAG &DAG) {
10778 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10779 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10780 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10782 // Whenever we can lower this as a zext, that instruction is strictly faster
10783 // than any alternative. It also allows us to fold memory operands into the
10784 // shuffle in many cases.
10785 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10786 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10787 return ZExt;
10789 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10791 if (NumV2Elements == 0) {
10792 // Check for being able to broadcast a single element.
10793 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10794 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10795 return Broadcast;
10797 // Straight shuffle of a single input vector. For everything from SSE2
10798 // onward this has a single fast instruction with no scary immediates.
10799 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10800 // but we aren't actually going to use the UNPCK instruction because doing
10801 // so prevents folding a load into this instruction or making a copy.
10802 const int UnpackLoMask[] = {0, 0, 1, 1};
10803 const int UnpackHiMask[] = {2, 2, 3, 3};
10804 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10805 Mask = UnpackLoMask;
10806 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10807 Mask = UnpackHiMask;
10809 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10810 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10811 }
10813 // Try to use shift instructions.
10814 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10815 Zeroable, Subtarget, DAG))
10816 return Shift;
10818 // There are special ways we can lower some single-element blends.
10819 if (NumV2Elements == 1)
10820 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10821 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10822 return V;
10824 // We have different paths for blend lowering, but they all must use the
10825 // *exact* same predicate.
10826 bool IsBlendSupported = Subtarget.hasSSE41();
10827 if (IsBlendSupported)
10828 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10829 Zeroable, Subtarget, DAG))
10830 return Blend;
10832 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10833 Zeroable, DAG))
10834 return Masked;
10836 // Use dedicated unpack instructions for masks that match their pattern.
10837 if (SDValue V =
10838 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10839 return V;
10841 // Try to use byte rotation instructions.
10842 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10843 if (Subtarget.hasSSSE3())
10844 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10845 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10846 return Rotate;
10848 // Assume that a single SHUFPS is faster than an alternative sequence of
10849 // multiple instructions (even if the CPU has a domain penalty).
10850 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10851 if (!isSingleSHUFPSMask(Mask)) {
10852 // If we have direct support for blends, we should lower by decomposing into
10853 // a permute. That will be faster than the domain cross.
10854 if (IsBlendSupported)
10855 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10856 Mask, DAG);
10858 // Try to lower by permuting the inputs into an unpack instruction.
10859 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10860 DL, MVT::v4i32, V1, V2, Mask, DAG))
10861 return Unpack;
10862 }
10864 // We implement this with SHUFPS because it can blend from two vectors.
10865 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10866 // up the inputs, bypassing domain shift penalties that we would incur if we
10867 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
10868 // needed.
10869 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10870 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10871 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10872 return DAG.getBitcast(MVT::v4i32, ShufPS);
10873 }
10875 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10876 /// shuffle lowering, and the most complex part.
10878 /// The lowering strategy is to try to form pairs of input lanes which are
10879 /// targeted at the same half of the final vector, and then use a dword shuffle
10880 /// to place them onto the right half, and finally unpack the paired lanes into
10881 /// their final position.
10883 /// The exact breakdown of how to form these dword pairs and align them on the
10884 /// correct sides is really tricky. See the comments within the function for
10885 /// more of the details.
10887 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10888 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10889 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10890 /// vector, form the analogous 128-bit 8-element Mask.
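/// As a sketch: the mask <4, 5, 6, 7, 0, 1, 2, 3> moves whole dword pairs,
/// so it can be done with a single PSHUFD on the vector viewed as v4i32;
/// masks that split pairs across halves need the pairing strategy above.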
10891 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10892 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10893 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10894 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10895 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10897 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10898 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10899 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10901 SmallVector<int, 4> LoInputs;
10902 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10903 std::sort(LoInputs.begin(), LoInputs.end());
10904 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10905 SmallVector<int, 4> HiInputs;
10906 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10907 std::sort(HiInputs.begin(), HiInputs.end());
10908 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10909 int NumLToL =
10910 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10911 int NumHToL = LoInputs.size() - NumLToL;
10913 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10914 int NumHToH = HiInputs.size() - NumLToH;
10915 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10916 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10917 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10918 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
  // If we are splatting two values from one half - one to each half, then
  // we can shuffle that half so each is splatted to a dword, then splat those
  // to their respective halves.
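  // For example, the mask <0,0,0,0, 2,2,2,2> splats word 0 across the low
  // half and word 2 across the high half: PSHUFLW <0,0,2,2> packs the two
  // values into dwords, and PSHUFD <0,0,1,1> then broadcasts each dword to
  // its half.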
  auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
                        int DOffset) {
    int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
    int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
    V = DAG.getNode(ShufWOp, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
    V = DAG.getBitcast(PSHUFDVT, V);
    V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
    return DAG.getBitcast(VT, V);
  };
  if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
    return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
  if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
    return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
  // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
  // such inputs we can swap two of the dwords across the half mark and end up
  // with <=2 inputs to each half in each half. Once there, we can fall through
  // to the generic code below. For example:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
  //
  // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
  // and an existing 2-into-2 on the other half. In this case we may have to
  // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
  // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
  // Fortunately, we don't have to handle anything but a 2-into-2 pattern
  // because any other situation (including a 3-into-1 or 1-into-3 in the other
  // half than the one we target for fixing) will be fixed when we re-enter this
  // path. We will also combine away any sequence of PSHUFD instructions that
  // result into a single instruction. Here is an example of the tricky case:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
  //
  // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
  //
  // Input: [a, b, c, d, e, f, g, h] -PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 5] ------------------> [3, 7, 1, 0, 2, 7, 3, 6]
  //
  // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
  // Mask:  [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
  //
  // The result is fine to be handled by the generic logic.
  auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
                          ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
                          int AOffset, int BOffset) {
    assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
           "Must call this with A having 3 or 1 inputs from the A half.");
    assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
           "Must call this with B having 1 or 3 inputs from the B half.");
    assert(AToAInputs.size() + BToAInputs.size() == 4 &&
           "Must call this with either 3:1 or 1:3 inputs (summing to 4).");

    bool ThreeAInputs = AToAInputs.size() == 3;

    // Compute the index of dword with only one word among the three inputs in
    // a half by taking the sum of the half with three inputs and subtracting
    // the sum of the actual three inputs. The difference is the remaining
    // word.
    int ADWord, BDWord;
    int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
    int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
    int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
    ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
    int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
    int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
    int TripleNonInputIdx =
        TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
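    // For example, if the low half holds the three inputs {0, 1, 3}, then
    // TripleInputSum is 6 and the inputs sum to 4, leaving word 2 as the one
    // non-input word of that half.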
    TripleDWord = TripleNonInputIdx / 2;

    // We use xor with one to compute the adjacent DWord to whichever one the
    // OneInput is in.
    OneInputDWord = (OneInput / 2) ^ 1;
    // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
    // and BToA inputs. If there is also such a problem with the BToB and AToB
    // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
    // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
    // is essential that we don't *create* a 3<-1 as then we might oscillate.
    if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
      // Compute how many inputs will be flipped by swapping these DWords. We
      // need to balance this to ensure we don't form a 3-1 shuffle in the
      // other half.
      int NumFlippedAToBInputs =
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
          std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
      int NumFlippedBToBInputs =
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
          std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
      if ((NumFlippedAToBInputs == 1 &&
           (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
          (NumFlippedBToBInputs == 1 &&
           (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
        // We choose whether to fix the A half or B half based on whether that
        // half has zero flipped inputs. At zero, we may not be able to fix it
        // with that half. We also bias towards fixing the B half because that
        // will more commonly be the high half, and we have to bias one way.
        auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
                                                       ArrayRef<int> Inputs) {
          int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
          bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
          // Determine whether the free index is in the flipped dword or the
          // unflipped dword based on where the pinned index is. We use this bit
          // in an xor to conditionally select the adjacent dword.
          int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
          bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          if (IsFixIdxInput == IsFixFreeIdxInput)
            FixFreeIdx += 1;
          IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
          assert(IsFixIdxInput != IsFixFreeIdxInput &&
                 "We need to be changing the number of flipped inputs!");
          int PSHUFHalfMask[] = {0, 1, 2, 3};
          std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
          V = DAG.getNode(
              FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
              MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
              getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));

          for (int &M : Mask)
            if (M >= 0 && M == FixIdx)
              M = FixFreeIdx;
            else if (M >= 0 && M == FixFreeIdx)
              M = FixIdx;
        };
        if (NumFlippedBToBInputs != 0) {
          int BPinnedIdx =
              BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
        } else {
          assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
          int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
          FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
        }
      }
    }
    int PSHUFDMask[] = {0, 1, 2, 3};
    PSHUFDMask[ADWord] = BDWord;
    PSHUFDMask[BDWord] = ADWord;
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

    // Adjust the mask to match the new locations of A and B.
    for (int &M : Mask)
      if (M >= 0 && M/2 == ADWord)
        M = 2 * BDWord + M % 2;
      else if (M >= 0 && M/2 == BDWord)
        M = 2 * ADWord + M % 2;

    // Recurse back into this routine to re-compute state now that this isn't
    // a 3 and 1 problem.
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
                                                     DAG);
  };
  if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
    return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
    return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
  // At this point there are at most two inputs to the low and high halves from
  // each half. That means the inputs can always be grouped into dwords and
  // those dwords can then be moved to the correct half with a dword shuffle.
  // We use at most one low and one high word shuffle to collect these paired
  // inputs into dwords, and finally a dword shuffle to place them.
  int PSHUFLMask[4] = {-1, -1, -1, -1};
  int PSHUFHMask[4] = {-1, -1, -1, -1};
  int PSHUFDMask[4] = {-1, -1, -1, -1};

  // First fix the masks for all the inputs that are staying in their
  // original halves. This will then dictate the targets of the cross-half
  // shuffles.
  auto fixInPlaceInputs =
      [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
                    MutableArrayRef<int> SourceHalfMask,
                    MutableArrayRef<int> HalfMask, int HalfOffset) {
    if (InPlaceInputs.empty())
      return;
    if (InPlaceInputs.size() == 1) {
      SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
          InPlaceInputs[0] - HalfOffset;
      PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
      return;
    }
    if (IncomingInputs.empty()) {
      // Just fix all of the in place inputs.
      for (int Input : InPlaceInputs) {
        SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
        PSHUFDMask[Input / 2] = Input / 2;
      }
      return;
    }

    assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
    SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
        InPlaceInputs[0] - HalfOffset;
    // Put the second input next to the first so that they are packed into
    // a dword. We find the adjacent index by toggling the low bit.
    int AdjIndex = InPlaceInputs[0] ^ 1;
    SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
    std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
    PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
  };
  fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
  fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
  // Now gather the cross-half inputs and place them into a free dword of
  // their target half.
  // FIXME: This operation could almost certainly be simplified dramatically to
  // look more like the 3-1 fixing operation.
  auto moveInputsToRightHalf = [&PSHUFDMask](
      MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
      MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
      MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
      int DestOffset) {
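    // A slot in a source half mask is "clobbered" when this half's shuffle
    // already routes a different word through it, so the slot can't be used
    // as-is to stage an incoming input.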
    auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
      return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
    };
    auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
                                               int Word) {
      int LowWord = Word & ~1;
      int HighWord = Word | 1;
      return isWordClobbered(SourceHalfMask, LowWord) ||
             isWordClobbered(SourceHalfMask, HighWord);
    };

    if (IncomingInputs.empty())
      return;
    if (ExistingInputs.empty()) {
      // Map any dwords with inputs from them into the right half.
      for (int Input : IncomingInputs) {
        // If the source half mask maps over the inputs, turn those into
        // swaps and use the swapped lane.
        if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
          if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
            SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
                Input - SourceOffset;
            // We have to swap the uses in our half mask in one sweep.
            for (int &M : HalfMask)
              if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
                M = Input;
              else if (M == Input)
                M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
          } else {
            assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
                       Input - SourceOffset &&
                   "Previous placement doesn't match!");
          }
          // Note that this correctly re-maps both when we do a swap and when
          // we observe the other side of the swap above. We rely on that to
          // avoid swapping the members of the input list directly.
          Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
        }

        // Map the input's dword into the correct half.
        if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
          PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
        else
          assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
                     Input / 2 &&
                 "Previous placement doesn't match!");
      }

      // And just directly shift any other-half mask elements to be same-half
      // as we will have mirrored the dword containing the element into the
      // same position within that half.
      for (int &M : HalfMask)
        if (M >= SourceOffset && M < SourceOffset + 4) {
          M = M - SourceOffset + DestOffset;
          assert(M >= 0 && "This should never wrap below zero!");
        }
      return;
    }
    // Ensure we have the input in a viable dword of its current half. This
    // is particularly tricky because the original position may be clobbered
    // by inputs being moved and *staying* in that half.
    if (IncomingInputs.size() == 1) {
      if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
                         SourceOffset;
        SourceHalfMask[InputFixed - SourceOffset] =
            IncomingInputs[0] - SourceOffset;
        std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
                     InputFixed);
        IncomingInputs[0] = InputFixed;
      }
    } else if (IncomingInputs.size() == 2) {
      if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
          isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
        // We have two non-adjacent or clobbered inputs we need to extract from
        // the source half. To do this, we need to map them into some adjacent
        // dword slot in the source mask.
        int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
                              IncomingInputs[1] - SourceOffset};

        // If there is a free slot in the source half mask adjacent to one of
        // the inputs, place the other input in it. We use (Index XOR 1) to
        // compute an adjacent index.
        if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
            SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          InputsFixed[1] = InputsFixed[0] ^ 1;
        } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
                   SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
          SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
          InputsFixed[0] = InputsFixed[1] ^ 1;
        } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
                   SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
          // The two inputs are in the same DWord but it is clobbered and the
          // adjacent DWord isn't used at all. Move both inputs to the free
          // slot.
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
          SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
          InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
          InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
        } else {
          // The only way we hit this point is if there is no clobbering
          // (because there are no off-half inputs to this half) and there is no
          // free slot adjacent to one of the inputs. In this case, we have to
          // swap an input with a non-input.
          for (int i = 0; i < 4; ++i)
            assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
                   "We can't handle any clobbers here!");
          assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
                 "Cannot have adjacent inputs here!");

          SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
          SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;

          // We also have to update the final source mask in this case because
          // it may need to undo the above swap.
          for (int &M : FinalSourceHalfMask)
            if (M == (InputsFixed[0] ^ 1) + SourceOffset)
              M = InputsFixed[1] + SourceOffset;
            else if (M == InputsFixed[1] + SourceOffset)
              M = (InputsFixed[0] ^ 1) + SourceOffset;

          InputsFixed[1] = InputsFixed[0] ^ 1;
        }

        // Point everything at the fixed inputs.
        for (int &M : HalfMask)
          if (M == IncomingInputs[0])
            M = InputsFixed[0] + SourceOffset;
          else if (M == IncomingInputs[1])
            M = InputsFixed[1] + SourceOffset;

        IncomingInputs[0] = InputsFixed[0] + SourceOffset;
        IncomingInputs[1] = InputsFixed[1] + SourceOffset;
      }
    } else {
      llvm_unreachable("Unhandled input size!");
    }
    // Now hoist the DWord down to the right half.
    int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
    assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
    PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
    for (int &M : HalfMask)
      for (int Input : IncomingInputs)
        if (M == Input)
          M = FreeDWord * 2 + Input % 2;
  };
  moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
                        /*SourceOffset*/ 4, /*DestOffset*/ 0);
  moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
                        /*SourceOffset*/ 0, /*DestOffset*/ 4);
  // Now enact all the shuffles we've computed to move the inputs into their
  // target halves.
  if (!isNoopShuffleMask(PSHUFLMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFHMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
  if (!isNoopShuffleMask(PSHUFDMask))
    V = DAG.getBitcast(
        VT,
        DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
                    getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));

  // At this point, each half should contain all its inputs, and we can then
  // just shuffle them into their final position.
  assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
         "Failed to lift all the high half inputs to the low mask!");
  assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
         "Failed to lift all the low half inputs to the high mask!");

  // Do a half shuffle for the low mask.
  if (!isNoopShuffleMask(LoMask))
    V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));

  // Do a half shuffle with the high mask after shifting its values down.
  for (int &M : HiMask)
    if (M >= 0)
      M -= 4;
  if (!isNoopShuffleMask(HiMask))
    V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
                    getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));

  return V;
}
/// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
/// blend if only one input is used.
static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
    bool &V2InUse) {
  SDValue V1Mask[16];
  SDValue V2Mask[16];
  V1InUse = false;
  V2InUse = false;

  int Size = Mask.size();
  int Scale = 16 / Size;
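  // Scale maps each mask element onto its run of bytes: for a v8i16 mask,
  // Scale is 2 and mask element k controls bytes 2*k and 2*k+1.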
  for (int i = 0; i < 16; ++i) {
    if (Mask[i / Scale] < 0) {
      V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
    } else {
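      // PSHUFB zeroes a destination byte whenever bit 7 of its control byte
      // is set, so 0x80 routes any lane without a source to zero.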
      const int ZeroMask = 0x80;
      int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
                                         : ZeroMask;
      int V2Idx = Mask[i / Scale] < Size
                      ? ZeroMask
                      : (Mask[i / Scale] - Size) * Scale + i % Scale;
      if (Zeroable[i / Scale])
        V1Idx = V2Idx = ZeroMask;
      V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
      V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
      V1InUse |= (ZeroMask != V1Idx);
      V2InUse |= (ZeroMask != V2Idx);
    }
  }

  if (V1InUse)
    V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V1),
                     DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
  if (V2InUse)
    V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
                     DAG.getBitcast(MVT::v16i8, V2),
                     DAG.getBuildVector(MVT::v16i8, DL, V2Mask));

  // If we need shuffled inputs from both, blend the two.
  SDValue V;
  if (V1InUse && V2InUse)
    V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
  else
    V = V1InUse ? V1 : V2;

  // Cast the result back to the correct type.
  return DAG.getBitcast(VT, V);
}
/// \brief Generic lowering of 8-lane i16 shuffles.
///
/// This handles both single-input shuffles and combined shuffle/blends with
/// two inputs. The single input shuffles are immediately delegated to
/// a dedicated lowering routine.
///
/// The blends are lowered in one of three fundamental ways. If there are few
/// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
/// of the input is significantly cheaper when lowered as an interleaving of
/// the two inputs, try to interleave them. Otherwise, blend the low and high
/// halves of the inputs separately (making them have relatively few inputs)
/// and then concatenate them.
static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
  assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

  // Whenever we can lower this as a zext, that instruction is strictly faster
  // than any alternative.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;
  int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });

  if (NumV2Inputs == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Try to use shift instructions.
    if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Shift;

    // Use dedicated unpack instructions for masks that match their pattern.
    if (SDValue V =
            lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
      return V;

    // Try to use byte rotation instructions.
    if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
                                                        Mask, Subtarget, DAG))
      return Rotate;

    // Make a copy of the mask so it can be modified.
    SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
    return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
                                                     MutableMask, Subtarget,
                                                     DAG);
  }

  assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
         "All single-input shuffles should be canonicalized to be V1-input "
         "shuffles.");
  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  // There are special ways we can lower some single-element blends.
  if (NumV2Inputs == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  // We have different paths for blend lowering, but they all must use the
  // *exact* same predicate.
  bool IsBlendSupported = Subtarget.hasSSE41();
  if (IsBlendSupported)
    if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
                                                  Zeroable, Subtarget, DAG))
      return Blend;

  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
    return V;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
    return BitBlend;

  // Try to lower by permuting the inputs into an unpack instruction.
  if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
                                                            V2, Mask, DAG))
    return Unpack;

  // If we can't directly blend but can use PSHUFB, that will be better as it
  // can both shuffle and set up the inefficient blend.
  if (!IsBlendSupported && Subtarget.hasSSSE3()) {
    bool V1InUse, V2InUse;
    return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
                                              Zeroable, DAG, V1InUse, V2InUse);
  }

  // We can always bit-blend if we have to so the fallback strategy is to
  // decompose into single-input permutes and blends.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
                                                    Mask, DAG);
}
/// \brief Check whether a compaction lowering can be done by dropping even
/// elements and compute how many times even elements must be dropped.
///
/// This handles shuffles which take every Nth element where N is a power of
/// two. Example shuffle masks:
///
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14,  0,  2,  4,  6,  8, 10, 12, 14
///  N = 1:  0,  2,  4,  6,  8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
///  N = 2:  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12,  0,  4,  8, 12
///  N = 2:  0,  4,  8, 12, 16, 20, 24, 28,  0,  4,  8, 12, 16, 20, 24, 28
///  N = 3:  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8,  0,  8
///  N = 3:  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24,  0,  8, 16, 24
///
/// Any of these lanes can of course be undef.
///
/// This routine only supports N <= 3.
/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
/// for larger N.
///
/// \returns N above, or the number of times even elements must be dropped if
/// there is such a number. Otherwise returns zero.
static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
                                          bool IsSingleInput) {
  // The modulus for the shuffle vector entries is based on whether this is
  // a single input or not.
  int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
  assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
         "We should only be called with masks with a power-of-2 size!");

  uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
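  // For a 16-element two-input shuffle, ShuffleModulus is 32 and ModMask is
  // 31, so each defined mask element i must equal (i << N) & 31 for some N.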
  // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
  // and 2^3 simultaneously. This is because we may have ambiguity with
  // partially undef inputs.
  bool ViableForN[3] = {true, true, true};

  for (int i = 0, e = Mask.size(); i < e; ++i) {
    // Ignore undef lanes, we'll optimistically collapse them to the pattern we
    // want.
    if (Mask[i] < 0)
      continue;

    bool IsAnyViable = false;
    for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;

        // The shuffle mask must be equal to (i * 2^N) % M.
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          IsAnyViable = true;
        else
          ViableForN[j] = false;
      }
    // Early exit if we exhaust the possible powers of two.
    if (!IsAnyViable)
      break;
  }

  for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
    if (ViableForN[j])
      return j + 1;

  // Return 0 as there is no viable power of two.
  return 0;
}
/// \brief Generic lowering of v16i8 shuffles.
///
/// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
/// detect any complexity reducing interleaving. If that doesn't help, it uses
/// UNPCK to spread the i8 elements across two i16-element vectors, and uses
/// the existing lowering for v8i16 blends on each half, finally PACK-ing them
/// back together.
static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                       const APInt &Zeroable,
                                       SDValue V1, SDValue V2,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
  assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");

  // Try to use shift instructions.
  if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Shift;

  // Try to use byte rotation instructions.
  if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
          DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
    return Rotate;

  // Try to use a zext lowering.
  if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
          DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
    return ZExt;

  // See if we can use SSE4A Extraction / Insertion.
  if (Subtarget.hasSSE4A())
    if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
                                                Zeroable, DAG))
      return V;

  int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
  // For single-input shuffles, there are some nicer lowering tricks we can use.
  if (NumV2Elements == 0) {
    // Check for being able to broadcast a single element.
    if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
            DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
      return Broadcast;

    // Check whether we can widen this to an i16 shuffle by duplicating bytes.
    // Notably, this handles splat and partial-splat shuffles more efficiently.
    // However, it only makes sense if the pre-duplication shuffle simplifies
    // things significantly. Currently, this means we need to be able to
    // express the pre-duplication shuffle as an i16 shuffle.
    //
    // FIXME: We should check for other patterns which can be widened into an
    // i16 shuffle as well.
    auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
      for (int i = 0; i < 16; i += 2)
        if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
          return false;

      return true;
    };
    auto tryToWidenViaDuplication = [&]() -> SDValue {
      if (!canWidenViaDuplication(Mask))
        return SDValue();
      SmallVector<int, 4> LoInputs;
      copy_if(Mask, std::back_inserter(LoInputs),
              [](int M) { return M >= 0 && M < 8; });
      std::sort(LoInputs.begin(), LoInputs.end());
      LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                     LoInputs.end());
      SmallVector<int, 4> HiInputs;
      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
      std::sort(HiInputs.begin(), HiInputs.end());
      HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                     HiInputs.end());

      bool TargetLo = LoInputs.size() >= HiInputs.size();
      ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
      ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;

      int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
      SmallDenseMap<int, int, 8> LaneMap;
      for (int I : InPlaceInputs) {
        PreDupI16Shuffle[I/2] = I/2;
        LaneMap[I] = I;
      }
      int j = TargetLo ? 0 : 4, je = j + 4;
      for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
        // Check if j is already a shuffle of this input. This happens when
        // there are two adjacent bytes after we move the low one.
        if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
          // If we haven't yet mapped the input, search for a slot into which
          // we can map it.
          while (j < je && PreDupI16Shuffle[j] >= 0)
            ++j;

          if (j == je)
            // We can't place the inputs into a single half with a simple
            // i16 shuffle, so bail.
            return SDValue();

          // Map this input with the i16 shuffle.
          PreDupI16Shuffle[j] = MovingInputs[i] / 2;
        }

        // Update the lane map based on the mapping we ended up with.
        LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
      }
      V1 = DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));

      // Unpack the bytes to form the i16s that will be shuffled into place.
      V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
                       MVT::v16i8, V1, V1);

      int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
      for (int i = 0; i < 16; ++i)
        if (Mask[i] >= 0) {
          int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
          assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
          if (PostDupI16Shuffle[i / 2] < 0)
            PostDupI16Shuffle[i / 2] = MappedMask;
          else
            assert(PostDupI16Shuffle[i / 2] == MappedMask &&
                   "Conflicting entries in the original shuffle!");
        }
      return DAG.getBitcast(
          MVT::v16i8,
          DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
                               DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
    };
    if (SDValue V = tryToWidenViaDuplication())
      return V;
  }
  if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
                                                   Zeroable, DAG))
    return Masked;

  // Use dedicated unpack instructions for masks that match their pattern.
  if (SDValue V =
          lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
    return V;

  // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
  // with PSHUFB. It is important to do this before we attempt to generate any
  // blends but after all of the single-input lowerings. If the single input
  // lowerings can find an instruction sequence that is faster than a PSHUFB, we
  // want to preserve that and we can DAG combine any longer sequences into
  // a PSHUFB in the end. But once we start blending from multiple inputs,
  // the complexity of DAG combining bad patterns back into PSHUFB is too high,
  // and there are *very* few patterns that would actually be faster than the
  // PSHUFB approach because of its ability to zero lanes.
  //
  // FIXME: The only exceptions to the above are blends which are exact
  // interleavings with direct instructions supporting them. We currently don't
  // handle those well here.
  if (Subtarget.hasSSSE3()) {
    bool V1InUse = false;
    bool V2InUse = false;

    SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
        DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);

    // If both V1 and V2 are in use and we can use a direct blend or an unpack,
    // do so. This avoids using them to handle blends-with-zero which is
    // important as a single pshufb is significantly faster for that.
    if (V1InUse && V2InUse) {
      if (Subtarget.hasSSE41())
        if (SDValue Blend = lowerVectorShuffleAsBlend(
                DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
          return Blend;

      // We can use an unpack to do the blending rather than an or in some
      // cases. Even though the or may be (very minorly) more efficient, we
      // preference this lowering because there are common cases where part of
      // the complexity of the shuffles goes away when we do the final blend as
      // an unpack.
      // FIXME: It might be worth trying to detect if the unpack-feeding
      // shuffles will both be pshufb, in which case we shouldn't bother with
      // this.
      if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
              DL, MVT::v16i8, V1, V2, Mask, DAG))
        return Unpack;
    }

    return PSHUFB;
  }
  // There are special ways we can lower some single-element blends.
  if (NumV2Elements == 1)
    if (SDValue V = lowerVectorShuffleAsElementInsertion(
            DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
      return V;

  if (SDValue BitBlend =
          lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
    return BitBlend;

  // Check whether a compaction lowering can be done. This handles shuffles
  // which take every Nth element for some even N. See the helper function for
  // details.
  //
  // We special case these as they can be particularly efficiently handled with
  // the PACKUSWB instruction on x86 and they show up in common patterns of
  // rearranging bytes to truncate wide elements.
  bool IsSingleInput = V2.isUndef();
  if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
    // NumEvenDrops is the power of two stride of the elements. Another way of
    // thinking about it is that we need to drop the even elements this many
    // times to get the original input.
    //
    // First we need to zero all the dropped bytes.
    assert(NumEvenDrops <= 3 &&
           "No support for dropping even elements more than 3 times.");
    // We use the mask type to pick which bytes are preserved based on how many
    // elements are dropped.
    MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
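    // Splatting 0xFF in the wider type gives <0xFF, 0x00, ...> when viewed as
    // bytes, zeroing exactly the bytes that will be dropped so the unsigned
    // saturation in PACKUS below can't clamp the bytes we keep.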
    SDValue ByteClearMask = DAG.getBitcast(
        MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
    V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
    if (!IsSingleInput)
      V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);

    // Now pack things back together.
    V1 = DAG.getBitcast(MVT::v8i16, V1);
    V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
    SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
    for (int i = 1; i < NumEvenDrops; ++i) {
      Result = DAG.getBitcast(MVT::v8i16, Result);
      Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
    }

    return Result;
  }
  // Handle multi-input cases by blending single-input shuffles.
  if (NumV2Elements > 0)
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
                                                      Mask, DAG);

  // The fallback path for single-input shuffles widens this into two v8i16
  // vectors with unpacks, shuffles those, and then pulls them back together
  // with a pack.
  SDValue V = V1;

  std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
  for (int i = 0; i < 16; ++i)
    if (Mask[i] >= 0)
      (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];

  SDValue VLoHalf, VHiHalf;
  // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
  // them out and avoid using UNPCK{L,H} to extract the elements of V as
  // i16s.
  if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
      none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
    // Use a mask to drop the high bytes.
    VLoHalf = DAG.getBitcast(MVT::v8i16, V);
    VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
                          DAG.getConstant(0x00FF, DL, MVT::v8i16));

    // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
    VHiHalf = DAG.getUNDEF(MVT::v8i16);

    // Squash the masks to point directly into VLoHalf.
    for (int &M : LoBlendMask)
      if (M >= 0)
        M /= 2;
    for (int &M : HiBlendMask)
      if (M >= 0)
        M /= 2;
  } else {
    // Otherwise just unpack the low half of V into VLoHalf and the high half
    // into VHiHalf so that we can blend them as i16s.
    SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);

    VLoHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
    VHiHalf = DAG.getBitcast(
        MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
  }

  SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
  SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);

  return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
}
/// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
///
/// This routine breaks down the specific type of 128-bit shuffle and
/// dispatches to the lowering routines accordingly.
static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                        MVT VT, SDValue V1, SDValue V2,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  switch (VT.SimpleTy) {
  case MVT::v2i64:
    return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v2f64:
    return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4i32:
    return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v4f32:
    return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v8i16:
    return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
  case MVT::v16i8:
    return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);

  default:
    llvm_unreachable("Unimplemented!");
  }
}
/// \brief Generic routine to split vector shuffle into half-sized shuffles.
///
/// This routine just extracts two subvectors, shuffles them independently, and
/// then concatenates them back together. This should work effectively with all
/// AVX vector shuffle types.
static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
  assert(VT.getSizeInBits() >= 256 &&
         "Only for 256-bit or wider vector shuffles!");
  assert(V1.getSimpleValueType() == VT && "Bad operand type!");
  assert(V2.getSimpleValueType() == VT && "Bad operand type!");

  ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
  ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);

  int NumElements = VT.getVectorNumElements();
  int SplitNumElements = NumElements / 2;
  MVT ScalarVT = VT.getVectorElementType();
  MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);

  // Rather than splitting build-vectors, just build two narrower build
  // vectors. This helps shuffling with splats and zeros.
  auto SplitVector = [&](SDValue V) {
    V = peekThroughBitcasts(V);

    MVT OrigVT = V.getSimpleValueType();
    int OrigNumElements = OrigVT.getVectorNumElements();
    int OrigSplitNumElements = OrigNumElements / 2;
    MVT OrigScalarVT = OrigVT.getVectorElementType();
    MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);

    SDValue LoV, HiV;

    auto *BV = dyn_cast<BuildVectorSDNode>(V);
    if (!BV) {
      LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(0, DL));
      HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
                        DAG.getIntPtrConstant(OrigSplitNumElements, DL));
    } else {
      SmallVector<SDValue, 16> LoOps, HiOps;
      for (int i = 0; i < OrigSplitNumElements; ++i) {
        LoOps.push_back(BV->getOperand(i));
        HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
      }
      LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
      HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
    }
    return std::make_pair(DAG.getBitcast(SplitVT, LoV),
                          DAG.getBitcast(SplitVT, HiV));
  };

  SDValue LoV1, HiV1, LoV2, HiV2;
  std::tie(LoV1, HiV1) = SplitVector(V1);
  std::tie(LoV2, HiV2) = SplitVector(V2);

  // Now create two 4-way blends of these half-width vectors.
  auto HalfBlend = [&](ArrayRef<int> HalfMask) {
    bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
    SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
    SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
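    // V1BlendMask and V2BlendMask pre-shuffle the two halves of each source;
    // BlendMask then selects between those two intermediate results.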
    for (int i = 0; i < SplitNumElements; ++i) {
      int M = HalfMask[i];
      if (M >= NumElements) {
        if (M >= NumElements + SplitNumElements)
          UseHiV2 = true;
        else
          UseLoV2 = true;
        V2BlendMask[i] = M - NumElements;
        BlendMask[i] = SplitNumElements + i;
      } else if (M >= 0) {
        if (M >= SplitNumElements)
          UseHiV1 = true;
        else
          UseLoV1 = true;
        V1BlendMask[i] = M;
        BlendMask[i] = i;
      }
    }

    // Because the lowering happens after all combining takes place, we need to
    // manually combine these blend masks as much as possible so that we create
    // a minimal number of high-level vector shuffle nodes.

    // First try just blending the halves of V1 or V2.
    if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
      return DAG.getUNDEF(SplitVT);
    if (!UseLoV2 && !UseHiV2)
      return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    if (!UseLoV1 && !UseHiV1)
      return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);

    SDValue V1Blend, V2Blend;
    if (UseLoV1 && UseHiV1) {
      V1Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
    } else {
      // We only use half of V1 so map the usage down into the final blend mask.
      V1Blend = UseLoV1 ? LoV1 : HiV1;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
          BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
    }
    if (UseLoV2 && UseHiV2) {
      V2Blend =
          DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
    } else {
      // We only use half of V2 so map the usage down into the final blend mask.
      V2Blend = UseLoV2 ? LoV2 : HiV2;
      for (int i = 0; i < SplitNumElements; ++i)
        if (BlendMask[i] >= SplitNumElements)
          BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
    }
    return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
  };
  SDValue Lo = HalfBlend(LoMask);
  SDValue Hi = HalfBlend(HiMask);
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
}
/// \brief Either split a vector in halves or decompose the shuffles and the
/// blend.
///
/// This is provided as a good fallback for many lowerings of non-single-input
/// shuffles with more than one 128-bit lane. In those cases, we want to select
/// between splitting the shuffle into 128-bit components and stitching those
/// back together vs. extracting the single-input shuffles and blending those
/// results.
static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
                                                SDValue V1, SDValue V2,
                                                ArrayRef<int> Mask,
                                                SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This routine must not be used to lower single-input "
         "shuffles as it could then recurse on itself.");
  int Size = Mask.size();

  // If this can be modeled as a broadcast of two elements followed by a blend,
  // prefer that lowering. This is especially important because broadcasts can
  // often fold with memory operands.
  auto DoBothBroadcast = [&] {
    int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
    for (int M : Mask)
      if (M >= Size) {
        if (V2BroadcastIdx < 0)
          V2BroadcastIdx = M - Size;
        else if (M - Size != V2BroadcastIdx)
          return false;
      } else if (M >= 0) {
        if (V1BroadcastIdx < 0)
          V1BroadcastIdx = M;
        else if (M != V1BroadcastIdx)
          return false;
      }
    return true;
  };
  if (DoBothBroadcast())
    return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
                                                      DAG);

  // If the inputs all stem from a single 128-bit lane of each input, then we
  // split them rather than blending because the split will decompose to
  // unusually few instructions.
  int LaneCount = VT.getSizeInBits() / 128;
  int LaneSize = Size / LaneCount;
  SmallBitVector LaneInputs[2];
  LaneInputs[0].resize(LaneCount, false);
  LaneInputs[1].resize(LaneCount, false);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
  if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  // Otherwise, just fall back to decomposed shuffles and a blend. This requires
  // that the decomposed single-input shuffles don't end up here.
  return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
}
/// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
/// a permutation and blend of those lanes.
///
/// This essentially blends the out-of-lane inputs to each lane into the lane
/// from a permuted copy of the vector. This lowering strategy results in four
/// instructions in the worst case for a single-input cross lane shuffle which
/// is lower than any other fully general cross-lane shuffle strategy I'm aware
/// of. Special cases for each particular shuffle pattern should be handled
/// prior to trying this lowering.
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
                                                       SDValue V1, SDValue V2,
                                                       ArrayRef<int> Mask,
                                                       SelectionDAG &DAG) {
  // FIXME: This should probably be generalized for 512-bit vectors as well.
  assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
  int Size = Mask.size();
  int LaneSize = Size / 2;

  // If there are only inputs from one 128-bit lane, splitting will in fact be
  // less expensive. The flags track whether the given lane contains an element
  // that crosses to another lane.
  bool LaneCrossing[2] = {false, false};
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
  if (!LaneCrossing[0] || !LaneCrossing[1])
    return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);

  assert(V2.isUndef() &&
         "This last part of this routine only works on single input shuffles");

  SmallVector<int, 32> FlippedBlendMask(Size);
  for (int i = 0; i < Size; ++i)
    FlippedBlendMask[i] =
        Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
                                ? Mask[i]
                                : Mask[i] % LaneSize +
                                      (i / LaneSize) * LaneSize + Size);

  // Flip the vector, and blend the results which should now be in-lane. The
  // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
  // 5 for the high source. The value 3 selects the high half of source 2 and
  // the value 2 selects the low half of source 2. We only use source 2 to
  // allow folding it into a memory operand.
  unsigned PERMMask = 3 | 2 << 4;
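  // PERMMask is 0x23: the result's low half is the high half of V1 and its
  // high half is the low half of V1, i.e. the two 128-bit lanes of V1 swapped.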
  SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
                                V1, DAG.getConstant(PERMMask, DL, MVT::i8));
  return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
/// \brief Handle lowering 2-lane 128-bit shuffles.
static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                        SDValue V2, ArrayRef<int> Mask,
                                        const APInt &Zeroable,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SmallVector<int, 4> WidenedMask;
  if (!canWidenShuffleElements(Mask, WidenedMask))
    return SDValue();

  // TODO: If minimizing size and one of the inputs is a zero vector and the
  // zero vector has only one use, we could use a VPERM2X128 to save the
  // instruction bytes needed to explicitly generate the zero vector.

  // Blends are faster and handle all the non-lane-crossing cases.
  if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
                                                Zeroable, Subtarget, DAG))
    return Blend;

  bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
  bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());

  // If either input operand is a zero vector, use VPERM2X128 because its mask
  // allows us to replace the zero input with an implicit zero.
  if (!IsV1Zero && !IsV2Zero) {
    // Check for patterns which can be matched with a single insert of a 128-bit
    // subvector.
    bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
    if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
      // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
      if (Subtarget.hasAVX2() && V2.isUndef())
        return SDValue();

      // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
      // this will likely become vinsertf128 which can't fold a 256-bit memop.
      if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
        MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
                                     VT.getVectorNumElements() / 2);
        SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
                                  DAG.getIntPtrConstant(0, DL));
        SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
                                  OnlyUsesV1 ? V1 : V2,
                                  DAG.getIntPtrConstant(0, DL));
        return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
      }
    }
  }

  // Otherwise form a 128-bit permutation. After accounting for undefs,
  // convert the 64-bit shuffle mask selection values into 128-bit
  // selection bits by dividing the indexes by 2 and shifting into positions
  // defined by a vperm2*128 instruction's immediate control byte.
  //
  // The immediate permute control byte looks like this:
  //    [1:0] - select 128 bits from sources for low half of destination
  //    [2]   - ignore
  //    [3]   - zero low half of destination
  //    [5:4] - select 128 bits from sources for high half of destination
  //    [6]   - ignore
  //    [7]   - zero high half of destination

  int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
  int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];

  unsigned PermMask = MaskLO | (MaskHI << 4);
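  // For example, a widened mask <0, 3> (low half of V1, high half of V2)
  // yields PermMask 0x30.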
  // If either input is a zero vector, replace it with an undef input.
  // Shuffle mask values <  4 are selecting elements of V1.
  // Shuffle mask values >= 4 are selecting elements of V2.
  // Adjust each half of the permute mask by clearing the half that was
  // selecting the zero vector and setting the zero mask bit.
  if (IsV1Zero) {
    V1 = DAG.getUNDEF(VT);
    if (MaskLO < 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI < 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }
  if (IsV2Zero) {
    V2 = DAG.getUNDEF(VT);
    if (MaskLO >= 2)
      PermMask = (PermMask & 0xf0) | 0x08;
    if (MaskHI >= 2)
      PermMask = (PermMask & 0x0f) | 0x80;
  }

  return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
                     DAG.getConstant(PermMask, DL, MVT::i8));
}
/// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
/// shuffling each lane.
///
/// This will only succeed when the result of fixing the 128-bit lanes results
/// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
/// each 128-bit lane. This handles many cases where we can quickly blend away
/// the lane crosses early and then use simpler shuffles within each lane.
///
/// FIXME: It might be worthwhile at some point to support this without
/// requiring the 128-bit lane-relative shuffles to be repeating, but currently
/// in x86 only floating point has interesting non-repeating shuffles, and even
/// those are still *marginally* more expensive.
static SDValue lowerVectorShuffleByMerging128BitLanes(
    const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
    const X86Subtarget &Subtarget, SelectionDAG &DAG) {
  assert(!V2.isUndef() && "This is only useful with multiple inputs.");

  int Size = Mask.size();
  int LaneSize = 128 / VT.getScalarSizeInBits();
  int NumLanes = Size / LaneSize;
  assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");

  // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
  // check whether the in-128-bit lane shuffles share a repeating pattern.
  SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
  SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
  for (int i = 0; i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    int j = i / LaneSize;

    if (Lanes[j] < 0) {
      // First entry we've seen for this lane.
      Lanes[j] = Mask[i] / LaneSize;
    } else if (Lanes[j] != Mask[i] / LaneSize) {
      // This doesn't match the lane selected previously!
      return SDValue();
    }

    // Check that within each lane we have a consistent shuffle mask.
    int k = i % LaneSize;
    if (InLaneMask[k] < 0) {
      InLaneMask[k] = Mask[i] % LaneSize;
    } else if (InLaneMask[k] != Mask[i] % LaneSize) {
      // This doesn't fit a repeating in-lane mask.
      return SDValue();
    }
  }

  // First shuffle the lanes into place.
  MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
                                VT.getSizeInBits() / 64);
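  // LaneVT models each 128-bit lane as a pair of 64-bit elements, so lane L
  // of a source corresponds to elements 2*L and 2*L+1 of LaneMask below.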
  SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
  for (int i = 0; i < NumLanes; ++i)
    if (Lanes[i] >= 0) {
      LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
      LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
    }

  V1 = DAG.getBitcast(LaneVT, V1);
  V2 = DAG.getBitcast(LaneVT, V2);
  SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);

  // Cast it back to the type we actually want.
  LaneShuffle = DAG.getBitcast(VT, LaneShuffle);

  // Now do a simple shuffle that isn't lane crossing.
  SmallVector<int, 8> NewMask((unsigned)Size, -1);
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0)
      NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
  assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
         "Must not introduce lane crosses at this point!");

  return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
12297 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12298 /// This allows for fast cases such as subvector extraction/insertion
12299 /// or shuffling smaller vector types which can lower more efficiently.
12300 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12301 SDValue V1, SDValue V2,
12302 ArrayRef<int> Mask,
12303 const X86Subtarget &Subtarget,
12304 SelectionDAG &DAG) {
12305 assert(VT.is256BitVector() && "Expected 256-bit vector");
12307 unsigned NumElts = VT.getVectorNumElements();
12308 unsigned HalfNumElts = NumElts / 2;
12309 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12311 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12312 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12313 if (!UndefLower && !UndefUpper)
12314 return SDValue();
12316 // Upper half is undef and lower half is whole upper subvector.
12317 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12318 if (UndefUpper &&
12319 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12320 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12321 DAG.getIntPtrConstant(HalfNumElts, DL));
12322 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12323 DAG.getIntPtrConstant(0, DL));
12324 }
12326 // Lower half is undef and upper half is whole lower subvector.
12327 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12328 if (UndefLower &&
12329 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12330 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12331 DAG.getIntPtrConstant(0, DL));
12332 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12333 DAG.getIntPtrConstant(HalfNumElts, DL));
12336 // If the shuffle only uses two of the four halves of the input operands,
12337 // then extract them and perform the 'half' shuffle at half width.
12338 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
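// Editorial example: for v8f32 <u, u, u, u, 8, 9, 0, 1> the defined (upper)
// half uses V2's lower half (HalfIdx 2) and V1's lower half (HalfIdx 0), so
// we form a v4f32 shuffle <0, 1, 4, 5> of those two halves and insert the
// result at element offset 4.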
12339 int HalfIdx1 = -1, HalfIdx2 = -1;
12340 SmallVector<int, 8> HalfMask(HalfNumElts);
12341 unsigned Offset = UndefLower ? HalfNumElts : 0;
12342 for (unsigned i = 0; i != HalfNumElts; ++i) {
12343 int M = Mask[i + Offset];
12344 if (M < 0) {
12345 HalfMask[i] = M;
12346 continue;
12347 }
12349 // Determine which of the 4 half vectors this element is from.
12350 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12351 int HalfIdx = M / HalfNumElts;
12353 // Determine the element index into its half vector source.
12354 int HalfElt = M % HalfNumElts;
12356 // We can shuffle with up to 2 half vectors, set the new 'half'
12357 // shuffle mask accordingly.
12358 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12359 HalfMask[i] = HalfElt;
12360 HalfIdx1 = HalfIdx;
12361 continue;
12362 }
12363 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12364 HalfMask[i] = HalfElt + HalfNumElts;
12365 HalfIdx2 = HalfIdx;
12366 continue;
12367 }
12369 // Too many half vectors referenced.
12370 return SDValue();
12371 }
12372 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12374 // Only shuffle the halves of the inputs when useful.
12375 int NumLowerHalves =
12376 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12377 int NumUpperHalves =
12378 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12380 // uuuuXXXX - don't extract uppers just to insert again.
12381 if (UndefLower && NumUpperHalves != 0)
12382 return SDValue();
12384 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12385 if (UndefUpper && NumUpperHalves == 2)
12386 return SDValue();
12388 // AVX2 - XXXXuuuu - always extract lowers.
12389 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12390 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12391 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12392 return SDValue();
12393 // AVX2 supports variable 32-bit element cross-lane shuffles.
12394 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12395 // XXXXuuuu - don't extract lowers and uppers.
12396 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12397 return SDValue();
12398 }
12399 }
12401 auto GetHalfVector = [&](int HalfIdx) {
12402 if (HalfIdx < 0)
12403 return DAG.getUNDEF(HalfVT);
12404 SDValue V = (HalfIdx < 2 ? V1 : V2);
12405 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12406 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12407 DAG.getIntPtrConstant(HalfIdx, DL));
12410 SDValue Half1 = GetHalfVector(HalfIdx1);
12411 SDValue Half2 = GetHalfVector(HalfIdx2);
12412 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12413 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12414 DAG.getIntPtrConstant(Offset, DL));
12417 /// \brief Test whether the specified input (0 or 1) is in-place blended by the
12418 /// given mask.
12420 /// This returns true if the elements from a particular input are already in the
12421 /// slot required by the given mask and require no permutation.
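/// For example (editorial): in a two-input v4f64 shuffle with mask
/// <2, 5, 0, 7>, input 1 is in place (elements 5 and 7 already occupy result
/// slots 1 and 3) while input 0 is not (element 2 sits in slot 0).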
12422 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12423 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12424 int Size = Mask.size();
12425 for (int i = 0; i < Size; ++i)
12426 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12427 return false;
12429 return true;
12430 }
12432 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12433 /// every lane can be represented as the same repeating mask - allowing us to
12434 /// shuffle the sources with the repeating shuffle and then permute the result
12435 /// to the destination lanes.
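/// e.g. (editorial example) v8f32 <5, 4, 7, 6, 1, 0, 3, 2> uses the in-lane
/// pattern <1, 0, 3, 2> everywhere, so we can first shuffle as
/// <1, 0, 3, 2, 5, 4, 7, 6> and then simply swap the two 128-bit lanes.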
12436 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12437 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12438 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12439 int NumElts = VT.getVectorNumElements();
12440 int NumLanes = VT.getSizeInBits() / 128;
12441 int NumLaneElts = NumElts / NumLanes;
12443 // On AVX2 we may be able to just shuffle the lowest elements and then
12444 // broadcast the result.
12445 if (Subtarget.hasAVX2()) {
12446 for (unsigned BroadcastSize : {16, 32, 64}) {
12447 if (BroadcastSize <= VT.getScalarSizeInBits())
12448 continue;
12449 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12451 // Attempt to match a repeating pattern every NumBroadcastElts,
12452 // accounting for UNDEFs, but only referencing the lowest 128-bit
12453 // lane of the inputs.
12454 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12455 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12456 for (int j = 0; j != NumBroadcastElts; ++j) {
12457 int M = Mask[i + j];
12458 if (M < 0)
12459 continue;
12460 int &R = RepeatMask[j];
12461 if (0 != ((M % NumElts) / NumLaneElts))
12462 return false;
12463 if (0 <= R && R != M)
12464 return false;
12465 R = M;
12466 }
12467 return true;
12468 };
12470 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12471 if (!FindRepeatingBroadcastMask(RepeatMask))
12472 continue;
12474 // Shuffle the (lowest) repeated elements in place for broadcast.
12475 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12477 // Shuffle the actual broadcast.
12478 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12479 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12480 for (int j = 0; j != NumBroadcastElts; ++j)
12481 BroadcastMask[i + j] = j;
12482 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12483 BroadcastMask);
12484 }
12485 }
12487 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12488 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12489 return SDValue();
12491 // Bail if we already have a repeated lane shuffle mask.
12492 SmallVector<int, 8> RepeatedShuffleMask;
12493 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12494 return SDValue();
12496 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12497 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12498 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12499 int NumSubLanes = NumLanes * SubLaneScale;
12500 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12502 // Check that all the sources are coming from the same lane and see if we can
12503 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12504 // determine the source sub-lane for each destination sub-lane.
12505 int TopSrcSubLane = -1;
12506 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12507 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12508 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12509 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12511 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12512 // Extract the sub-lane mask, check that it all comes from the same lane
12513 // and normalize the mask entries to come from the first lane.
12514 int SrcLane = -1;
12515 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12516 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12517 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12518 if (M < 0)
12519 continue;
12520 int Lane = (M % NumElts) / NumLaneElts;
12521 if ((0 <= SrcLane) && (SrcLane != Lane))
12522 return SDValue();
12523 SrcLane = Lane;
12524 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12525 SubLaneMask[Elt] = LocalM;
12526 }
12528 // Whole sub-lane is UNDEF.
12529 if (SrcLane < 0)
12530 continue;
12532 // Attempt to match against the candidate repeated sub-lane masks.
12533 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12534 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12535 for (int i = 0; i != NumSubLaneElts; ++i) {
12536 if (M1[i] < 0 || M2[i] < 0)
12537 continue;
12538 if (M1[i] != M2[i])
12539 return false;
12540 }
12541 return true;
12542 };
12544 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12545 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12546 continue;
12548 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12549 for (int i = 0; i != NumSubLaneElts; ++i) {
12550 int M = SubLaneMask[i];
12551 if (M < 0)
12552 continue;
12553 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12554 "Unexpected mask element");
12555 RepeatedSubLaneMask[i] = M;
12558 // Track the top most source sub-lane - by setting the remaining to UNDEF
12559 // we can greatly simplify shuffle matching.
12560 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12561 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12562 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12563 break;
12564 }
12566 // Bail if we failed to find a matching repeated sub-lane mask.
12567 if (Dst2SrcSubLanes[DstSubLane] < 0)
12568 return SDValue();
12569 }
12570 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12571 "Unexpected source lane");
12573 // Create a repeating shuffle mask for the entire vector.
12574 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12575 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12576 int Lane = SubLane / SubLaneScale;
12577 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12578 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12579 int M = RepeatedSubLaneMask[Elt];
12580 if (M < 0)
12581 continue;
12582 int Idx = (SubLane * NumSubLaneElts) + Elt;
12583 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12584 }
12585 }
12586 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12588 // Shuffle each source sub-lane to its destination.
12589 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12590 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12591 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12592 if (SrcSubLane < 0)
12593 continue;
12594 for (int j = 0; j != NumSubLaneElts; ++j)
12595 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12596 }
12598 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12599 SubLaneMask);
12600 }
12602 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12603 unsigned &ShuffleImm,
12604 ArrayRef<int> Mask) {
12605 int NumElts = VT.getVectorNumElements();
12606 assert(VT.getScalarSizeInBits() == 64 &&
12607 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12608 "Unexpected data type for VSHUFPD");
12610 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ...
12611 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ...
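// Editorial example: the v4f64 mask <1, 4, 3, 6> fits the ShufpdMask pattern
// and yields ShuffleImm 0b0101 (bit i = Mask[i] % 2), i.e. a SHUFPD taking
// the high element of each 128-bit pair from V1 and the low element from V2.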
12613 bool ShufpdMask = true;
12614 bool CommutableMask = true;
12615 for (int i = 0; i < NumElts; ++i) {
12616 if (Mask[i] == SM_SentinelUndef)
12617 continue;
12618 if (Mask[i] < 0)
12619 return false;
12620 int Val = (i & 6) + NumElts * (i & 1);
12621 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12622 if (Mask[i] < Val || Mask[i] > Val + 1)
12623 ShufpdMask = false;
12624 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12625 CommutableMask = false;
12626 ShuffleImm |= (Mask[i] % 2) << i;
12627 }
12629 if (ShufpdMask)
12630 return true;
12631 if (CommutableMask) {
12632 std::swap(V1, V2);
12633 return true;
12634 }
12636 return false;
12637 }
12639 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12640 ArrayRef<int> Mask, SDValue V1,
12641 SDValue V2, SelectionDAG &DAG) {
12642 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12643 "Unexpected data type for VSHUFPD");
12645 unsigned Immediate = 0;
12646 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12647 return SDValue();
12649 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12650 DAG.getConstant(Immediate, DL, MVT::i8));
12653 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12654 ArrayRef<int> Mask, SDValue V1,
12655 SDValue V2, SelectionDAG &DAG) {
12656 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12657 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12659 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12660 if (V2.isUndef())
12661 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12663 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12666 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12668 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12669 /// isn't available.
12670 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12671 const APInt &Zeroable,
12672 SDValue V1, SDValue V2,
12673 const X86Subtarget &Subtarget,
12674 SelectionDAG &DAG) {
12675 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12676 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12677 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12679 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12680 Zeroable, Subtarget, DAG))
12683 if (V2.isUndef()) {
12684 // Check for being able to broadcast a single element.
12685 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12686 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12689 // Use low duplicate instructions for masks that match their pattern.
12690 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12691 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12693 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12694 // Non-half-crossing single input shuffles can be lowered with an
12695 // interleaved permutation.
12696 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12697 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
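// Editorial example: mask <1, 1, 3, 3> sets all four bits (imm 0xF),
// duplicating the high double of each 128-bit lane.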
12698 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12699 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12702 // With AVX2 we have direct support for this permutation.
12703 if (Subtarget.hasAVX2())
12704 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12705 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12707 // Try to create an in-lane repeating shuffle mask and then shuffle the
12708 // results into the target lanes.
12709 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12710 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12711 return V;
12713 // Otherwise, fall back.
12714 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12715 DAG);
12716 }
12718 // Use dedicated unpack instructions for masks that match their pattern.
12719 if (SDValue V =
12720 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12721 return V;
12723 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12724 Zeroable, Subtarget, DAG))
12725 return Blend;
12727 // Check if the blend happens to exactly fit that of SHUFPD.
12728 if (SDValue Op =
12729 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12730 return Op;
12732 // Try to create an in-lane repeating shuffle mask and then shuffle the
12733 // results into the target lanes.
12734 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12735 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12736 return V;
12738 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12739 // shuffle. However, if we have AVX2 and either input is already in place,
12740 // we will be able to shuffle the other input across lanes in a single
12741 // instruction, so skip this pattern.
12742 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12743 isShuffleMaskInputInPlace(1, Mask))))
12744 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12745 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12746 return Result;
12747 // If we have VLX support, we can use VEXPAND.
12748 if (Subtarget.hasVLX())
12749 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12750 V1, V2, DAG, Subtarget))
12751 return V;
12753 // If we have AVX2 then we always want to lower with a blend because at v4 we
12754 // can fully permute the elements.
12755 if (Subtarget.hasAVX2())
12756 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12757 Mask, DAG);
12759 // Otherwise fall back on generic lowering.
12760 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12761 }
12763 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12765 /// This routine is only called when we have AVX2 and thus a reasonable
12766 /// instruction set for v4i64 shuffling.
12767 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12768 const APInt &Zeroable,
12769 SDValue V1, SDValue V2,
12770 const X86Subtarget &Subtarget,
12771 SelectionDAG &DAG) {
12772 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12773 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12774 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12775 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12777 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12778 Zeroable, Subtarget, DAG))
12781 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12782 Zeroable, Subtarget, DAG))
12785 // Check for being able to broadcast a single element.
12786 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12787 Mask, Subtarget, DAG))
12790 if (V2.isUndef()) {
12791 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12792 // can use lower latency instructions that will operate on both lanes.
12793 SmallVector<int, 2> RepeatedMask;
12794 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12795 SmallVector<int, 4> PSHUFDMask;
12796 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
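// Editorial example: a repeated v2i64 mask <1, 0> scales to the v4i32
// PSHUFD mask <2, 3, 0, 1> (immediate 0x4E).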
12797 return DAG.getBitcast(
12798 MVT::v4i64,
12799 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12800 DAG.getBitcast(MVT::v8i32, V1),
12801 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12804 // AVX2 provides a direct instruction for permuting a single input across
12805 // lanes.
12806 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12807 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12810 // Try to use shift instructions.
12811 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12812 Zeroable, Subtarget, DAG))
12815 // If we have VLX support, we can use VALIGN or VEXPAND.
12816 if (Subtarget.hasVLX()) {
12817 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12818 Mask, Subtarget, DAG))
12821 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12822 V1, V2, DAG, Subtarget))
12826 // Try to use PALIGNR.
12827 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12828 Mask, Subtarget, DAG))
12831 // Use dedicated unpack instructions for masks that match their pattern.
12832 if (SDValue V =
12833 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12834 return V;
12836 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12837 // shuffle. However, if we have AVX2 and either input is already in place,
12838 // we will be able to shuffle the other input across lanes in a single
12839 // instruction, so skip this pattern.
12840 if (!isShuffleMaskInputInPlace(0, Mask) &&
12841 !isShuffleMaskInputInPlace(1, Mask))
12842 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12843 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12844 return Result;
12846 // Otherwise fall back on generic blend lowering.
12847 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12848 Mask, DAG);
12849 }
12851 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12853 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12854 /// isn't available.
12855 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12856 const APInt &Zeroable,
12857 SDValue V1, SDValue V2,
12858 const X86Subtarget &Subtarget,
12859 SelectionDAG &DAG) {
12860 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12861 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12862 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12864 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12865 Zeroable, Subtarget, DAG))
12868 // Check for being able to broadcast a single element.
12869 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12870 Mask, Subtarget, DAG))
12873 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12874 // options to efficiently lower the shuffle.
12875 SmallVector<int, 4> RepeatedMask;
12876 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12877 assert(RepeatedMask.size() == 4 &&
12878 "Repeated masks must be half the mask width!");
12880 // Use even/odd duplicate instructions for masks that match their pattern.
12881 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12882 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12883 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12884 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12886 if (V2.isUndef())
12887 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12888 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12890 // Use dedicated unpack instructions for masks that match their pattern.
12891 if (SDValue V =
12892 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12893 return V;
12895 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12896 // have already handled any direct blends.
12897 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12898 }
12900 // Try to create an in-lane repeating shuffle mask and then shuffle the
12901 // results into the target lanes.
12902 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12903 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12906 // If we have a single input shuffle with different shuffle patterns in the
12907 // two 128-bit lanes, use a variable mask with VPERMILPS.
12908 if (V2.isUndef()) {
12909 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12910 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12911 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12913 if (Subtarget.hasAVX2())
12914 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12916 // Otherwise, fall back.
12917 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12918 DAG);
12919 }
12921 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12922 // shuffle.
12923 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12924 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12925 return Result;
12926 // If we have VLX support, we can use VEXPAND.
12927 if (Subtarget.hasVLX())
12928 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12929 V1, V2, DAG, Subtarget))
12930 return V;
12932 // For non-AVX512, if the mask is of 16-bit elements in each lane, try to
12933 // split, since after the split we get more efficient code (using vpunpcklwd
12934 // and vpunpckhwd) than with vblend.
12935 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12936 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12937 Mask, DAG))
12938 return V;
12940 // If we have AVX2 then we always want to lower with a blend because at v8 we
12941 // can fully permute the elements.
12942 if (Subtarget.hasAVX2())
12943 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12944 Mask, DAG);
12946 // Otherwise fall back on generic lowering.
12947 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12950 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12952 /// This routine is only called when we have AVX2 and thus a reasonable
12953 /// instruction set for v8i32 shuffling.
12954 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12955 const APInt &Zeroable,
12956 SDValue V1, SDValue V2,
12957 const X86Subtarget &Subtarget,
12958 SelectionDAG &DAG) {
12959 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12960 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12961 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12962 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12964 // Whenever we can lower this as a zext, that instruction is strictly faster
12965 // than any alternative. It also allows us to fold memory operands into the
12966 // shuffle in many cases.
12967 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12968 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12971 // For non-AVX512, if the mask is of 16-bit elements in each lane, try to
12972 // split, since after the split we get more efficient code than with vblend
12973 // by using vpunpcklwd and vpunpckhwd.
12974 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12975 !Subtarget.hasAVX512())
12976 if (SDValue V =
12977 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12978 return V;
12980 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12981 Zeroable, Subtarget, DAG))
12984 // Check for being able to broadcast a single element.
12985 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
12986 Mask, Subtarget, DAG))
12989 // If the shuffle mask is repeated in each 128-bit lane we can use more
12990 // efficient instructions that mirror the shuffles across the two 128-bit
12992 SmallVector<int, 4> RepeatedMask;
12993 bool Is128BitLaneRepeatedShuffle =
12994 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
12995 if (Is128BitLaneRepeatedShuffle) {
12996 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
12997 if (V2.isUndef())
12998 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
12999 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13001 // Use dedicated unpack instructions for masks that match their pattern.
13002 if (SDValue V =
13003 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13004 return V;
13005 }
13007 // Try to use shift instructions.
13008 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13009 Zeroable, Subtarget, DAG))
13012 // If we have VLX support, we can use VALIGN or EXPAND.
13013 if (Subtarget.hasVLX()) {
13014 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13015 Mask, Subtarget, DAG))
13018 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13019 V1, V2, DAG, Subtarget))
13023 // Try to use byte rotation instructions.
13024 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13025 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13028 // Try to create an in-lane repeating shuffle mask and then shuffle the
13029 // results into the target lanes.
13030 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13031 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13034 // If the shuffle patterns aren't repeated but it is a single input, directly
13035 // generate a cross-lane VPERMD instruction.
13036 if (V2.isUndef()) {
13037 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13038 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13041 // Assume that a single SHUFPS is faster than an alternative sequence of
13042 // multiple instructions (even if the CPU has a domain penalty).
13043 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13044 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13045 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13046 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13047 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13048 CastV1, CastV2, DAG);
13049 return DAG.getBitcast(MVT::v8i32, ShufPS);
13052 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13053 // shuffle.
13054 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13055 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13056 return Result;
13058 // Otherwise fall back on generic blend lowering.
13059 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13060 Mask, DAG);
13061 }
13063 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13065 /// This routine is only called when we have AVX2 and thus a reasonable
13066 /// instruction set for v16i16 shuffling.
13067 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13068 const APInt &Zeroable,
13069 SDValue V1, SDValue V2,
13070 const X86Subtarget &Subtarget,
13071 SelectionDAG &DAG) {
13072 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13073 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13074 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13075 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13077 // Whenever we can lower this as a zext, that instruction is strictly faster
13078 // than any alternative. It also allows us to fold memory operands into the
13079 // shuffle in many cases.
13080 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13081 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13084 // Check for being able to broadcast a single element.
13085 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13086 Mask, Subtarget, DAG))
13089 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13090 Zeroable, Subtarget, DAG))
13093 // Use dedicated unpack instructions for masks that match their pattern.
13094 if (SDValue V =
13095 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13096 return V;
13098 // Try to use shift instructions.
13099 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13100 Zeroable, Subtarget, DAG))
13101 return Shift;
13103 // Try to use byte rotation instructions.
13104 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13105 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13106 return Rotate;
13108 // Try to create an in-lane repeating shuffle mask and then shuffle the
13109 // results into the target lanes.
13110 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13111 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13112 return V;
13114 if (V2.isUndef()) {
13115 // There are no generalized cross-lane shuffle operations available on i16
13116 // element types.
13117 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13118 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13119 Mask, DAG);
13121 SmallVector<int, 8> RepeatedMask;
13122 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13123 // As this is a single-input shuffle, the repeated mask should be
13124 // a strictly valid v8i16 mask that we can pass through to the v8i16
13125 // lowering to handle even the v16 case.
13126 return lowerV8I16GeneralSingleInputVectorShuffle(
13127 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13131 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13132 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13135 // AVX512BWVL can lower to VPERMW.
13136 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13137 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13139 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13140 // shuffle.
13141 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13142 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13143 return Result;
13145 // Otherwise fall back on generic lowering.
13146 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13147 }
13149 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13151 /// This routine is only called when we have AVX2 and thus a reasonable
13152 /// instruction set for v32i8 shuffling.
13153 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13154 const APInt &Zeroable,
13155 SDValue V1, SDValue V2,
13156 const X86Subtarget &Subtarget,
13157 SelectionDAG &DAG) {
13158 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13159 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13160 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13161 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13163 // Whenever we can lower this as a zext, that instruction is strictly faster
13164 // than any alternative. It also allows us to fold memory operands into the
13165 // shuffle in many cases.
13166 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13167 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13170 // Check for being able to broadcast a single element.
13171 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13172 Mask, Subtarget, DAG))
13175 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13176 Zeroable, Subtarget, DAG))
13179 // Use dedicated unpack instructions for masks that match their pattern.
13180 if (SDValue V =
13181 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13182 return V;
13184 // Try to use shift instructions.
13185 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13186 Zeroable, Subtarget, DAG))
13187 return Shift;
13189 // Try to use byte rotation instructions.
13190 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13191 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13192 return Rotate;
13194 // Try to create an in-lane repeating shuffle mask and then shuffle the
13195 // results into the target lanes.
13196 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13197 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13198 return V;
13200 // There are no generalized cross-lane shuffle operations available on i8
13201 // element types.
13202 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13203 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13204 DAG);
13206 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13207 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13210 // Try to simplify this by merging 128-bit lanes to enable a lane-based
13211 // shuffle.
13212 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13213 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13214 return Result;
13216 // Otherwise fall back on generic lowering.
13217 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13218 }
13220 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13222 /// This routine either breaks down the specific type of a 256-bit x86 vector
13223 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13224 /// together based on the available instructions.
13225 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13226 MVT VT, SDValue V1, SDValue V2,
13227 const APInt &Zeroable,
13228 const X86Subtarget &Subtarget,
13229 SelectionDAG &DAG) {
13230 // If we have a single input to the zero element, insert that into V1 if we
13231 // can do so cheaply.
13232 int NumElts = VT.getVectorNumElements();
13233 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13235 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13236 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13237 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13240 // Handle special cases where the lower or upper half is UNDEF.
13241 if (SDValue V =
13242 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13243 return V;
13245 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13246 // can check for those subtargets here and avoid much of the subtarget
13247 // querying in the per-vector-type lowering routines. With AVX1 we have
13248 // essentially *zero* ability to manipulate a 256-bit vector with integer
13249 // types. Since we'll use floating point types there eventually, just
13250 // immediately cast everything to a float and operate entirely in that domain.
13251 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13252 int ElementBits = VT.getScalarSizeInBits();
13253 if (ElementBits < 32) {
13254 // No floating point type available, if we can't use the bit operations
13255 // for masking/blending then decompose into 128-bit vectors.
13256 if (SDValue V =
13257 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13258 return V;
13259 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13260 return V;
13261 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13264 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13265 VT.getVectorNumElements());
13266 V1 = DAG.getBitcast(FpVT, V1);
13267 V2 = DAG.getBitcast(FpVT, V2);
13268 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13269 }
13271 switch (VT.SimpleTy) {
13272 case MVT::v4f64:
13273 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13274 case MVT::v4i64:
13275 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13276 case MVT::v8f32:
13277 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13278 case MVT::v8i32:
13279 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13280 case MVT::v16i16:
13281 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13282 case MVT::v32i8:
13283 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13285 default:
13286 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13287 }
13288 }
13290 /// \brief Try to lower a vector shuffle as a 128-bit shuffles.
13291 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13292 ArrayRef<int> Mask, SDValue V1,
13293 SDValue V2, SelectionDAG &DAG) {
13294 assert(VT.getScalarSizeInBits() == 64 &&
13295 "Unexpected element type size for 128bit shuffle.");
13297 // Handling a 256-bit vector would require VLX, and most probably
13298 // lowerV2X128VectorShuffle() is the better solution there.
13299 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13301 SmallVector<int, 4> WidenedMask;
13302 if (!canWidenShuffleElements(Mask, WidenedMask))
13303 return SDValue();
13305 // Check for patterns which can be matched with a single insert of a 256-bit
13306 // subvector.
13307 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13308 {0, 1, 2, 3, 0, 1, 2, 3});
13309 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13310 {0, 1, 2, 3, 8, 9, 10, 11})) {
13311 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13312 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13313 DAG.getIntPtrConstant(0, DL));
13314 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13315 OnlyUsesV1 ? V1 : V2,
13316 DAG.getIntPtrConstant(0, DL));
13317 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13320 assert(WidenedMask.size() == 4);
13322 // See if this is an insertion of the lower 128-bits of V2 into V1.
13323 bool IsInsert = true;
13324 int V2Index = -1;
13325 for (int i = 0; i < 4; ++i) {
13326 assert(WidenedMask[i] >= -1);
13327 if (WidenedMask[i] < 0)
13328 continue;
13330 // Make sure all V1 subvectors are in place.
13331 if (WidenedMask[i] < 4) {
13332 if (WidenedMask[i] != i) {
13333 IsInsert = false;
13334 break;
13335 }
13336 } else {
13337 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13338 if (V2Index >= 0 || WidenedMask[i] != 4) {
13339 IsInsert = false;
13340 break;
13341 }
13342 V2Index = i;
13343 }
13344 }
13345 if (IsInsert && V2Index >= 0) {
13346 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13347 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13348 DAG.getIntPtrConstant(0, DL));
13349 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13352 // Try to lower to vshuf64x2/vshuf32x4.
13353 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13354 unsigned PermMask = 0;
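// Editorial example of the encoding built below: WidenedMask <2, 1, 4, 7>
// gives Ops = {V1, V2} and PermMask = 2 | (1 << 2) | (0 << 4) | (3 << 6)
// = 0xC6.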
13355 // Ensure all elements come from the same Op.
13356 for (int i = 0; i < 4; ++i) {
13357 assert(WidenedMask[i] >= -1);
13358 if (WidenedMask[i] < 0)
13359 continue;
13361 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13362 unsigned OpIndex = i / 2;
13363 if (Ops[OpIndex].isUndef())
13364 Ops[OpIndex] = Op;
13365 else if (Ops[OpIndex] != Op)
13366 return SDValue();
13368 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13369 // bits defined by a vshuf64x2 instruction's immediate control byte.
13370 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13373 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13374 DAG.getConstant(PermMask, DL, MVT::i8));
13377 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13378 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13379 const APInt &Zeroable,
13380 SDValue V1, SDValue V2,
13381 const X86Subtarget &Subtarget,
13382 SelectionDAG &DAG) {
13383 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13384 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13385 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13387 if (V2.isUndef()) {
13388 // Use low duplicate instructions for masks that match their pattern.
13389 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13390 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13392 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13393 // Non-half-crossing single input shuffles can be lowered with an
13394 // interleaved permutation.
13395 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13396 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13397 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13398 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13399 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13400 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13403 SmallVector<int, 4> RepeatedMask;
13404 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13405 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13406 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13409 if (SDValue Shuf128 =
13410 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13413 if (SDValue Unpck =
13414 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13417 // Check if the blend happens to exactly fit that of SHUFPD.
13418 if (SDValue Op =
13419 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13420 return Op;
13422 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13423 V2, DAG, Subtarget))
13426 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13427 Zeroable, Subtarget, DAG))
13430 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13433 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13434 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13435 const APInt &Zeroable,
13436 SDValue V1, SDValue V2,
13437 const X86Subtarget &Subtarget,
13438 SelectionDAG &DAG) {
13439 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13440 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13441 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13443 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13444 // options to efficiently lower the shuffle.
13445 SmallVector<int, 4> RepeatedMask;
13446 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13447 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13449 // Use even/odd duplicate instructions for masks that match their pattern.
13450 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13451 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13452 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13453 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13455 if (V2.isUndef())
13456 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13457 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13459 // Use dedicated unpack instructions for masks that match their pattern.
13460 if (SDValue Unpck =
13461 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13464 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13465 Zeroable, Subtarget, DAG))
13468 // Otherwise, fall back to a SHUFPS sequence.
13469 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13471 // If we have AVX512F support, we can use VEXPAND.
13472 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13473 V1, V2, DAG, Subtarget))
13476 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13479 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13480 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13481 const APInt &Zeroable,
13482 SDValue V1, SDValue V2,
13483 const X86Subtarget &Subtarget,
13484 SelectionDAG &DAG) {
13485 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13486 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13487 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13489 if (SDValue Shuf128 =
13490 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13493 if (V2.isUndef()) {
13494 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13495 // can use lower latency instructions that will operate on all four
13496 // 128-bit lanes.
13497 SmallVector<int, 2> Repeated128Mask;
13498 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13499 SmallVector<int, 4> PSHUFDMask;
13500 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13501 return DAG.getBitcast(
13502 MVT::v8i64,
13503 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13504 DAG.getBitcast(MVT::v16i32, V1),
13505 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13508 SmallVector<int, 4> Repeated256Mask;
13509 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13510 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13511 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13514 // Try to use shift instructions.
13515 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13516 Zeroable, Subtarget, DAG))
13519 // Try to use VALIGN.
13520 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13521 Mask, Subtarget, DAG))
13524 // Try to use PALIGNR.
13525 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13526 Mask, Subtarget, DAG))
13529 if (SDValue Unpck =
13530 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13532 // If we have AVX512F support, we can use VEXPAND.
13533 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13534 V2, DAG, Subtarget))
13537 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13538 Zeroable, Subtarget, DAG))
13541 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13544 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13545 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13546 const APInt &Zeroable,
13547 SDValue V1, SDValue V2,
13548 const X86Subtarget &Subtarget,
13549 SelectionDAG &DAG) {
13550 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13551 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13552 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13554 // Whenever we can lower this as a zext, that instruction is strictly faster
13555 // than any alternative. It also allows us to fold memory operands into the
13556 // shuffle in many cases.
13557 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13558 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13561 // If the shuffle mask is repeated in each 128-bit lane we can use more
13562 // efficient instructions that mirror the shuffles across the four 128-bit
13564 SmallVector<int, 4> RepeatedMask;
13565 bool Is128BitLaneRepeatedShuffle =
13566 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13567 if (Is128BitLaneRepeatedShuffle) {
13568 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13569 if (V2.isUndef())
13570 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13571 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13573 // Use dedicated unpack instructions for masks that match their pattern.
13574 if (SDValue V =
13575 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13576 return V;
13577 }
13579 // Try to use shift instructions.
13580 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13581 Zeroable, Subtarget, DAG))
13584 // Try to use VALIGN.
13585 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13586 Mask, Subtarget, DAG))
13589 // Try to use byte rotation instructions.
13590 if (Subtarget.hasBWI())
13591 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13592 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13595 // Assume that a single SHUFPS is faster than using a permv shuffle.
13596 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13597 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13598 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13599 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13600 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13601 CastV1, CastV2, DAG);
13602 return DAG.getBitcast(MVT::v16i32, ShufPS);
13604 // If we have AVX512F support, we can use VEXPAND.
13605 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13606 V1, V2, DAG, Subtarget))
13609 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13610 Zeroable, Subtarget, DAG))
13612 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13615 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13616 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13617 const APInt &Zeroable,
13618 SDValue V1, SDValue V2,
13619 const X86Subtarget &Subtarget,
13620 SelectionDAG &DAG) {
13621 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13622 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13623 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13624 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13626 // Whenever we can lower this as a zext, that instruction is strictly faster
13627 // than any alternative. It also allows us to fold memory operands into the
13628 // shuffle in many cases.
13629 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13630 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13633 // Use dedicated unpack instructions for masks that match their pattern.
13634 if (SDValue V =
13635 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13636 return V;
13638 // Try to use shift instructions.
13639 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13640 Zeroable, Subtarget, DAG))
13643 // Try to use byte rotation instructions.
13644 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13645 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13648 if (V2.isUndef()) {
13649 SmallVector<int, 8> RepeatedMask;
13650 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13651 // As this is a single-input shuffle, the repeated mask should be
13652 // a strictly valid v8i16 mask that we can pass through to the v8i16
13653 // lowering to handle even the v32 case.
13654 return lowerV8I16GeneralSingleInputVectorShuffle(
13655 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13659 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13660 Zeroable, Subtarget, DAG))
13663 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13666 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13667 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13668 const APInt &Zeroable,
13669 SDValue V1, SDValue V2,
13670 const X86Subtarget &Subtarget,
13671 SelectionDAG &DAG) {
13672 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13673 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13674 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13675 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13677 // Whenever we can lower this as a zext, that instruction is strictly faster
13678 // than any alternative. It also allows us to fold memory operands into the
13679 // shuffle in many cases.
13680 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13681 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13684 // Use dedicated unpack instructions for masks that match their pattern.
13685 if (SDValue V =
13686 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13687 return V;
13689 // Try to use shift instructions.
13690 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13691 Zeroable, Subtarget, DAG))
13692 return Shift;
13694 // Try to use byte rotation instructions.
13695 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13696 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13697 return Rotate;
13699 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13700 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13701 return PSHUFB;
13703 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13704 if (Subtarget.hasVBMI())
13705 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13707 // Try to create an in-lane repeating shuffle mask and then shuffle the
13708 // results into the target lanes.
13709 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13710 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13713 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13714 Zeroable, Subtarget, DAG))
13717 // FIXME: Implement direct support for this type!
13718 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13721 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13723 /// This routine either breaks down the specific type of a 512-bit x86 vector
13724 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13725 /// together based on the available instructions.
13726 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13727 MVT VT, SDValue V1, SDValue V2,
13728 const APInt &Zeroable,
13729 const X86Subtarget &Subtarget,
13730 SelectionDAG &DAG) {
13731 assert(Subtarget.hasAVX512() &&
13732 "Cannot lower 512-bit vectors w/ basic ISA!");
13734 // If we have a single input to the zero element, insert that into V1 if we
13735 // can do so cheaply.
13736 int NumElts = Mask.size();
13737 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13739 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13740 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13741 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13744 // Check for being able to broadcast a single element.
13745 if (SDValue Broadcast =
13746 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13749 // Dispatch to each element type for lowering. If we don't have support for
13750 // specific element type shuffles at 512 bits, immediately split them and
13751 // lower them. Each lowering routine of a given type is allowed to assume that
13752 // the requisite ISA extensions for that element type are available.
13753 switch (VT.SimpleTy) {
13754 case MVT::v8f64:
13755 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13756 case MVT::v16f32:
13757 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13758 case MVT::v8i64:
13759 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13760 case MVT::v16i32:
13761 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13762 case MVT::v32i16:
13763 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13764 case MVT::v64i8:
13765 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13767 default:
13768 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13769 }
13770 }
13772 // Lower vXi1 vector shuffles.
13773 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13774 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13775 // vector, shuffle it, and then truncate it back.
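// Editorial sketch of that strategy (types chosen for illustration):
//   t1 = sign_extend(V1) : v16i32; t2 = sign_extend(V2) : v16i32
//   s = vector_shuffle(t1, t2, Mask); result = CVT2MASK(s) or truncate(s)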
13776 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13777 MVT VT, SDValue V1, SDValue V2,
13778 const X86Subtarget &Subtarget,
13779 SelectionDAG &DAG) {
13780 assert(Subtarget.hasAVX512() &&
13781 "Cannot lower 512-bit vectors w/o basic ISA!");
13782 MVT ExtVT;
13783 switch (VT.SimpleTy) {
13784 default:
13785 llvm_unreachable("Expected a vector of i1 elements");
13786 case MVT::v2i1:
13787 ExtVT = MVT::v2i64;
13788 break;
13789 case MVT::v4i1:
13790 ExtVT = MVT::v4i32;
13791 break;
13792 case MVT::v8i1:
13793 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13794 break;
13795 case MVT::v16i1:
13796 ExtVT = MVT::v16i32;
13797 break;
13798 case MVT::v32i1:
13799 ExtVT = MVT::v32i16;
13800 break;
13801 case MVT::v64i1:
13802 ExtVT = MVT::v64i8;
13803 break;
13804 }
13806 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13807 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13808 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13809 V1 = getOnesVector(ExtVT, DAG, DL);
13810 else
13811 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13813 if (V2.isUndef())
13814 V2 = DAG.getUNDEF(ExtVT);
13815 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13816 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13817 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13818 V2 = getOnesVector(ExtVT, DAG, DL);
13820 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13822 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13823 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK to convert back.
13824 int NumElems = VT.getVectorNumElements();
13825 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13826 (Subtarget.hasDQI() && (NumElems < 32)))
13827 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13829 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13830 }
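// Illustrative sketch (not upstream code): for a v8i1 shuffle with
// Mask = <1,0,3,2,5,4,7,6> on KNL, the lowering above produces roughly:
//   %ext1 = sign_extend %v1 to v8i64      ; each i1 becomes 0 or -1
//   %ext2 = sign_extend %v2 to v8i64
//   %shuf = shufflevector %ext1, %ext2, <1,0,3,2,5,4,7,6>
//   %res  = truncate %shuf to v8i1        ; or X86ISD::CVT2MASK with DQI/BWI
// The shuffle itself runs on ordinary 512-bit vector units; only the
// widening and narrowing steps touch the k-registers.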
13832 /// Helper function that returns true if the shuffle mask should be
13833 /// commuted to improve canonicalization.
13834 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13835 int NumElements = Mask.size();
13837 int NumV1Elements = 0, NumV2Elements = 0;
13838 for (int M : Mask)
13839 if (M < 0)
13840 continue;
13841 else if (M < NumElements)
13842 ++NumV1Elements;
13843 else
13844 ++NumV2Elements;
13846 // Commute the shuffle as needed such that more elements come from V1 than
13847 // V2. This allows us to match the shuffle pattern strictly on how many
13848 // elements come from V1 without handling the symmetric cases.
13849 if (NumV2Elements > NumV1Elements)
13850 return true;
13852 assert(NumV1Elements > 0 && "No V1 indices");
13854 if (NumV2Elements == 0)
13855 return false;
13857 // When the number of V1 and V2 elements are the same, try to minimize the
13858 // number of uses of V2 in the low half of the vector. When that is tied,
13859 // ensure that the sum of indices for V1 is equal to or lower than the sum
13860 // of indices for V2. When those are equal, try to ensure that the number of odd
13861 // indices for V1 is lower than the number of odd indices for V2.
13862 if (NumV1Elements == NumV2Elements) {
13863 int LowV1Elements = 0, LowV2Elements = 0;
13864 for (int M : Mask.slice(0, NumElements / 2))
13865 if (M >= NumElements)
13866 ++LowV2Elements;
13867 else if (M >= 0)
13868 ++LowV1Elements;
13869 if (LowV2Elements > LowV1Elements)
13870 return true;
13871 if (LowV2Elements == LowV1Elements) {
13872 int SumV1Indices = 0, SumV2Indices = 0;
13873 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13874 if (Mask[i] >= NumElements)
13875 SumV2Indices += i;
13876 else if (Mask[i] >= 0)
13877 SumV1Indices += i;
13878 if (SumV2Indices < SumV1Indices)
13879 return true;
13880 if (SumV2Indices == SumV1Indices) {
13881 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13882 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13883 if (Mask[i] >= NumElements)
13884 NumV2OddIndices += i % 2;
13885 else if (Mask[i] >= 0)
13886 NumV1OddIndices += i % 2;
13887 if (NumV2OddIndices < NumV1OddIndices)
13888 return true;
13889 }
13890 }
13891 }
13893 return false;
13894 }
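// Illustrative sketch (not upstream code): with 4 elements, mask indices
// 0..3 select from V1 and 4..7 from V2. For Mask = <4,5,6,3>,
// NumV2Elements (3) exceeds NumV1Elements (1), so this returns true and the
// caller commutes the operands, rewriting the mask to <0,1,2,7>. The
// tie-breaking rules (low-half uses, index sums, odd-index counts) only run
// when both inputs contribute the same number of elements.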
13896 /// \brief Top-level lowering for x86 vector shuffles.
13898 /// This handles decomposition, canonicalization, and lowering of all x86
13899 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13900 /// above in helper routines. The canonicalization attempts to widen shuffles
13901 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13902 /// s.t. only one of the two inputs needs to be tested, etc.
13903 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13904 SelectionDAG &DAG) {
13905 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13906 ArrayRef<int> Mask = SVOp->getMask();
13907 SDValue V1 = Op.getOperand(0);
13908 SDValue V2 = Op.getOperand(1);
13909 MVT VT = Op.getSimpleValueType();
13910 int NumElements = VT.getVectorNumElements();
13911 SDLoc DL(Op);
13912 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13914 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13915 "Can't lower MMX shuffles");
13917 bool V1IsUndef = V1.isUndef();
13918 bool V2IsUndef = V2.isUndef();
13919 if (V1IsUndef && V2IsUndef)
13920 return DAG.getUNDEF(VT);
13922 // When we create a shuffle node we put the UNDEF node to second operand,
13923 // but in some cases the first operand may be transformed to UNDEF.
13924 // In this case we should just commute the node.
13925 if (V1IsUndef)
13926 return DAG.getCommutedVectorShuffle(*SVOp);
13928 // Check for non-undef masks pointing at an undef vector and make the masks
13929 // undef as well. This makes it easier to match the shuffle based solely on
13930 // the mask.
13931 if (V2IsUndef)
13932 for (int M : Mask)
13933 if (M >= NumElements) {
13934 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13935 for (int &M : NewMask)
13936 if (M >= NumElements)
13937 M = -1;
13938 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13939 }
13941 // Check for illegal shuffle mask element index values.
13942 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13943 assert(llvm::all_of(Mask,
13944 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13945 "Out of bounds shuffle index");
13947 // We actually see shuffles that are entirely re-arrangements of a set of
13948 // zero inputs. This mostly happens while decomposing complex shuffles into
13949 // simple ones. Directly lower these as a buildvector of zeros.
13950 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13951 if (Zeroable.isAllOnesValue())
13952 return getZeroVector(VT, Subtarget, DAG, DL);
13954 // Try to collapse shuffles into using a vector type with fewer elements but
13955 // wider element types. We cap this to not form integers or floating point
13956 // elements wider than 64 bits, but it might be interesting to form i128
13957 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
13958 SmallVector<int, 16> WidenedMask;
13959 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13960 canWidenShuffleElements(Mask, WidenedMask)) {
13961 MVT NewEltVT = VT.isFloatingPoint()
13962 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13963 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13964 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13965 // Make sure that the new vector type is legal. For example, v2f64 isn't
13966 // legal on SSE1.
13967 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13968 V1 = DAG.getBitcast(NewVT, V1);
13969 V2 = DAG.getBitcast(NewVT, V2);
13970 return DAG.getBitcast(
13971 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13972 }
13973 }
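// Illustrative sketch (not upstream code): a v4i32 shuffle with
// Mask = <0,1,4,5> moves two adjacent, pair-aligned element pairs, so
// canWidenShuffleElements succeeds and the node is rebuilt as a v2i64
// shuffle with WidenedMask = <0,2> over bitcast inputs:
//   bitcast (shuffle (bitcast %v1 to v2i64), (bitcast %v2 to v2i64), <0,2>)
// A mask like <0,2,4,6> cannot be widened because its pairs are not adjacent.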
13975 // Commute the shuffle if it will improve canonicalization.
13976 if (canonicalizeShuffleMaskWithCommute(Mask))
13977 return DAG.getCommutedVectorShuffle(*SVOp);
13979 // For each vector width, delegate to a specialized lowering routine.
13980 if (VT.is128BitVector())
13981 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13982 DAG);
13984 if (VT.is256BitVector())
13985 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13986 DAG);
13988 if (VT.is512BitVector())
13989 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
13990 DAG);
13992 if (Is1BitVector)
13993 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
13995 llvm_unreachable("Unimplemented!");
13996 }
13998 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
13999 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14000 const X86Subtarget &Subtarget,
14001 SelectionDAG &DAG) {
14002 SDValue Cond = Op.getOperand(0);
14003 SDValue LHS = Op.getOperand(1);
14004 SDValue RHS = Op.getOperand(2);
14005 SDLoc dl(Op);
14006 MVT VT = Op.getSimpleValueType();
14008 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14009 return SDValue();
14010 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14012 // Only non-legal VSELECTs reach this lowering, convert those into generic
14013 // shuffles and re-use the shuffle lowering path for blends.
14014 SmallVector<int, 32> Mask;
14015 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14016 SDValue CondElt = CondBV->getOperand(i);
14017 Mask.push_back(
14018 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14019 : -1);
14020 }
14021 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14022 }
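// Illustrative sketch (not upstream code): for a v4i32
//   vselect <i1 -1, i1 0, i1 -1, i1 0>, %lhs, %rhs
// the loop above builds Mask = <0,5,2,7>: a true lane i keeps LHS element i,
// a false lane selects RHS element i (encoded as i + Size), and a
// non-constant lane becomes -1 (undef).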
14024 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14025 // A vselect where all conditions and data are constants can be optimized into
14026 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14027 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14028 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14029 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14030 return SDValue();
14032 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14033 // with patterns on the mask registers on AVX-512.
14034 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14035 return Op;
14037 // Try to lower this to a blend-style vector shuffle. This can handle all
14038 // constant condition cases.
14039 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14040 return BlendOp;
14042 // Variable blends are only legal from SSE4.1 onward.
14043 if (!Subtarget.hasSSE41())
14044 return SDValue();
14046 SDLoc dl(Op);
14047 MVT VT = Op.getSimpleValueType();
14049 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14050 // into an i1 condition so that we can use the mask-based 512-bit blend
14051 // instructions.
14052 if (VT.getSizeInBits() == 512) {
14053 SDValue Cond = Op.getOperand(0);
14054 // The vNi1 condition case should be handled above as it can be trivially
14055 // lowered.
14056 assert(Cond.getValueType().getScalarSizeInBits() ==
14057 VT.getScalarSizeInBits() &&
14058 "Should have a size-matched integer condition!");
14059 // Build a mask by testing the condition against itself (tests for zero).
14060 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14061 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14062 // Now return a new VSELECT using the mask.
14063 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14064 }
14066 // Only some types will be legal on some subtargets. If we can emit a legal
14067 // VSELECT-matching blend, return Op, but if we need to expand, return a null
14068 // SDValue.
14069 switch (VT.SimpleTy) {
14070 default:
14071 // Most of the vector types have blends past SSE4.1.
14072 return Op;
14074 case MVT::v32i8:
14075 // The byte blends for AVX vectors were introduced only in AVX2.
14076 if (Subtarget.hasAVX2())
14077 return Op;
14079 return SDValue();
14081 case MVT::v8i16:
14082 case MVT::v16i16:
14083 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
14084 if (Subtarget.hasBWI() && Subtarget.hasVLX())
14085 return Op;
14087 // FIXME: We should custom lower this by fixing the condition and using i8
14088 // blends.
14089 return SDValue();
14090 }
14091 }
14093 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14094 MVT VT = Op.getSimpleValueType();
14095 SDLoc dl(Op);
14097 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14098 return SDValue();
14100 if (VT.getSizeInBits() == 8) {
14101 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14102 Op.getOperand(0), Op.getOperand(1));
14103 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14104 DAG.getValueType(VT));
14105 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14106 }
14108 if (VT == MVT::f32) {
14109 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14110 // the result back to FR32 register. It's only worth matching if the
14111 // result has a single use which is a store or a bitcast to i32. And in
14112 // the case of a store, it's not worth it if the index is a constant 0,
14113 // because a MOVSSmr can be used instead, which is smaller and faster.
14114 if (!Op.hasOneUse())
14115 return SDValue();
14116 SDNode *User = *Op.getNode()->use_begin();
14117 if ((User->getOpcode() != ISD::STORE ||
14118 isNullConstant(Op.getOperand(1))) &&
14119 (User->getOpcode() != ISD::BITCAST ||
14120 User->getValueType(0) != MVT::i32))
14121 return SDValue();
14122 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14123 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14124 Op.getOperand(1));
14125 return DAG.getBitcast(MVT::f32, Extract);
14126 }
14128 if (VT == MVT::i32 || VT == MVT::i64) {
14129 // ExtractPS/pextrq works with constant index.
14130 if (isa<ConstantSDNode>(Op.getOperand(1)))
14131 return Op;
14132 }
14134 return SDValue();
14135 }
14137 /// Extract one bit from mask vector, like v16i1 or v8i1.
14138 /// AVX-512 feature.
14139 SDValue
14140 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14141 SDValue Vec = Op.getOperand(0);
14142 SDLoc dl(Vec);
14143 MVT VecVT = Vec.getSimpleValueType();
14144 SDValue Idx = Op.getOperand(1);
14145 MVT EltVT = Op.getSimpleValueType();
14147 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14148 "Unexpected vector type in ExtractBitFromMaskVector");
14150 // A variable index can't be handled in mask registers,
14151 // so extend the vector to VR512/VR128.
14152 if (!isa<ConstantSDNode>(Idx)) {
14153 unsigned NumElts = VecVT.getVectorNumElements();
14154 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
14155 // than extending to 128/256-bit.
14156 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14157 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14158 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14159 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14160 ExtVT.getVectorElementType(), Ext, Idx);
14161 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14162 }
14164 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14165 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14166 (VecVT.getVectorNumElements() < 8)) {
14167 // Use kshiftlw/rw instruction.
14168 VecVT = MVT::v16i1;
14169 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14170 DAG.getUNDEF(VecVT),
14171 Vec,
14172 DAG.getIntPtrConstant(0, dl));
14173 }
14174 unsigned MaxSift = VecVT.getVectorNumElements() - 1;
14175 if (MaxSift - IdxVal)
14176 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14177 DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
14178 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14179 DAG.getConstant(MaxSift, dl, MVT::i8));
14180 return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14181 DAG.getIntPtrConstant(0, dl));
14182 }
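// Illustrative sketch (not upstream code): extracting bit 3 of a v16i1 mask
// with the shift pair above (MaxSift = 15):
//   kshiftlw $12, %k0, %k0   ; MaxSift - IdxVal = 12, bit 3 -> bit 15
//   kshiftrw $15, %k0, %k0   ; bit 15 -> bit 0, all other bits cleared
// after which element 0 of the mask register holds the requested bit.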
14184 SDValue
14185 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14186 SelectionDAG &DAG) const {
14187 SDLoc dl(Op);
14188 SDValue Vec = Op.getOperand(0);
14189 MVT VecVT = Vec.getSimpleValueType();
14190 SDValue Idx = Op.getOperand(1);
14192 if (VecVT.getVectorElementType() == MVT::i1)
14193 return ExtractBitFromMaskVector(Op, DAG);
14195 if (!isa<ConstantSDNode>(Idx)) {
14196 // It's more profitable to go through memory (1 cycle throughput)
14197 // than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14198 // The IACA tool was used to get the performance estimates
14199 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14201 // example : extractelement <16 x i8> %a, i32 %i
14203 // Block Throughput: 3.00 Cycles
14204 // Throughput Bottleneck: Port5
14206 // | Num Of | Ports pressure in cycles | |
14207 // | Uops | 0 - DV | 5 | 6 | 7 | |
14208 // ---------------------------------------------
14209 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14210 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14211 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14212 // Total Num Of Uops: 4
14215 // Block Throughput: 1.00 Cycles
14216 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14218 // | | Ports pressure in cycles | |
14219 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14220 // ---------------------------------------------------------
14221 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14222 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14223 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14224 // Total Num Of Uops: 4
14226 return SDValue();
14227 }
14229 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14231 // If this is a 256-bit vector result, first extract the 128-bit vector and
14232 // then extract the element from the 128-bit vector.
14233 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14234 // Get the 128-bit vector.
14235 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14236 MVT EltVT = VecVT.getVectorElementType();
14238 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14239 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14241 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14242 // this can be done with a mask.
14243 IdxVal &= ElemsPerChunk - 1;
14244 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14245 DAG.getConstant(IdxVal, dl, MVT::i32));
14246 }
14248 assert(VecVT.is128BitVector() && "Unexpected vector length");
14250 MVT VT = Op.getSimpleValueType();
14252 if (VT.getSizeInBits() == 16) {
14253 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14254 // we're going to zero extend the register or fold the store (SSE41 only).
14255 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14256 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14257 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14258 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14259 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14261 // Transform it so it matches pextrw, which produces a 32-bit result.
14262 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14263 Op.getOperand(0), Op.getOperand(1));
14264 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14265 DAG.getValueType(VT));
14266 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14267 }
14269 if (Subtarget.hasSSE41())
14270 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14271 return Res;
14273 // TODO: We only extract a single element from v16i8, we can probably afford
14274 // to be more aggressive here before using the default approach of spilling to
14275 // stack.
14276 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14277 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14278 int DWordIdx = IdxVal / 4;
14279 if (DWordIdx == 0) {
14280 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14281 DAG.getBitcast(MVT::v4i32, Vec),
14282 DAG.getIntPtrConstant(DWordIdx, dl));
14283 int ShiftVal = (IdxVal % 4) * 8;
14284 if (ShiftVal != 0)
14285 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14286 DAG.getConstant(ShiftVal, dl, MVT::i32));
14287 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14288 }
14290 int WordIdx = IdxVal / 2;
14291 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14292 DAG.getBitcast(MVT::v8i16, Vec),
14293 DAG.getIntPtrConstant(WordIdx, dl));
14294 int ShiftVal = (IdxVal % 2) * 8;
14295 if (ShiftVal != 0)
14296 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14297 DAG.getConstant(ShiftVal, dl, MVT::i16));
14298 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14299 }
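// Illustrative sketch (not upstream code): extracting byte 5 of a v16i8
// takes the word path above: WordIdx = 2 selects the i16 holding bytes 4-5,
// ShiftVal = (5 % 2) * 8 = 8 shifts byte 5 into the low half, and the
// truncate to i8 discards byte 4. Bytes in the low dword use the i32 path
// instead, since the low i32 can be extracted with a plain movd.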
14301 if (VT.getSizeInBits() == 32) {
14302 if (IdxVal == 0)
14303 return Op;
14305 // SHUFPS the element to the lowest double word, then movss.
14306 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14307 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14308 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14309 DAG.getIntPtrConstant(0, dl));
14312 if (VT.getSizeInBits() == 64) {
14313 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14314 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14315 // to match extract_elt for f64.
14316 if (IdxVal == 0)
14317 return Op;
14319 // UNPCKHPD the element to the lowest double word, then movsd.
14320 // Note if the lower 64 bits of the result of the UNPCKHPD is then stored
14321 // to a f64mem, the whole operation is folded into a single MOVHPDmr.
14322 int Mask[2] = { 1, -1 };
14323 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14324 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14325 DAG.getIntPtrConstant(0, dl));
14326 }
14328 return SDValue();
14329 }
14331 /// Insert one bit to mask vector, like v16i1 or v8i1.
14332 /// AVX-512 feature.
14333 SDValue
14334 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14335 SDLoc dl(Op);
14336 SDValue Vec = Op.getOperand(0);
14337 SDValue Elt = Op.getOperand(1);
14338 SDValue Idx = Op.getOperand(2);
14339 MVT VecVT = Vec.getSimpleValueType();
14341 if (!isa<ConstantSDNode>(Idx)) {
14342 // Non constant index. Extend source and destination,
14343 // insert element and then truncate the result.
14344 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14345 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14346 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14347 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14348 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14349 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14350 }
14352 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14353 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14354 unsigned NumElems = VecVT.getVectorNumElements();
14356 if (Vec.isUndef()) {
14357 if (IdxVal)
14358 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14359 DAG.getConstant(IdxVal, dl, MVT::i8));
14360 return EltInVec;
14361 }
14363 // Insertion of one bit into first position
14364 if (IdxVal == 0) {
14365 // Clean top bits of vector.
14366 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14367 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14368 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14369 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14370 // Clean the first bit in source vector.
14371 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14372 DAG.getConstant(1, dl, MVT::i8));
14373 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14374 DAG.getConstant(1, dl, MVT::i8));
14376 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14377 }
14378 // Insertion of one bit into last position
14379 if (IdxVal == NumElems - 1) {
14380 // Move the bit to the last position inside the vector.
14381 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14382 DAG.getConstant(IdxVal, dl, MVT::i8));
14383 // Clean the last bit in the source vector.
14384 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14385 DAG.getConstant(1, dl, MVT::i8));
14386 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14387 DAG.getConstant(1, dl, MVT::i8));
14389 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14390 }
14392 // Use shuffle to insert element.
14393 SmallVector<int, 64> MaskVec(NumElems);
14394 for (unsigned i = 0; i != NumElems; ++i)
14395 MaskVec[i] = (i == IdxVal) ? NumElems : i;
14397 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14398 }
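// Illustrative sketch (not upstream code): inserting a bit at IdxVal = 2 of
// a v8i1 takes the shuffle path above. EltInVec holds the new bit in lane 0,
// so MaskVec = <0,1,8,3,4,5,6,7>: every lane keeps its old value except
// lane 2, which reads element 0 of EltInVec (index NumElems + 0 = 8).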
14400 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14401 SelectionDAG &DAG) const {
14402 MVT VT = Op.getSimpleValueType();
14403 MVT EltVT = VT.getVectorElementType();
14404 unsigned NumElts = VT.getVectorNumElements();
14406 if (EltVT == MVT::i1)
14407 return InsertBitToMaskVector(Op, DAG);
14409 SDLoc dl(Op);
14410 SDValue N0 = Op.getOperand(0);
14411 SDValue N1 = Op.getOperand(1);
14412 SDValue N2 = Op.getOperand(2);
14413 if (!isa<ConstantSDNode>(N2))
14414 return SDValue();
14415 auto *N2C = cast<ConstantSDNode>(N2);
14416 unsigned IdxVal = N2C->getZExtValue();
14418 bool IsZeroElt = X86::isZeroNode(N1);
14419 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14421 // If we are inserting an element, see if we can do this more efficiently with
14422 // a blend shuffle with a rematerializable vector than a costly integer
14423 // insertion.
14424 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14425 16 <= EltVT.getSizeInBits()) {
14426 SmallVector<int, 8> BlendMask;
14427 for (unsigned i = 0; i != NumElts; ++i)
14428 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14429 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14430 : DAG.getConstant(-1, dl, VT);
14431 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14432 }
14434 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14435 // into that, and then insert the subvector back into the result.
14436 if (VT.is256BitVector() || VT.is512BitVector()) {
14437 // With a 256-bit vector, we can insert into the zero element efficiently
14438 // using a blend if we have AVX or AVX2 and the right data type.
14439 if (VT.is256BitVector() && IdxVal == 0) {
14440 // TODO: It is worthwhile to cast integer to floating point and back
14441 // and incur a domain crossing penalty if that's what we'll end up
14442 // doing anyway after extracting to a 128-bit vector.
14443 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14444 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14445 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14446 N2 = DAG.getIntPtrConstant(1, dl);
14447 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14448 }
14449 }
14451 // Get the desired 128-bit vector chunk.
14452 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14454 // Insert the element into the desired chunk.
14455 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14456 assert(isPowerOf2_32(NumEltsIn128));
14457 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14458 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14460 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14461 DAG.getConstant(IdxIn128, dl, MVT::i32));
14463 // Insert the changed part back into the bigger vector
14464 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14465 }
14466 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14468 // Transform it so it match pinsr{b,w} which expects a GR32 as its second
14469 // argument. SSE41 required for pinsrb.
14470 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14471 unsigned Opc;
14472 if (VT == MVT::v8i16) {
14473 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14474 Opc = X86ISD::PINSRW;
14475 } else {
14476 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14477 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14478 Opc = X86ISD::PINSRB;
14479 }
14481 if (N1.getValueType() != MVT::i32)
14482 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14483 if (N2.getValueType() != MVT::i32)
14484 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14485 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14486 }
14488 if (Subtarget.hasSSE41()) {
14489 if (EltVT == MVT::f32) {
14490 // Bits [7:6] of the constant are the source select. This will always be
14491 // zero here. The DAG Combiner may combine an extract_elt index into
14492 // these bits. For example (insert (extract, 3), 2) could be matched by
14493 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14494 // Bits [5:4] of the constant are the destination select. This is the
14495 // value of the incoming immediate.
14496 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14497 // combine either bitwise AND or insert of float 0.0 to set these bits.
14499 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14500 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14501 // If this is an insertion of 32-bits into the low 32-bits of
14502 // a vector, we prefer to generate a blend with immediate rather
14503 // than an insertps. Blends are simpler operations in hardware and so
14504 // will always have equal or better performance than insertps.
14505 // But if optimizing for size and there's a load folding opportunity,
14506 // generate insertps because blendps does not have a 32-bit memory
14507 // operand.
14508 N2 = DAG.getIntPtrConstant(1, dl);
14509 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14510 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14511 }
14512 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14513 // Create this as a scalar to vector.
14514 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14515 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14516 }
14518 // PINSR* works with constant index.
14519 if (EltVT == MVT::i32 || EltVT == MVT::i64)
14520 return Op;
14521 }
14523 return SDValue();
14524 }
14526 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14527 SelectionDAG &DAG) {
14528 SDLoc dl(Op);
14529 MVT OpVT = Op.getSimpleValueType();
14531 // It's always cheaper to replace a xor+movd with xorps, and it also
14532 // simplifies further combines.
14533 if (X86::isZeroNode(Op.getOperand(0)))
14534 return getZeroVector(OpVT, Subtarget, DAG, dl);
14536 // If this is a 256-bit vector result, first insert into a 128-bit
14537 // vector and then insert into the 256-bit vector.
14538 if (!OpVT.is128BitVector()) {
14539 // Insert into a 128-bit vector.
14540 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14541 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14542 OpVT.getVectorNumElements() / SizeFactor);
14544 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14546 // Insert the 128-bit vector.
14547 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14548 }
14549 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14551 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14552 if (OpVT == MVT::v4i32)
14553 return Op;
14555 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14556 return DAG.getBitcast(
14557 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14558 }
14560 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14561 // a simple subregister reference or explicit instructions to grab
14562 // upper bits of a vector.
14563 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14564 SelectionDAG &DAG) {
14565 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14567 SDLoc dl(Op);
14568 SDValue In = Op.getOperand(0);
14569 SDValue Idx = Op.getOperand(1);
14570 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14571 MVT ResVT = Op.getSimpleValueType();
14573 assert((In.getSimpleValueType().is256BitVector() ||
14574 In.getSimpleValueType().is512BitVector()) &&
14575 "Can only extract from 256-bit or 512-bit vectors");
14577 // If the input is a buildvector just emit a smaller one.
14578 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14579 if (In.getOpcode() == ISD::BUILD_VECTOR)
14580 return DAG.getBuildVector(
14581 ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
14583 // Everything else is legal.
14584 return Op;
14585 }
14587 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14588 // simple superregister reference or explicit instructions to insert
14589 // the upper bits of a vector.
14590 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14591 SelectionDAG &DAG) {
14592 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14594 return insert1BitVector(Op, DAG, Subtarget);
14595 }
14597 // Returns the appropriate wrapper opcode for a global reference.
14598 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14599 // References to absolute symbols are never PC-relative.
14600 if (GV && GV->isAbsoluteSymbolRef())
14601 return X86ISD::Wrapper;
14603 CodeModel::Model M = getTargetMachine().getCodeModel();
14604 if (Subtarget.isPICStyleRIPRel() &&
14605 (M == CodeModel::Small || M == CodeModel::Kernel))
14606 return X86ISD::WrapperRIP;
14608 return X86ISD::Wrapper;
14609 }
14611 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14612 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14613 // one of the above mentioned nodes. It has to be wrapped because otherwise
14614 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14615 // be used to form an addressing mode. These wrapped nodes will be selected
14616 // into MOV32ri.
14617 SDValue
14618 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14619 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14621 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14622 // global base reg.
14623 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14625 auto PtrVT = getPointerTy(DAG.getDataLayout());
14626 SDValue Result = DAG.getTargetConstantPool(
14627 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14628 SDLoc DL(CP);
14629 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14630 // With PIC, the address is actually $g + Offset.
14631 if (OpFlag) {
14632 Result =
14633 DAG.getNode(ISD::ADD, DL, PtrVT,
14634 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14635 }
14637 return Result;
14638 }
14640 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14641 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14643 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14644 // global base reg.
14645 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14647 auto PtrVT = getPointerTy(DAG.getDataLayout());
14648 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14649 SDLoc DL(JT);
14650 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14652 // With PIC, the address is actually $g + Offset.
14653 if (OpFlag)
14654 Result =
14655 DAG.getNode(ISD::ADD, DL, PtrVT,
14656 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14658 return Result;
14659 }
14661 SDValue
14662 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14663 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14665 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14666 // global base reg.
14667 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14668 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14670 auto PtrVT = getPointerTy(DAG.getDataLayout());
14671 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14673 SDLoc DL(Op);
14674 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14676 // With PIC, the address is actually $g + Offset.
14677 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14678 Result =
14679 DAG.getNode(ISD::ADD, DL, PtrVT,
14680 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14681 }
14683 // For symbols that require a load from a stub to get the address, emit the
14684 // load.
14685 if (isGlobalStubReference(OpFlag))
14686 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14687 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14689 return Result;
14690 }
14692 SDValue
14693 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14694 // Create the TargetBlockAddressAddress node.
14695 unsigned char OpFlags =
14696 Subtarget.classifyBlockAddressReference();
14697 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14698 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14699 SDLoc dl(Op);
14700 auto PtrVT = getPointerTy(DAG.getDataLayout());
14701 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14702 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14704 // With PIC, the address is actually $g + Offset.
14705 if (isGlobalRelativeToPICBase(OpFlags)) {
14706 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14707 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14708 }
14710 return Result;
14711 }
14713 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14714 const SDLoc &dl, int64_t Offset,
14715 SelectionDAG &DAG) const {
14716 // Create the TargetGlobalAddress node, folding in the constant
14717 // offset if it is legal.
14718 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14719 CodeModel::Model M = DAG.getTarget().getCodeModel();
14720 auto PtrVT = getPointerTy(DAG.getDataLayout());
14721 SDValue Result;
14722 if (OpFlags == X86II::MO_NO_FLAG &&
14723 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14724 // A direct static reference to a global.
14725 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14726 Offset = 0;
14727 } else {
14728 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14729 }
14731 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14733 // With PIC, the address is actually $g + Offset.
14734 if (isGlobalRelativeToPICBase(OpFlags)) {
14735 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14736 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14737 }
14739 // For globals that require a load from a stub to get the address, emit the
14740 // load.
14741 if (isGlobalStubReference(OpFlags))
14742 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14743 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14745 // If there was a non-zero offset that we didn't fold, create an explicit
14746 // addition for it.
14747 if (Offset != 0)
14748 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14749 DAG.getConstant(Offset, dl, PtrVT));
14751 return Result;
14752 }
14754 SDValue
14755 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14756 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14757 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14758 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14759 }
14761 static SDValue
14762 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14763 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14764 unsigned char OperandFlags, bool LocalDynamic = false) {
14765 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14766 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14767 SDLoc dl(GA);
14768 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14769 GA->getValueType(0),
14770 GA->getOffset(),
14771 OperandFlags);
14773 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14774 : X86ISD::TLSADDR;
14776 if (InFlag) {
14777 SDValue Ops[] = { Chain, TGA, *InFlag };
14778 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14779 } else {
14780 SDValue Ops[] = { Chain, TGA };
14781 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14782 }
14784 // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
14785 MFI.setAdjustsStack(true);
14786 MFI.setHasCalls(true);
14788 SDValue Flag = Chain.getValue(1);
14789 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14790 }
14792 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14793 static SDValue
14794 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14795 const EVT PtrVT) {
14796 SDValue InFlag;
14797 SDLoc dl(GA); // ? function entry point might be better
14798 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14799 DAG.getNode(X86ISD::GlobalBaseReg,
14800 SDLoc(), PtrVT), InFlag);
14801 InFlag = Chain.getValue(1);
14803 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14804 }
14806 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14807 static SDValue
14808 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14809 const EVT PtrVT) {
14810 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14811 X86::RAX, X86II::MO_TLSGD);
14812 }
14814 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14815 SelectionDAG &DAG,
14816 const EVT PtrVT,
14817 bool is64Bit) {
14818 SDLoc dl(GA);
14820 // Get the start address of the TLS block for this module.
14821 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14822 .getInfo<X86MachineFunctionInfo>();
14823 MFI->incNumLocalDynamicTLSAccesses();
14825 SDValue Base;
14826 if (is64Bit) {
14827 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14828 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14829 } else {
14830 SDValue InFlag;
14831 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14832 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14833 InFlag = Chain.getValue(1);
14834 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14835 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14836 }
14838 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
14839 // of Base.
14841 // Build x@dtpoff.
14842 unsigned char OperandFlags = X86II::MO_DTPOFF;
14843 unsigned WrapperKind = X86ISD::Wrapper;
14844 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14845 GA->getValueType(0),
14846 GA->getOffset(), OperandFlags);
14847 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14849 // Add x@dtpoff with the base.
14850 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14851 }
14853 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14854 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14855 const EVT PtrVT, TLSModel::Model model,
14856 bool is64Bit, bool isPIC) {
14857 SDLoc dl(GA);
14859 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14860 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14861 is64Bit ? 257 : 256));
14863 SDValue ThreadPointer =
14864 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14865 MachinePointerInfo(Ptr));
14867 unsigned char OperandFlags = 0;
14868 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14869 // initial-exec.
14870 unsigned WrapperKind = X86ISD::Wrapper;
14871 if (model == TLSModel::LocalExec) {
14872 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14873 } else if (model == TLSModel::InitialExec) {
14874 if (is64Bit) {
14875 OperandFlags = X86II::MO_GOTTPOFF;
14876 WrapperKind = X86ISD::WrapperRIP;
14877 } else {
14878 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14879 }
14880 } else {
14881 llvm_unreachable("Unexpected model");
14882 }
14884 // emit "addl x@ntpoff,%eax" (local exec)
14885 // or "addl x@indntpoff,%eax" (initial exec)
14886 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14888 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14889 GA->getOffset(), OperandFlags);
14890 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14892 if (model == TLSModel::InitialExec) {
14893 if (isPIC && !is64Bit) {
14894 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14895 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14896 Offset);
14897 }
14899 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14900 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14901 }
14903 // The address of the thread local variable is the add of the thread
14904 // pointer with the offset of the variable.
14905 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14906 }
14908 SDValue
14909 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14911 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14913 if (DAG.getTarget().Options.EmulatedTLS)
14914 return LowerToTLSEmulatedModel(GA, DAG);
14916 const GlobalValue *GV = GA->getGlobal();
14917 auto PtrVT = getPointerTy(DAG.getDataLayout());
14918 bool PositionIndependent = isPositionIndependent();
14920 if (Subtarget.isTargetELF()) {
14921 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14922 switch (model) {
14923 case TLSModel::GeneralDynamic:
14924 if (Subtarget.is64Bit())
14925 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14926 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14927 case TLSModel::LocalDynamic:
14928 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14929 Subtarget.is64Bit());
14930 case TLSModel::InitialExec:
14931 case TLSModel::LocalExec:
14932 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14933 PositionIndependent);
14934 }
14935 llvm_unreachable("Unknown TLS model.");
14936 }
14938 if (Subtarget.isTargetDarwin()) {
14939 // Darwin only has one model of TLS. Lower to that.
14940 unsigned char OpFlag = 0;
14941 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14942 X86ISD::WrapperRIP : X86ISD::Wrapper;
14944 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14945 // global base reg.
14946 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14947 if (PIC32)
14948 OpFlag = X86II::MO_TLVP_PIC_BASE;
14949 else
14950 OpFlag = X86II::MO_TLVP;
14951 SDLoc DL(Op);
14952 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14953 GA->getValueType(0),
14954 GA->getOffset(), OpFlag);
14955 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14957 // With PIC32, the address is actually $g + Offset.
14958 if (PIC32)
14959 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14960 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14961 Offset);
14963 // Lowering the machine isd will make sure everything is in the right
14964 // location.
14965 SDValue Chain = DAG.getEntryNode();
14966 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14967 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
14968 SDValue Args[] = { Chain, Offset };
14969 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
14970 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
14971 DAG.getIntPtrConstant(0, DL, true),
14972 Chain.getValue(1), DL);
14974 // TLSCALL will be codegen'ed as call. Inform MFI that function has calls.
14975 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14976 MFI.setAdjustsStack(true);
14978 // And our return value (tls address) is in the standard call return value
14979 // register.
14980 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
14981 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
14982 }
14984 if (Subtarget.isTargetKnownWindowsMSVC() ||
14985 Subtarget.isTargetWindowsItanium() ||
14986 Subtarget.isTargetWindowsGNU()) {
14987 // Just use the implicit TLS architecture
14988 // Need to generate something similar to:
14989 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
14990 // ; from TEB
14991 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
14992 // mov rcx, qword [rdx+rcx*8]
14993 // mov eax, .tls$:tlsvar
14994 // [rax+rcx] contains the address
14995 // Windows 64bit: gs:0x58
14996 // Windows 32bit: fs:__tls_array
14999 SDValue Chain = DAG.getEntryNode();
15001 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15002 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15003 // use its literal value of 0x2C.
15004 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15005 ? Type::getInt8PtrTy(*DAG.getContext(),
15006 256)
15007 : Type::getInt32PtrTy(*DAG.getContext(),
15008 257));
15010 SDValue TlsArray = Subtarget.is64Bit()
15011 ? DAG.getIntPtrConstant(0x58, dl)
15012 : (Subtarget.isTargetWindowsGNU()
15013 ? DAG.getIntPtrConstant(0x2C, dl)
15014 : DAG.getExternalSymbol("_tls_array", PtrVT));
15016 SDValue ThreadPointer =
15017 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15019 SDValue res;
15020 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15021 res = ThreadPointer;
15022 } else {
15023 // Load the _tls_index variable
15024 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15025 if (Subtarget.is64Bit())
15026 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15027 MachinePointerInfo(), MVT::i32);
15028 else
15029 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15031 auto &DL = DAG.getDataLayout();
15032 SDValue Scale =
15033 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15034 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15036 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15037 }
15039 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15041 // Get the offset of start of .tls section
15042 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15043 GA->getValueType(0),
15044 GA->getOffset(), X86II::MO_SECREL);
15045 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15047 // The address of the thread local variable is the add of the thread
15048 // pointer with the offset of the variable.
15049 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15050 }
15052 llvm_unreachable("TLS not implemented for this target.");
15053 }
15055 /// Lower SRA_PARTS and friends, which return two i32 values
15056 /// and take a 2 x i32 value to shift plus a shift amount.
15057 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15058 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15059 MVT VT = Op.getSimpleValueType();
15060 unsigned VTBits = VT.getSizeInBits();
15061 SDLoc dl(Op);
15062 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15063 SDValue ShOpLo = Op.getOperand(0);
15064 SDValue ShOpHi = Op.getOperand(1);
15065 SDValue ShAmt = Op.getOperand(2);
15066 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15067 // generic ISD nodes haven't. Insert an AND to be safe, it's optimized away
15068 // during isel.
15069 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15070 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15071 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15072 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15073 : DAG.getConstant(0, dl, VT);
15075 SDValue Tmp2, Tmp3;
15076 if (Op.getOpcode() == ISD::SHL_PARTS) {
15077 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15078 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15079 } else {
15080 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15081 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15082 }
15084 // If the shift amount is larger or equal than the width of a part we can't
15085 // rely on the results of shld/shrd. Insert a test and select the appropriate
15086 // values for large shift amounts.
15087 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15088 DAG.getConstant(VTBits, dl, MVT::i8));
15089 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15090 AndNode, DAG.getConstant(0, dl, MVT::i8));
15092 SDValue Hi, Lo;
15093 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15094 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15095 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15097 if (Op.getOpcode() == ISD::SHL_PARTS) {
15098 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15099 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15100 } else {
15101 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15102 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15103 }
15105 SDValue Ops[2] = { Lo, Hi };
15106 return DAG.getMergeValues(Ops, dl);
15107 }
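// Illustrative sketch (not upstream code): SRA_PARTS of an i64 split into
// 32-bit halves (Lo, Hi) by a variable amount Amt behaves as:
//   if ((Amt & 32) == 0)   // small shift, the shrd result is valid
//     Lo = shrd(Lo, Hi, Amt);   Hi = sar(Hi, Amt);
//   else                   // Amt >= 32, the CMOVs pick the fallback values
//     Lo = sar(Hi, Amt & 31);   Hi = sar(Hi, 31);
// e.g. Amt = 40 yields Lo = Hi >> 8 (arithmetic) and Hi = sign fill.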
15109 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15110 SelectionDAG &DAG) const {
15111 SDValue Src = Op.getOperand(0);
15112 MVT SrcVT = Src.getSimpleValueType();
15113 MVT VT = Op.getSimpleValueType();
15114 SDLoc dl(Op);
15116 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15117 if (SrcVT.isVector()) {
15118 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15119 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15120 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15121 DAG.getUNDEF(SrcVT)));
15122 }
15123 if (SrcVT.getVectorElementType() == MVT::i1) {
15124 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15125 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15126 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15127 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15128 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15129 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15130 }
15131 return SDValue();
15132 }
15134 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15135 "Unknown SINT_TO_FP to lower!");
15137 // These are really Legal; return the operand so the caller accepts it as
15138 // Legal.
15139 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15140 return Op;
15141 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15142 Subtarget.is64Bit()) {
15143 return Op;
15144 }
15146 SDValue ValueToStore = Op.getOperand(0);
15147 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15148 !Subtarget.is64Bit())
15149 // Bitcasting to f64 here allows us to do a single 64-bit store from
15150 // an SSE register, avoiding the store forwarding penalty that would come
15151 // with two 32-bit stores.
15152 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15154 unsigned Size = SrcVT.getSizeInBits()/8;
15155 MachineFunction &MF = DAG.getMachineFunction();
15156 auto PtrVT = getPointerTy(MF.getDataLayout());
15157 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15158 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15159 SDValue Chain = DAG.getStore(
15160 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15161 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15162 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15163 }
15165 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15166 SDValue StackSlot,
15167 SelectionDAG &DAG) const {
15168 // Build the FILD
15169 SDLoc DL(Op);
15170 SDVTList Tys;
15171 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15172 if (useSSE)
15173 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15174 else
15175 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15177 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15179 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15180 MachineMemOperand *MMO;
15181 if (FI) {
15182 int SSFI = FI->getIndex();
15183 MMO = DAG.getMachineFunction().getMachineMemOperand(
15184 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15185 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15186 } else {
15187 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15188 StackSlot = StackSlot.getOperand(1);
15189 }
15190 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15191 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15192 X86ISD::FILD, DL,
15193 Tys, Ops, SrcVT, MMO);
15195 if (useSSE) {
15196 Chain = Result.getValue(1);
15197 SDValue InFlag = Result.getValue(2);
15199 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15200 // shouldn't be necessary except that RFP cannot be live across
15201 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15202 MachineFunction &MF = DAG.getMachineFunction();
15203 unsigned SSFISize = Op.getValueSizeInBits()/8;
15204 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15205 auto PtrVT = getPointerTy(MF.getDataLayout());
15206 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15207 Tys = DAG.getVTList(MVT::Other);
15208 SDValue Ops[] = {
15209 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15210 };
15211 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15212 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15213 MachineMemOperand::MOStore, SSFISize, SSFISize);
15215 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15216 Ops, Op.getValueType(), MMO);
15217 Result = DAG.getLoad(
15218 Op.getValueType(), DL, Chain, StackSlot,
15219 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15220 }
15222 return Result;
15223 }
15225 /// 64-bit unsigned integer to double expansion.
15226 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15227 SelectionDAG &DAG) const {
15228 // This algorithm is not obvious. Here is what we're trying to output:
15229 /*
15230 movq %rax, %xmm0
15231 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15232 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15233 #ifdef __SSE3__
15234 haddpd %xmm0, %xmm0
15235 #else
15236 pshufd $0x4e, %xmm0, %xmm1
15237 addpd %xmm1, %xmm0
15238 #endif
15239 */
15241 SDLoc dl(Op);
15242 LLVMContext *Context = DAG.getContext();
15244 // Build some magic constants.
15245 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15246 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15247 auto PtrVT = getPointerTy(DAG.getDataLayout());
15248 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15250 SmallVector<Constant*,2> CV1;
15251 CV1.push_back(
15252 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15253 APInt(64, 0x4330000000000000ULL))));
15254 CV1.push_back(
15255 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15256 APInt(64, 0x4530000000000000ULL))));
15257 Constant *C1 = ConstantVector::get(CV1);
15258 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15260 // Load the 64-bit value into an XMM register.
15261 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15262 Op.getOperand(0));
15263 SDValue CLod0 =
15264 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15265 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15266 /* Alignment = */ 16);
15267 SDValue Unpck1 =
15268 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15270 SDValue CLod1 =
15271 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15272 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15273 /* Alignment = */ 16);
15274 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15275 // TODO: Are there any fast-math-flags to propagate here?
15276 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15278 SDValue Result;
15279 if (Subtarget.hasSSE3()) {
15280 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15281 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15282 } else {
15283 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15284 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15285 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15286 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15287 }
15289 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15290 DAG.getIntPtrConstant(0, dl));
15291 }
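// Illustrative scalar model of the trick above (a sketch, not upstream
// code; bit_cast stands in for the vector bit manipulation):
//   0x4330000000000000 is the double 2^52, 0x4530000000000000 is 2^84.
//   double dLo = bit_cast<double>(0x4330000000000000 | (x & 0xffffffff));
//   double dHi = bit_cast<double>(0x4530000000000000 | (x >> 32));
//   // dLo == 2^52 + lo exactly; dHi == 2^84 + hi * 2^32 exactly.
//   return (dLo - 0x1.0p52) + (dHi - 0x1.0p84);  // == lo + hi * 2^32
// The punpckldq builds both biased doubles at once, subpd removes both
// biases, and the final horizontal add performs the only rounding step.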
15293 /// 32-bit unsigned integer to float expansion.
15294 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15295 SelectionDAG &DAG) const {
15296 SDLoc dl(Op);
15297 // FP constant to bias correct the final result.
15298 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15299 MVT::f64);
15301 // Load the 32-bit value into an XMM register.
15302 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15303 Op.getOperand(0));
15305 // Zero out the upper parts of the register.
15306 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15308 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15309 DAG.getBitcast(MVT::v2f64, Load),
15310 DAG.getIntPtrConstant(0, dl));
15312 // Or the load with the bias.
15313 SDValue Or = DAG.getNode(
15314 ISD::OR, dl, MVT::v2i64,
15315 DAG.getBitcast(MVT::v2i64,
15316 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15317 DAG.getBitcast(MVT::v2i64,
15318 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15319 Or =
15320 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15321 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15323 // Subtract the bias.
15324 // TODO: Are there any fast-math-flags to propagate here?
15325 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15327 // Handle final rounding.
15328 MVT DestVT = Op.getSimpleValueType();
15330 if (DestVT.bitsLT(MVT::f64))
15331 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15332 DAG.getIntPtrConstant(0, dl));
15333 if (DestVT.bitsGT(MVT::f64))
15334 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15336 // Handle final rounding.
15337 return Sub;
15338 }
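// Illustrative scalar model of the bias trick above (a sketch, not
// upstream code): OR-ing the 32-bit value x into the mantissa of
// 0x4330000000000000 (the double 2^52) yields exactly 2^52 + x, because
// any x < 2^32 fits in the low mantissa bits. Subtracting the same bias
// gives x as an exact f64; only the optional FP_ROUND to f32 can round.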
15340 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15341 const X86Subtarget &Subtarget, SDLoc &DL) {
15342 if (Op.getSimpleValueType() != MVT::v2f64)
15343 return SDValue();
15345 SDValue N0 = Op.getOperand(0);
15346 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15348 // Legalize to v4i32 type.
15349 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15350 DAG.getUNDEF(MVT::v2i32));
15352 if (Subtarget.hasAVX512())
15353 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15355 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15356 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
15357 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15358 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15360 // Two to the power of half-word-size.
15361 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
  // Clear upper part of LO, lower HI.
  SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
  SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);

  SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
  fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
  SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);

  // Add the two halves.
  return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
}
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
  // The algorithm is the following:
  // #ifdef __SSE4_1__
  //   uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
  //   uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
  //                               (uint4) 0x53000000, 0xaa);
  // #else
  //   uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
  //   uint4 hi = (v >> 16) | (uint4) 0x53000000;
  // #endif
  //   float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  //   return (float4) lo + fhi;
  //
  // We shouldn't use it when unsafe-fp-math is enabled though: we might later
  // reassociate the two FADDs, and if we do that, the algorithm fails
  // spectacularly (PR24512).
  // FIXME: If we ever have some kind of Machine FMF, this should be marked
  // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
  // there's also the MachineCombiner reassociations happening on Machine IR.
  if (DAG.getTarget().Options.UnsafeFPMath)
    return SDValue();

  SDLoc DL(Op);
  SDValue V = Op->getOperand(0);
  MVT VecIntVT = V.getSimpleValueType();
  bool Is128 = VecIntVT == MVT::v4i32;
  MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
  // If we convert to something else than the supported type, e.g., to v4f64,
  // abort early.
  if (VecFloatVT != Op->getSimpleValueType(0))
    return SDValue();

  assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
         "Unsupported custom type");

  // In the #ifdef/#else code, we have in common:
  // - The vector of constants:
  // -- 0x4b000000
  // -- 0x53000000
  // - A shift:
  // -- v >> 16

  // Create the splat vector for 0x4b000000.
  SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
  // Create the splat vector for 0x53000000.
  SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);

  // Create the right shift.
  SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
  SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
  SDValue Low, High;
  if (Subtarget.hasSSE41()) {
    MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
    //     uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
    SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
    SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
    // Low will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
                      VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
    //     uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
    //                                 (uint4) 0x53000000, 0xaa);
    SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
    SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
    // High will be bitcasted right away, so do not bother bitcasting back to
    // its original type.
    High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
                       VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
  } else {
    SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
    //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
    SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
    Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);

    //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
    High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
  }
  // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
  SDValue VecCstFAdd = DAG.getConstantFP(
      APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
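  // Why these constants work: as an f32 bit pattern, lo = 0x4b000000 | lo16
  // is exactly 2^23 + lo16, and hi = 0x53000000 | hi16 is 2^39 + hi16 * 2^16.
  // 0xD3000080 is -(2^39 + 2^23), so FADD'ing it into hi cancels both biases:
  // (2^39 + hi16 * 2^16) - (2^39 + 2^23) + (2^23 + lo16) == hi16 * 2^16 + lo16.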
  //     float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
  SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue FHigh =
      DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
  //     return (float4) lo + fhi;
  SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
  return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  MVT SrcVT = N0.getSimpleValueType();
  SDLoc dl(Op);

  if (SrcVT.getVectorElementType() == MVT::i1) {
    if (SrcVT == MVT::v2i1)
      return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                         DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
    MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
  }

  switch (SrcVT.SimpleTy) {
  default:
    llvm_unreachable("Custom UINT_TO_FP is not supported!");
  case MVT::v4i8:
  case MVT::v4i16:
  case MVT::v8i8:
  case MVT::v8i16: {
    MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
  }
  case MVT::v2i32:
    return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
  case MVT::v4i32:
  case MVT::v8i32:
    return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
  case MVT::v16i8:
  case MVT::v16i16:
    assert(Subtarget.hasAVX512());
    return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
                       DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
  }
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
                                           SelectionDAG &DAG) const {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
  // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
  // the optimization here.
  if (DAG.SignBitIsZero(N0))
    return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);

  if (Op.getSimpleValueType().isVector())
    return lowerUINT_TO_FP_vec(Op, DAG);

  MVT SrcVT = N0.getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
      (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
    // Conversions from unsigned i32 to f32/f64 are legal,
    // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
    return Op;
  }

  if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i64(Op, DAG);
  if (SrcVT == MVT::i32 && X86ScalarSSEf64)
    return LowerUINT_TO_FP_i32(Op, DAG);
  if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
    return SDValue();

  // Make a 64-bit buffer, and use it to build an FILD.
  SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
  if (SrcVT == MVT::i32) {
    SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
    SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
                                  StackSlot, MachinePointerInfo());
    SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
                                  OffsetSlot, MachinePointerInfo());
    SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
    return Fild;
  }

  assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
  SDValue ValueToStore = Op.getOperand(0);
  if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
    // Bitcasting to f64 here allows us to do a single 64-bit store from
    // an SSE register, avoiding the store forwarding penalty that would come
    // with two 32-bit stores.
    ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
  SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
                               MachinePointerInfo());
  // For i64 source, we need to add the appropriate power of 2 if the input
  // was negative. This is the same as the optimization in
  // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
  // we must be careful to do the computation in x87 extended precision, not
  // in SSE. (The generic code can't know it's OK to do this, or how to.)
  int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
  MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
      MachineMemOperand::MOLoad, 8, 8);

  SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
  SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
  SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
                                         MVT::i64, MMO);

  APInt FF(32, 0x5F800000ULL);
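  // 0x5F800000 is 2^64 as an IEEE-754 single. FILD always interprets the
  // 64-bit buffer as *signed*, so an unsigned input with its top bit set was
  // read as x - 2^64; adding this fudge factor back corrects the result.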
  // Check whether the sign bit is set.
  SDValue SignSet = DAG.getSetCC(
      dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
      Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);

  // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
  SDValue FudgePtr = DAG.getConstantPool(
      ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);

  // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
  SDValue Zero = DAG.getIntPtrConstant(0, dl);
  SDValue Four = DAG.getIntPtrConstant(4, dl);
  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
  FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);

  // Load the value out, extending it from f32 to f80.
  // FIXME: Avoid the extend by constructing the right constant pool?
  SDValue Fudge = DAG.getExtLoad(
      ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
      MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
      /* Alignment = */ 4);
  // Extend everything to 80 bits to force it to be done on x87.
  // TODO: Are there any fast-math-flags to propagate here?
  SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
  return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
                     DAG.getIntPtrConstant(0, dl));
}
// If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
// is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
// just return an <SDValue(), SDValue()> pair.
// Otherwise it is assumed to be a conversion from one of f32, f64 or f80
// to i16, i32 or i64, and we lower it to a legal sequence.
// If lowered to the final integer result we return a <result, SDValue()> pair.
// Otherwise we lower it to a sequence ending with a FIST, return a
// <FIST, StackSlot> pair, and the caller is responsible for loading
// the final integer result from StackSlot.
std::pair<SDValue,SDValue>
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
                                   bool IsSigned, bool IsReplace) const {
  SDLoc DL(Op);

  EVT DstTy = Op.getValueType();
  EVT TheVT = Op.getOperand(0).getValueType();
  auto PtrVT = getPointerTy(DAG.getDataLayout());

  if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
    // f16 must be promoted before using the lowering in this routine.
    // fp128 does not use this lowering.
    return std::make_pair(SDValue(), SDValue());
  }

  // If using FIST to compute an unsigned i64, we'll need some fixup
  // to handle values above the maximum signed i64. A FIST is always
  // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
  bool UnsignedFixup = !IsSigned &&
                       DstTy == MVT::i64 &&
                       (!Subtarget.is64Bit() ||
                        !isScalarFPTypeInSSEReg(TheVT));

  if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
    // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
    // The low 32 bits of the fist result will have the correct uint32 result.
    assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
    DstTy = MVT::i64;
  }

  assert(DstTy.getSimpleVT() <= MVT::i64 &&
         DstTy.getSimpleVT() >= MVT::i16 &&
         "Unknown FP_TO_INT to lower!");

  // These are really Legal.
  if (DstTy == MVT::i32 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  if (Subtarget.is64Bit() &&
      DstTy == MVT::i64 &&
      isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
    return std::make_pair(SDValue(), SDValue());
  // We lower FP->int64 into FISTP64 followed by a load from a temporary
  // stack slot.
  MachineFunction &MF = DAG.getMachineFunction();
  unsigned MemSize = DstTy.getSizeInBits()/8;
  int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
  SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);

  unsigned Opc;
  switch (DstTy.getSimpleVT().SimpleTy) {
  default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
  case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
  case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
  case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
  }

  SDValue Chain = DAG.getEntryNode();
  SDValue Value = Op.getOperand(0);
  SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.

  if (UnsignedFixup) {
    //
    // Conversion to unsigned i64 is implemented with a select,
    // depending on whether the source value fits in the range
    // of a signed i64. Let Thresh be the FP equivalent of
    // 0x8000000000000000ULL.
    //
    //  Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
    //  FistSrc    = (Value < Thresh) ? Value : (Value - Thresh);
    //  Fist-to-mem64 FistSrc
    //  Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
    //  to XOR'ing the high 32 bits with Adjust.
    //
    // Being a power of 2, Thresh is exactly representable in all FP formats.
    // For X87 we'd like to use the smallest FP type for this constant, but
    // for DAG type consistency we have to match the FP operand type.

    APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
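    // 0x5f000000 is 2^63 as an IEEE-754 single, i.e. the smallest value that
    // no longer fits in a signed i64. For an input of 2^63 + 42, say, the
    // select converts 42 via FIST and the XOR with 0x80000000 re-adds 2^63.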
    LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
    bool LosesInfo = false;
    if (TheVT == MVT::f64)
      // The rounding mode is irrelevant as the conversion should be exact.
      Status = Thresh.convert(APFloat::IEEEdouble(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);
    else if (TheVT == MVT::f80)
      Status = Thresh.convert(APFloat::x87DoubleExtended(),
                              APFloat::rmNearestTiesToEven, &LosesInfo);

    assert(Status == APFloat::opOK && !LosesInfo &&
           "FP conversion should have been exact");

    SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);

    SDValue Cmp = DAG.getSetCC(DL,
                               getSetCCResultType(DAG.getDataLayout(),
                                                  *DAG.getContext(), TheVT),
                               Value, ThreshVal, ISD::SETLT);
    Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
                           DAG.getConstant(0, DL, MVT::i32),
                           DAG.getConstant(0x80000000, DL, MVT::i32));
    SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
    Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
                                              *DAG.getContext(), TheVT),
                       Value, ThreshVal, ISD::SETLT);
    Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
  }
  // FIXME This causes a redundant load/store if the SSE-class value is already
  // in memory, such as if it is on the call stack.
  if (isScalarFPTypeInSSEReg(TheVT)) {
    assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
    Chain = DAG.getStore(Chain, DL, Value, StackSlot,
                         MachinePointerInfo::getFixedStack(MF, SSFI));
    SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
    SDValue Ops[] = {
      Chain, StackSlot, DAG.getValueType(TheVT)
    };

    MachineMemOperand *MMO =
        MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                                MachineMemOperand::MOLoad, MemSize, MemSize);
    Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
    Chain = Value.getValue(1);
    SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
    StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
  }

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, MemSize, MemSize);
  if (UnsignedFixup) {
    //
    // Insert the FIST, load its result as two i32's,
    // and XOR the high i32 with Adjust.
    //
    SDValue FistOps[] = { Chain, Value, StackSlot };
    SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                           FistOps, DstTy, MMO);

    SDValue Low32 =
        DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
    SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);

    SDValue High32 =
        DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
    High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);

    if (Subtarget.is64Bit()) {
      // Join High32 and Low32 into a 64-bit result.
      // (High32 << 32) | Low32
      Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
      High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
      High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
                           DAG.getConstant(32, DL, MVT::i8));
      SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
      return std::make_pair(Result, SDValue());
    }

    SDValue ResultOps[] = { Low32, High32 };

    SDValue pair = IsReplace
      ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
      : DAG.getMergeValues(ResultOps, DL);
    return std::make_pair(pair, SDValue());
  }

  // Build the FP_TO_INT*_IN_MEM.
  SDValue Ops[] = { Chain, Value, StackSlot };
  SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
                                         Ops, DstTy, MMO);

  return std::make_pair(FIST, StackSlot);
}
static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);

  // Optimize vectors in AVX mode:
  //
  //   v8i16 -> v8i32
  //   Use vpunpcklwd for 4 lower elements  v8i16 -> v4i32.
  //   Use vpunpckhwd for 4 upper elements  v8i16 -> v4i32.
  //   Concat upper and lower parts.
  //
  //   v4i32 -> v4i64
  //   Use vpunpckldq for 4 lower elements  v4i32 -> v2i64.
  //   Use vpunpckhdq for 4 upper elements  v4i32 -> v2i64.
  //   Concat upper and lower parts.
  //
  if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
      ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
      ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VZEXT, dl, VT, In);

  SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
  SDValue Undef = DAG.getUNDEF(InVT);
  bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
  SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
  SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
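  // Unpacking interleaves In with the second operand, so pairing each element
  // with a zero lane is exactly a zero-extension of that half (and pairing
  // with undef suffices for any-extension, where the high bits don't matter).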
  MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
                             VT.getVectorNumElements()/2);

  OpLo = DAG.getBitcast(HVT, OpLo);
  OpHi = DAG.getBitcast(HVT, OpHi);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc DL(Op);
  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
    return DAG.getNode(X86ISD::VZEXT, DL, VT, In);

  if (InVT.getVectorElementType() != MVT::i1)
    return SDValue();

  // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue One =
      DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
  SDValue Zero =
      DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL,
                      ExtVT);

  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
  if (VT == ExtVT)
    return SelectedVal;
  return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  return SDValue();
}

static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
    return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);

  if (Subtarget.hasFp256())
    if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
      return Res;

  assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
         VT.getVectorNumElements() != SVT.getVectorNumElements());
  return SDValue();
}
/// Helper to recursively truncate vector elements in half with PACKSS.
/// It makes use of the fact that vector comparison results will be all-zeros
/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
/// within each 128-bit lane.
static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
                                               const SDLoc &DL,
                                               SelectionDAG &DAG,
                                               const X86Subtarget &Subtarget) {
  // Requires SSE2 but AVX512 has fast truncate.
  if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
    return SDValue();

  EVT SrcVT = In.getValueType();

  // No truncation required, we might get here due to recursive calls.
  if (SrcVT == DstVT)
    return In;

  // We only support vector truncation to 128bits or greater from a
  // 256bits or greater source.
  if ((DstVT.getSizeInBits() % 128) != 0)
    return SDValue();
  if ((SrcVT.getSizeInBits() % 256) != 0)
    return SDValue();

  unsigned NumElems = SrcVT.getVectorNumElements();
  assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
  assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
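  // PACKSS narrows each element with *signed saturation*. The only inputs fed
  // in here are comparison results, i.e. every element is 0 or -1, and both
  // of those values saturate to themselves in the narrower type, so each
  // PACKSS stage halves the element width without losing mask information.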
  EVT PackedSVT =
      EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);

  // Extract lower/upper subvectors.
  unsigned NumSubElts = NumElems / 2;
  unsigned SrcSizeInBits = SrcVT.getSizeInBits();
  SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
  SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);

  // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
  if (SrcVT.is256BitVector()) {
    Lo = DAG.getBitcast(MVT::v8i16, Lo);
    Hi = DAG.getBitcast(MVT::v8i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
    return DAG.getBitcast(DstVT, Res);
  }

  // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
  // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
  if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
    Lo = DAG.getBitcast(MVT::v16i16, Lo);
    Hi = DAG.getBitcast(MVT::v16i16, Hi);
    SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);

    // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
    // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
    Res = DAG.getBitcast(MVT::v4i64, Res);
    Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});

    if (DstVT.is256BitVector())
      return DAG.getBitcast(DstVT, Res);

    // If 512bit -> 128bit truncate another stage.
    EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
    Res = DAG.getBitcast(PackedVT, Res);
    return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
  }

  // Recursively pack lower/upper subvectors, concat result and pack again.
  assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
  EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
  Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
  Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);

  PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
  SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
  return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
                                  const X86Subtarget &Subtarget) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");

  // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
  unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
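  // Truncation to vXi1 keeps bit 0 of each element, but the mask-extraction
  // instructions used below (VPMOVB2M/VPMOVW2M, and TESTM of a value against
  // itself) look at the *sign* bit. Shifting each element left by width-1
  // moves the LSB into the MSB so those instructions yield the right mask.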
  if (InVT.getScalarSizeInBits() <= 16) {
    if (Subtarget.hasBWI()) {
      // legal, will go to VPMOVB2M, VPMOVW2M
      // Shift packed bytes not supported natively, bitcast to word
      MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
      SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
                                      DAG.getBitcast(ExtVT, In),
                                      DAG.getConstant(ShiftInx, DL, ExtVT));
      ShiftNode = DAG.getBitcast(InVT, ShiftNode);
      return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
    }
    // Use TESTD/Q, extended vector to packed dword/qword.
    assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
           "Unexpected vector type.");
    unsigned NumElts = InVT.getVectorNumElements();
    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
    In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
    InVT = ExtVT;
    ShiftInx = InVT.getScalarSizeInBits() - 1;
  }

  SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
                                  DAG.getConstant(ShiftInx, DL, InVT));
  return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
}
SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT InVT = In.getSimpleValueType();

  if (VT == MVT::i1) {
    assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
           "Invalid scalar TRUNCATE operation");
    if (InVT.getSizeInBits() >= 32)
      return SDValue();
    In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
    return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
  }
  assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
         "Invalid TRUNCATE operation");

  if (VT.getVectorElementType() == MVT::i1)
    return LowerTruncateVecI1(Op, DAG, Subtarget);

  // vpmovqb/w/d, vpmovdb/w, vpmovwb
  if (Subtarget.hasAVX512()) {
    // word to byte only under BWI
    if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
      return DAG.getNode(X86ISD::VTRUNC, DL, VT,
                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In,
                                        DAG));
    return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
  }

  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
    if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
      return V;
  if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
    // On AVX2, v4i64 -> v4i32 becomes VPERMD.
    if (Subtarget.hasInt256()) {
      static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
      In = DAG.getBitcast(MVT::v8i32, In);
      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                         DAG.getIntPtrConstant(0, DL));
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                               DAG.getIntPtrConstant(2, DL));
    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
    static const int ShufMask[] = {0, 2, 4, 6};
    return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
  }
  if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
    if (Subtarget.hasInt256()) {
      In = DAG.getBitcast(MVT::v32i8, In);

      // The PSHUFB mask:
      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
                                      -1, -1, -1, -1, -1, -1, -1, -1,
                                      16, 17, 20, 21, 24, 25, 28, 29,
                                      -1, -1, -1, -1, -1, -1, -1, -1 };
      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
      In = DAG.getBitcast(MVT::v4i64, In);

      static const int ShufMask2[] = {0, 2, -1, -1};
      In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
      In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                       DAG.getIntPtrConstant(0, DL));
      return DAG.getBitcast(VT, In);
    }

    SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(0, DL));
    SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
                               DAG.getIntPtrConstant(4, DL));

    OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
    OpHi = DAG.getBitcast(MVT::v16i8, OpHi);

    // The PSHUFB mask:
    static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};

    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);

    OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
    OpHi = DAG.getBitcast(MVT::v4i32, OpHi);

    // The MOVLHPS Mask:
    static const int ShufMask2[] = {0, 1, 4, 5};
    SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
    return DAG.getBitcast(MVT::v8i16, res);
  }
  // Handle truncation of V256 to V128 using shuffles.
  if (!VT.is128BitVector() || !InVT.is256BitVector())
    return SDValue();

  assert(Subtarget.hasFp256() && "256-bit vector without AVX!");

  unsigned NumElems = VT.getVectorNumElements();
  MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);

  SmallVector<int, 16> MaskVec(NumElems * 2, -1);
  // Prepare the truncation shuffle mask.
  for (unsigned i = 0; i != NumElems; ++i)
    MaskVec[i] = i * 2;
  In = DAG.getBitcast(NVT, In);
  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                     DAG.getIntPtrConstant(0, DL));
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
  bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = Op.getOperand(0);
    SDLoc dl(Op);
    if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                         DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                     DAG.getUNDEF(MVT::v2f32)));
    }
    return SDValue();
  }

  assert(!VT.isVector());

  std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
    IsSigned, /*IsReplace=*/ false);
  SDValue FIST = Vals.first, StackSlot = Vals.second;
  // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
  if (!FIST.getNode())
    return Op;

  if (StackSlot.getNode())
    // Load the result.
    return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());

  // The node is the result.
  return FIST;
}
static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
  SDLoc DL(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  MVT SVT = In.getSimpleValueType();

  assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");

  return DAG.getNode(X86ISD::VFPEXT, DL, VT,
                     DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
                                 In, DAG.getUNDEF(SVT)));
}
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
  assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
         "Wrong opcode for lowering FABS or FNEG.");

  bool IsFABS = (Op.getOpcode() == ISD::FABS);

  // If this is a FABS and it has an FNEG user, bail out to fold the combination
  // into an FNABS. We'll lower the FABS after that if it is still in use.
  if (IsFABS)
    for (SDNode *User : Op->uses())
      if (User->getOpcode() == ISD::FNEG)
        return Op;

  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  bool IsF128 = (VT == MVT::f128);

  // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
  // decide if we should generate a 16-byte constant mask when we only need 4 or
  // 8 bytes for the scalar case.

  MVT LogicVT;
  MVT EltVT;

  if (VT.isVector()) {
    LogicVT = VT;
    EltVT = VT.getVectorElementType();
  } else if (IsF128) {
    // SSE instructions are used for optimized f128 logical operations.
    LogicVT = MVT::f128;
    EltVT = VT;
  } else {
    // There are no scalar bitwise logical SSE/AVX instructions, so we
    // generate a 16-byte vector constant and logic op even for the scalar case.
    // Using a 16-byte mask allows folding the load of the mask with
    // the logic op, so it can save (~4 bytes) on code size.
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
    EltVT = VT;
  }

  unsigned EltBits = EltVT.getSizeInBits();
  // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
  APInt MaskElt =
      IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble() :
          (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
  SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);

  SDValue Op0 = Op.getOperand(0);
  bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
  unsigned LogicOp =
      IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
  SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;

  if (VT.isVector() || IsF128)
    return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);

  // For the scalar case extend to a 128-bit vector, perform the logic op,
  // and extract the scalar result back out.
  Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
  SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
                     DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue Mag = Op.getOperand(0);
  SDValue Sign = Op.getOperand(1);
  SDLoc dl(Op);

  // If the sign operand is smaller, extend it first.
  MVT VT = Op.getSimpleValueType();
  if (Sign.getSimpleValueType().bitsLT(VT))
    Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);

  // And if it is bigger, shrink it first.
  if (Sign.getSimpleValueType().bitsGT(VT))
    Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign,
                       DAG.getIntPtrConstant(1, dl));

  // At this point the operands and the result should have the same
  // type, and that won't be f80 since that is not custom lowered.
  bool IsF128 = (VT == MVT::f128);
  assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
          VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
          VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
         "Unexpected type in LowerFCOPYSIGN");

  MVT EltVT = VT.getScalarType();
  const fltSemantics &Sem =
      EltVT == MVT::f64 ? APFloat::IEEEdouble()
                        : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());

  // Perform all scalar logic operations as 16-byte vectors because there are
  // no scalar FP logic instructions in SSE.
  // TODO: This isn't necessary. If we used scalar types, we might avoid some
  // unnecessary splats, but we might miss load folding opportunities. Should
  // this decision be based on OptimizeForSize?
  bool IsFakeVector = !VT.isVector() && !IsF128;
  MVT LogicVT = VT;
  if (IsFakeVector)
    LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;

  // The mask constants are automatically splatted for vector types.
  unsigned EltSizeInBits = VT.getScalarSizeInBits();
  SDValue SignMask = DAG.getConstantFP(
      APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
  SDValue MagMask = DAG.getConstantFP(
      APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);

  // First, clear all bits but the sign bit from the second operand (sign).
  if (IsFakeVector)
    Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
  SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);

  // Next, clear the sign bit from the first operand (magnitude).
  // TODO: If we had general constant folding for FP logic ops, this check
  // wouldn't be necessary.
  SDValue MagBits;
  if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
    APFloat APF = Op0CN->getValueAPF();
    APF.clearSign();
    MagBits = DAG.getConstantFP(APF, dl, LogicVT);
  } else {
    // If the magnitude operand wasn't a constant, we need to AND out the sign.
    if (IsFakeVector)
      Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
    MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
  }

  // OR the magnitude value with the sign bit.
  SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
  return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
                                          DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
  SDValue N0 = Op.getOperand(0);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();

  MVT OpVT = N0.getSimpleValueType();
  assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
         "Unexpected type for FGETSIGN");

  // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
  MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
  SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
  Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
  Res = DAG.getZExtOrTrunc(Res, dl, VT);
  Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
  return Res;
}
// Check whether an OR'd tree is PTEST-able.
static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");

  if (!Subtarget.hasSSE41())
    return SDValue();

  if (!Op->hasOneUse())
    return SDValue();

  SDNode *N = Op.getNode();
  SDLoc DL(N);

  SmallVector<SDValue, 8> Opnds;
  DenseMap<SDValue, unsigned> VecInMap;
  SmallVector<SDValue, 8> VecIns;
  EVT VT = MVT::Other;

  // Recognize a special case where a vector is casted into wide integer to
  // test all 0s.
  Opnds.push_back(N->getOperand(0));
  Opnds.push_back(N->getOperand(1));

  for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
    SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
    // BFS traverse all OR'd operands.
    if (I->getOpcode() == ISD::OR) {
      Opnds.push_back(I->getOperand(0));
      Opnds.push_back(I->getOperand(1));
      // Re-evaluate the number of nodes to be traversed.
      e += 2; // 2 more nodes (LHS and RHS) are pushed.
      continue;
    }

    // Quit if a non-EXTRACT_VECTOR_ELT.
    if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
      return SDValue();

    // Quit if without a constant index.
    SDValue Idx = I->getOperand(1);
    if (!isa<ConstantSDNode>(Idx))
      return SDValue();

    SDValue ExtractedFromVec = I->getOperand(0);
    DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
    if (M == VecInMap.end()) {
      VT = ExtractedFromVec.getValueType();
      // Quit if not 128/256-bit vector.
      if (!VT.is128BitVector() && !VT.is256BitVector())
        return SDValue();
      // Quit if not the same type.
      if (VecInMap.begin() != VecInMap.end() &&
          VT != VecInMap.begin()->first.getValueType())
        return SDValue();
      M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
      VecIns.push_back(ExtractedFromVec);
    }
    M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
  }

  assert((VT.is128BitVector() || VT.is256BitVector()) &&
         "Not extracted from 128-/256-bit vector.");

  unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;

  for (DenseMap<SDValue, unsigned>::const_iterator
        I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
    // Quit if not all elements are used.
    if (I->second != FullMask)
      return SDValue();
  }

  MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
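  // PTEST sets ZF when the AND of its two operands is all-zero, so testing a
  // vector against itself (below) sets ZF exactly when every element is zero.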
  // Cast all vectors into TestVT for PTEST.
  for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
    VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);

  // If more than one full vector is evaluated, OR them first before PTEST.
  for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
    // Each iteration will OR 2 nodes and append the result until there is only
    // 1 node left, i.e. the final OR'd value of all vectors.
    SDValue LHS = VecIns[Slot];
    SDValue RHS = VecIns[Slot + 1];
    VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
  }

  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
}
/// \brief return true if \c Op has a use that doesn't just read flags.
static bool hasNonFlagsUse(SDValue Op) {
  for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
       ++UI) {
    SDNode *User = *UI;
    unsigned UOpNo = UI.getOperandNo();
    if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
      // Look past the truncate.
      UOpNo = User->use_begin().getOperandNo();
      User = *User->use_begin();
    }

    if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
        !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
      return true;
  }
  return false;
}
// Emit a KTEST instruction for bit vectors on AVX-512.
static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
                         const X86Subtarget &Subtarget) {
  if (Op.getOpcode() == ISD::BITCAST) {
    auto hasKTEST = [&](MVT VT) {
      unsigned SizeInBits = VT.getSizeInBits();
      return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
             (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
    };
    SDValue Op0 = Op.getOperand(0);
    MVT Op0VT = Op0.getValueType().getSimpleVT();
    if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
        hasKTEST(Op0VT))
      return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
  }
  return SDValue();
}
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
                                    SelectionDAG &DAG) const {
  if (Op.getValueType() == MVT::i1) {
    SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
                       DAG.getConstant(0, dl, MVT::i8));
  }
  // CF and OF aren't always set the way we want. Determine which
  // of these we need.
  bool NeedCF = false;
  bool NeedOF = false;
  switch (X86CC) {
  default: break;
  case X86::COND_A: case X86::COND_AE:
  case X86::COND_B: case X86::COND_BE:
    NeedCF = true;
    break;
  case X86::COND_G: case X86::COND_GE:
  case X86::COND_L: case X86::COND_LE:
  case X86::COND_O: case X86::COND_NO: {
    // Check if we really need to set the
    // Overflow flag. If NoSignedWrap is present
    // that is not actually needed.
    switch (Op->getOpcode()) {
    case ISD::ADD:
    case ISD::SUB:
    case ISD::MUL:
    case ISD::SHL:
      if (Op.getNode()->getFlags().hasNoSignedWrap())
        break;
      LLVM_FALLTHROUGH;
    default:
      NeedOF = true;
      break;
    }
    break;
  }
  }
  // See if we can use the EFLAGS value from the operand instead of
  // doing a separate TEST. TEST always sets OF and CF to 0, so unless
  // we prove that the arithmetic won't overflow, we can't use OF or CF.
  if (Op.getResNo() != 0 || NeedOF || NeedCF) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;
    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }
  unsigned Opcode = 0;
  unsigned NumOperands = 0;

  // Truncate operations may prevent the merge of the SETCC instruction
  // and the arithmetic instruction before it. Attempt to truncate the operands
  // of the arithmetic instruction and use a reduced bit-width instruction.
  bool NeedTruncation = false;
  SDValue ArithOp = Op;
  if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
    SDValue Arith = Op->getOperand(0);
    // Both the trunc and the arithmetic op need to have one user each.
    if (Arith->hasOneUse())
      switch (Arith.getOpcode()) {
        default: break;
        case ISD::ADD:
        case ISD::SUB:
        case ISD::AND:
        case ISD::OR:
        case ISD::XOR:
          NeedTruncation = true;
          ArithOp = Arith;
      }
  }
  // Sometimes flags can be set either with an AND or with an SRL/SHL
  // instruction. SRL/SHL variant should be preferred for masks longer than
  // this number of bits.
  const int ShiftToAndMaxMaskWidth = 32;
  const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);

  // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
  // which may be the result of a CAST. We use the variable 'Op', which is the
  // non-casted variable when we check for possible users.
  switch (ArithOp.getOpcode()) {
  case ISD::ADD:
    // Due to an isel shortcoming, be conservative if this add is likely to be
    // selected as part of a load-modify-store instruction. When the root node
    // in a match is a store, isel doesn't know how to remap non-chain non-flag
    // uses of other nodes in the match, such as the ADD in this case. This
    // leads to the ADD being left around and reselected, with the result being
    // two adds in the output. Alas, even if none of our users are stores, that
    // doesn't prove we're O.K. Ergo, if we have any parents that aren't
    // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
    // climbing the DAG back to the root, and it doesn't seem to be worth the
    // effort.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() != ISD::CopyToReg &&
          UI->getOpcode() != ISD::SETCC &&
          UI->getOpcode() != ISD::STORE)
        goto default_case;

    if (ConstantSDNode *C =
        dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
      // An add of one will be selected as an INC.
      if (C->isOne() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::INC;
        NumOperands = 1;
        break;
      }

      // An add of negative one (subtract of one) will be selected as a DEC.
      if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
        Opcode = X86ISD::DEC;
        NumOperands = 1;
        break;
      }
    }

    // Otherwise use a regular EFLAGS-setting add.
    Opcode = X86ISD::ADD;
    NumOperands = 2;
    break;
  case ISD::SHL:
  case ISD::SRL:
    // If we have a constant logical shift that's only used in a comparison
    // against zero turn it into an equivalent AND. This allows turning it into
    // a TEST instruction later.
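    // The equivalent mask keeps exactly the bits that survive the shift:
    // (x >> c) == 0  <=>  (x & HighBitsSet(BW, BW-c)) == 0, and
    // (x << c) == 0  <=>  (x & LowBitsSet(BW, BW-c))  == 0.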
    if (ZeroCheck && Op->hasOneUse() &&
        isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
      EVT VT = Op.getValueType();
      unsigned BitWidth = VT.getSizeInBits();
      unsigned ShAmt = Op->getConstantOperandVal(1);
      if (ShAmt >= BitWidth) // Avoid undefined shifts.
        break;
      APInt Mask = ArithOp.getOpcode() == ISD::SRL
                       ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
                       : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
      if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
        break;
      Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
                       DAG.getConstant(Mask, dl, VT));
    }
    break;
  case ISD::AND:
    // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
    // because a TEST instruction will be better. However, AND should be
    // preferred if the instruction can be combined into ANDN.
    if (!hasNonFlagsUse(Op)) {
      SDValue Op0 = ArithOp->getOperand(0);
      SDValue Op1 = ArithOp->getOperand(1);
      EVT VT = ArithOp.getValueType();
      bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
      bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
      bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();

      // If we cannot select an ANDN instruction, check if we can replace
      // AND+IMM64 with a shift before giving up. This is possible for masks
      // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
      if (!isProperAndn) {
        if (!ZeroCheck)
          break;

        assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
        auto *CN = dyn_cast<ConstantSDNode>(Op1);
        if (!CN)
          break;

        const APInt &Mask = CN->getAPIntValue();
        if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
          break; // Prefer TEST instruction.

        unsigned BitWidth = Mask.getBitWidth();
        unsigned LeadingOnes = Mask.countLeadingOnes();
        unsigned TrailingZeros = Mask.countTrailingZeros();

        if (LeadingOnes + TrailingZeros == BitWidth) {
          assert(TrailingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
          break;
        }

        unsigned LeadingZeros = Mask.countLeadingZeros();
        unsigned TrailingOnes = Mask.countTrailingOnes();

        if (LeadingZeros + TrailingOnes == BitWidth) {
          assert(LeadingZeros < VT.getSizeInBits() &&
                 "Shift amount should be less than the type width");
          MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
          SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
          Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
          break;
        }

        break;
      }
    }
    LLVM_FALLTHROUGH;
  case ISD::SUB:
  case ISD::OR:
  case ISD::XOR:
    // Due to the ISEL shortcoming noted above, be conservative if this op is
    // likely to be selected as part of a load-modify-store instruction.
    for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
         UE = Op.getNode()->use_end(); UI != UE; ++UI)
      if (UI->getOpcode() == ISD::STORE)
        goto default_case;

    // Otherwise use a regular EFLAGS-setting instruction.
    switch (ArithOp.getOpcode()) {
    default: llvm_unreachable("unexpected operator!");
    case ISD::SUB: Opcode = X86ISD::SUB; break;
    case ISD::XOR: Opcode = X86ISD::XOR; break;
    case ISD::AND: Opcode = X86ISD::AND; break;
    case ISD::OR: {
      if (!NeedTruncation && ZeroCheck) {
        if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
          return EFLAGS;
      }
      Opcode = X86ISD::OR;
      break;
    }
    }

    NumOperands = 2;
    break;
  case X86ISD::ADD:
  case X86ISD::SUB:
  case X86ISD::INC:
  case X86ISD::DEC:
  case X86ISD::OR:
  case X86ISD::XOR:
  case X86ISD::AND:
    return SDValue(Op.getNode(), 1);
  default:
  default_case:
    break;
  }
  // If we found that truncation is beneficial, perform the truncation and
  // update the DAG.
  if (NeedTruncation) {
    EVT VT = Op.getValueType();
    SDValue WideVal = Op->getOperand(0);
    EVT WideVT = WideVal.getValueType();
    unsigned ConvertedOp = 0;
    // Use a target machine opcode to prevent further DAGCombine
    // optimizations that may separate the arithmetic operations
    // from the setcc node.
    switch (WideVal.getOpcode()) {
    default: break;
    case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
    case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
    case ISD::AND: ConvertedOp = X86ISD::AND; break;
    case ISD::OR:  ConvertedOp = X86ISD::OR;  break;
    case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
    }

    if (ConvertedOp) {
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
        SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
        SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
        Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
      }
    }
  }

  if (Opcode == 0) {
    // Emit KTEST for bit vectors.
    if (auto Node = EmitKTEST(Op, DAG, Subtarget))
      return Node;

    // Emit a CMP with 0, which is the TEST pattern.
    return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
                       DAG.getConstant(0, dl, Op.getValueType()));
  }

  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
  SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);

  SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
  DAG.ReplaceAllUsesWith(Op, New);
  return SDValue(New.getNode(), 1);
}
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
  if (isNullConstant(Op1))
    return EmitTest(Op0, X86CC, dl, DAG);

  assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
         "Unexpected comparison operation for MVT::i1 operands");

  if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
       Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
    // Only promote the compare up to I32 if it is a 16 bit operation
    // with an immediate. 16 bit immediates are to be avoided.
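    // (A 16-bit immediate requires the 0x66 operand-size prefix; combined
    // with an immediate word it forms a length-changing-prefix encoding that
    // the pre-decoders of many Intel cores stall on, so the widened 32-bit
    // compare is usually cheaper.)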
    if ((Op0.getValueType() == MVT::i16 &&
         (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
        !DAG.getMachineFunction().getFunction()->optForMinSize() &&
        !Subtarget.isAtom()) {
      unsigned ExtendOp =
          isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
      Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
      Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
    }
    // Use SUB instead of CMP to enable CSE between SUB and CMP.
    SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
    SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
    return SDValue(Sub.getNode(), 1);
  }
  return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
}
/// Convert a comparison if required by the subtarget.
SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
                                                 SelectionDAG &DAG) const {
  // If the subtarget does not support the FUCOMI instruction, floating-point
  // comparisons have to be converted.
  if (Subtarget.hasCMov() ||
      Cmp.getOpcode() != X86ISD::CMP ||
      !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
      !Cmp.getOperand(1).getValueType().isFloatingPoint())
    return Cmp;

  // The instruction selector will select an FUCOM instruction instead of
  // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
  // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
  // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
  SDLoc dl(Cmp);
  SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
  SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
  SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
                            DAG.getConstant(8, dl, MVT::i8));
  SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
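  // FNSTSW materializes the FPU status word; shifting right by 8 moves the
  // condition bits C0/C2/C3 into the low byte, and SAHF then copies that byte
  // (as AH) into EFLAGS, where C0/C2/C3 land on CF/PF/ZF, matching the flags
  // FUCOMI would have produced directly.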
  // Some 64-bit targets lack SAHF support, but they do support FCOMI.
  assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
  return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
}

/// Check if replacement of SQRT with RSQRT should be disabled.
bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
  EVT VT = Op.getValueType();

  // We never want to use both SQRT and RSQRT instructions for the same input.
  if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
    return false;

  if (VT.isVector())
    return Subtarget.hasFastVectorFSQRT();
  return Subtarget.hasFastScalarFSQRT();
}
/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
                                           SelectionDAG &DAG, int Enabled,
                                           int &RefinementSteps,
                                           bool &UseOneConstNR,
                                           bool Reciprocal) const {
  EVT VT = Op.getValueType();

  // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
  // instructions: convert to single, rsqrtss, convert back to double, refine
  // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.
  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    UseOneConstNR = false;
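    // One refinement step is the classic Newton-Raphson iteration
    //   x1 = x0 * (1.5 - 0.5 * a * x0 * x0),
    // which roughly doubles the ~12 valid bits of the hardware estimate and
    // so covers the 24-bit significand of an f32.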
16849 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);

/// The minimum architected relative accuracy is 2^-12. We need one
/// Newton-Raphson step to have a good float result (24 bits of precision).
SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
                                            int Enabled,
                                            int &RefinementSteps) const {
  EVT VT = Op.getValueType();

  // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
  // TODO: Add support for AVX512 (v16f32).
  // It is likely not profitable to do this for f64 because a double-precision
  // reciprocal estimate with refinement on x86 prior to FMA requires
  // 15 instructions: convert to single, rcpss, convert back to double, refine
  // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
  // along with FMA, this could be a throughput win.

  if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
      (VT == MVT::v8f32 && Subtarget.hasAVX())) {
    // Enable estimate codegen with 1 refinement step for vector division.
    // Scalar division estimates are disabled because they break too much
    // real-world code. These defaults are intended to match GCC behavior.
    if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
      return SDValue();

    if (RefinementSteps == ReciprocalEstimate::Unspecified)
      RefinementSteps = 1;

    return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
  }
  return SDValue();
}
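
// For reference, the corresponding Newton-Raphson step for rcp refines an
// estimate E of 1/X as:
//   E' = E * (2.0 - X * E)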

/// If we have at least two divisions that use the same divisor, convert to
/// multiplication by a reciprocal. This may need to be adjusted for a given
/// CPU if a division's cost is not at least twice the cost of a multiplication.
/// This is because we still need one division to calculate the reciprocal and
/// then we need two multiplies by that reciprocal as replacements for the
/// original divisions.
unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
  return 2;
}
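
// E.g. under suitable fast-math flags, "a/b + c/b" becomes
//   t = 1.0 / b;          // one real division
//   result = a*t + c*t;   // two multiplies
// which pays off when a division costs at least twice a multiplication.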

/// Helper for creating a X86ISD::SETCC node.
static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
                        SelectionDAG &DAG) {
  return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
                     DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
}

/// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
/// according to equal/not-equal condition code \p CC.
static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
                                   const SDLoc &dl, SelectionDAG &DAG) {
  // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
  // instruction. Since the shift amount is in-range-or-undefined, we know
  // that doing a bittest on the i32 value is ok. We extend to i32 because
  // the encoding for the i16 version is larger than the i32 version.
  // Also promote i16 to i32 for performance / code size reasons.
  if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
    Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);

  // See if we can use the 32-bit instruction instead of the 64-bit one for a
  // shorter encoding. Since the former takes the modulo 32 of BitNo and the
  // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
  // known to be zero.
  if (Src.getValueType() == MVT::i64 &&
      DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
    Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);

  // If the operand types disagree, extend the shift amount to match. Since
  // BT ignores high bits (like shifts) we can use anyextend.
  if (Src.getValueType() != BitNo.getValueType())
    BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);

  SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
  X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
  return getSETCC(Cond, BT, dl, DAG);
}
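
// Illustrative sketch: for "(X & (1 << N)) == 0" with i32 X this selects to
// roughly "btl %ecx, %eax" followed by "setae" (SETEQ -> COND_AE, i.e.
// CF == 0), while SETNE uses "setb" (CF == 1).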

/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
                            const SDLoc &dl, SelectionDAG &DAG) {
  SDValue Op0 = And.getOperand(0);
  SDValue Op1 = And.getOperand(1);
  if (Op0.getOpcode() == ISD::TRUNCATE)
    Op0 = Op0.getOperand(0);
  if (Op1.getOpcode() == ISD::TRUNCATE)
    Op1 = Op1.getOperand(0);

  SDValue LHS, RHS;
  if (Op1.getOpcode() == ISD::SHL)
    std::swap(Op0, Op1);
  if (Op0.getOpcode() == ISD::SHL) {
    if (isOneConstant(Op0.getOperand(0))) {
      // If we looked past a truncate, check that it's only truncating away
      // known zeros.
      unsigned BitWidth = Op0.getValueSizeInBits();
      unsigned AndBitWidth = And.getValueSizeInBits();
      if (BitWidth > AndBitWidth) {
        KnownBits Known;
        DAG.computeKnownBits(Op0, Known);
        if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
          return SDValue();
      }
      LHS = Op1;
      RHS = Op0.getOperand(1);
    }
  } else if (Op1.getOpcode() == ISD::Constant) {
    ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
    uint64_t AndRHSVal = AndRHS->getZExtValue();
    SDValue AndLHS = Op0;

    if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
      LHS = AndLHS.getOperand(0);
      RHS = AndLHS.getOperand(1);
    }

    // Use BT if the immediate can't be encoded in a TEST instruction.
    if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
      LHS = AndLHS;
      RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
    }
  }

  if (LHS.getNode())
    return getBitTestCondition(LHS, RHS, CC, dl, DAG);

  return SDValue();
}
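
// E.g. "((X >> N) & 1) != 0" arrives here as (and (srl X, N), 1); the code
// above picks LHS = X and RHS = N, yielding (bt X, N) with COND_B.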

// Convert (truncate (srl X, N) to i1) to (bt X, N)
static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
                                 const SDLoc &dl, SelectionDAG &DAG) {

  assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
         "Expected TRUNCATE to i1 node");

  if (Op.getOperand(0).getOpcode() != ISD::SRL)
    return SDValue();

  SDValue ShiftRight = Op.getOperand(0);
  return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
                             CC, dl, DAG);
}

/// Result of 'and' or 'trunc to i1' is compared against zero.
/// Change to a BT node if possible.
SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
                                     const SDLoc &dl, SelectionDAG &DAG) const {
  if (Op.getOpcode() == ISD::AND)
    return LowerAndToBT(Op, CC, dl, DAG);
  if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
    return LowerTruncateToBT(Op, CC, dl, DAG);
  return SDValue();
}

/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
                              SDValue &Op1) {
  unsigned SSECC;
  bool Swap = false;

  // SSE Condition code mapping:
  //  0 - EQ
  //  1 - LT
  //  2 - LE
  //  3 - UNORD
  //  4 - NEQ
  //  5 - NLT
  //  6 - NLE
  //  7 - ORD
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETOEQ:
  case ISD::SETEQ:  SSECC = 0; break;
  case ISD::SETOGT:
  case ISD::SETGT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLT:
  case ISD::SETOLT: SSECC = 1; break;
  case ISD::SETOGE:
  case ISD::SETGE:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:
  case ISD::SETOLE: SSECC = 2; break;
  case ISD::SETUO:  SSECC = 3; break;
  case ISD::SETUNE:
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGE: SSECC = 5; break;
  case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETUGT: SSECC = 6; break;
  case ISD::SETO:   SSECC = 7; break;
  case ISD::SETUEQ:
  case ISD::SETONE: SSECC = 8; break;
  }
  if (Swap)
    std::swap(Op0, Op1);

  return SSECC;
}
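
// Note that SSECC == 8 is deliberately out of range for the 3-bit SSE
// compare-predicate field: callers test for it and expand SETUEQ/SETONE
// into two compares plus a logic op (see LowerVSETCC below).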

/// Break a 256-bit integer VSETCC into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);
  SDValue CC = Op.getOperand(2);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  // Issue the operation on the smaller types and concatenate the result back
  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
}
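
// E.g. on AVX1 (which lacks 256-bit integer compares), a v8i32 setcc is
// performed as two v4i32 setccs on the extracted halves and re-joined with
// CONCAT_VECTORS.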

static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
         "Unexpected type for boolean compare operation");
  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
                               DAG.getConstant(-1, dl, VT));
  SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
                               DAG.getConstant(-1, dl, VT));
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETEQ:
    // (x == y) -> ~(x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT,
                       DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
                       DAG.getConstant(-1, dl, VT));
  case ISD::SETNE:
    // (x != y) -> (x ^ y)
    return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
  case ISD::SETUGT:
  case ISD::SETGT:
    // (x > y) -> (x & ~y)
    return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
  case ISD::SETULT:
  case ISD::SETLT:
    // (x < y) -> (~x & y)
    return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
  case ISD::SETULE:
  case ISD::SETLE:
    // (x <= y) -> (~x | y)
    return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
  case ISD::SETUGE:
  case ISD::SETGE:
    // (x >= y) -> (x | ~y)
    return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
  }
}

static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {

  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  assert(VT.getVectorElementType() == MVT::i1 &&
         "Cannot set masked compare for this operation");

  ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
  unsigned Opc = 0;
  bool Unsigned = false;
  bool Swap = false;
  unsigned SSECC;
  switch (SetCCOpcode) {
  default: llvm_unreachable("Unexpected SETCC condition");
  case ISD::SETNE:  SSECC = 4; break;
  case ISD::SETEQ:  Opc = X86ISD::PCMPEQM; break;
  case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
  case ISD::SETLT:  Swap = true; LLVM_FALLTHROUGH;
  case ISD::SETGT:  Opc = X86ISD::PCMPGTM; break;
  case ISD::SETULT: SSECC = 1; Unsigned = true; break;
  case ISD::SETUGE: SSECC = 5; Unsigned = true; break; // NLT
  case ISD::SETGE:  Swap = true; SSECC = 2; break; // LE + swap
  case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
  case ISD::SETLE:  SSECC = 2; break;
  }

  if (Swap)
    std::swap(Op0, Op1);
  if (Opc)
    return DAG.getNode(Opc, dl, VT, Op0, Op1);
  Opc = Unsigned ? X86ISD::CMPMU : X86ISD::CMPM;
  return DAG.getNode(Opc, dl, VT, Op0, Op1,
                     DAG.getConstant(SSECC, dl, MVT::i8));
}

/// \brief Try to turn a VSETULT into a VSETULE by modifying its second
/// operand \p Op1. If non-trivial (for example because it's not constant)
/// return an empty value.
static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
                                      SelectionDAG &DAG) {
  BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
  if (!BV)
    return SDValue();

  MVT VT = Op1.getSimpleValueType();
  MVT EVT = VT.getVectorElementType();
  unsigned n = VT.getVectorNumElements();
  SmallVector<SDValue, 8> ULTOp1;
  ULTOp1.reserve(n);
  for (unsigned i = 0; i < n; ++i) {
    ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
    if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
      return SDValue();

    // Avoid underflow.
    APInt Val = Elt->getAPIntValue();
    if (Val == 0)
      return SDValue();

    ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
  }

  return DAG.getBuildVector(VT, dl, ULTOp1);
}
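
// E.g. "x <u <4,4,4,4>" becomes "x <=u <3,3,3,3>"; the rewrite is refused
// if any lane constant is 0, since 0-1 would wrap.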

static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue CC = Op.getOperand(2);
  MVT VT = Op.getSimpleValueType();
  ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
  bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
  SDLoc dl(Op);

  if (isFP) {
#ifndef NDEBUG
    MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
    assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif

    unsigned Opc;
    if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
      assert(VT.getVectorNumElements() <= 16);
      Opc = X86ISD::CMPM;
    } else {
      Opc = X86ISD::CMPP;
      // The SSE/AVX packed FP comparison nodes are defined with a
      // floating-point vector result that matches the operand type. This allows
      // them to work with an SSE1 target (integer vector types are not legal).
      VT = Op0.getSimpleValueType();
    }

    // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
    // emit two comparisons and a logic op to tie them together.
    // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
    // available.
    SDValue Cmp;
    unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
    if (SSECC == 8) {
      // LLVM predicate is SETUEQ or SETONE.
      unsigned CC0, CC1;
      unsigned CombineOpc;
      if (Cond == ISD::SETUEQ) {
        CC0 = 3; // UNORD
        CC1 = 0; // EQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
                                           static_cast<unsigned>(ISD::OR);
      } else {
        assert(Cond == ISD::SETONE);
        CC0 = 7; // ORD
        CC1 = 4; // NEQ
        CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
                                           static_cast<unsigned>(ISD::AND);
      }

      SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC0, dl, MVT::i8));
      SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
                                 DAG.getConstant(CC1, dl, MVT::i8));
      Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
    } else {
      // Handle all other FP comparisons here.
      Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
                        DAG.getConstant(SSECC, dl, MVT::i8));
    }

    // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
    // result type of SETCC. The bitcast is expected to be optimized away
    // during combining/isel.
    if (Opc == X86ISD::CMPP)
      Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);

    return Cmp;
  }

  MVT VTOp0 = Op0.getSimpleValueType();
  assert(VTOp0 == Op1.getSimpleValueType() &&
         "Expected operands with same type!");
  assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
         "Invalid number of packed elements for source and destination!");

  if (VT.is128BitVector() && VTOp0.is256BitVector()) {
    // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
    // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
    // legalizer firstly checks if the first operand in input to the setcc has
    // a legal type. If so, then it promotes the return type to that same type.
    // Otherwise, the return type is promoted to the 'next legal type' which,
    // for a vector of MVT::i1 is always a 128-bit integer vector type.
    //
    // We reach this code only if the following two conditions are met:
    // 1. Both return type and operand type have been promoted to wider types
    //    by the type legalizer.
    // 2. The original operand type has been promoted to a 256-bit vector.
    //
    // Note that condition 2. only applies for AVX targets.
    SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
    return DAG.getZExtOrTrunc(NewOp, dl, VT);
  }

  // The non-AVX512 code below works under the assumption that source and
  // destination types are the same.
  assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
         "Value types for source and destination must be the same!");

  // Break 256-bit integer vector compare into smaller ones.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntVSETCC(Op, DAG);

  // Operands are boolean (vectors of i1)
  MVT OpVT = Op1.getSimpleValueType();
  if (OpVT.getVectorElementType() == MVT::i1)
    return LowerBoolVSETCC_AVX512(Op, DAG);

  // The result is boolean, but operands are int/float
  if (VT.getVectorElementType() == MVT::i1) {
    // In AVX-512 architecture setcc returns mask with i1 elements,
    // but there is no compare instruction for i8 and i16 elements in KNL.
    // In this case use SSE compare.
    bool UseAVX512Inst =
        (OpVT.is512BitVector() ||
         OpVT.getScalarSizeInBits() >= 32 ||
         (Subtarget.hasBWI() && Subtarget.hasVLX()));

    if (UseAVX512Inst)
      return LowerIntVSETCC_AVX512(Op, DAG);

    return DAG.getNode(ISD::TRUNCATE, dl, VT,
                       DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
  }

  // Lower using XOP integer comparisons.
  if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
       VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
    // Translate compare code to XOP PCOM compare mode.
    unsigned CmpMode = 0;
    switch (Cond) {
    default: llvm_unreachable("Unexpected SETCC condition");
    case ISD::SETULT:
    case ISD::SETLT: CmpMode = 0x00; break;
    case ISD::SETULE:
    case ISD::SETLE: CmpMode = 0x01; break;
    case ISD::SETUGT:
    case ISD::SETGT: CmpMode = 0x02; break;
    case ISD::SETUGE:
    case ISD::SETGE: CmpMode = 0x03; break;
    case ISD::SETEQ: CmpMode = 0x04; break;
    case ISD::SETNE: CmpMode = 0x05; break;
    }

    // Are we comparing unsigned or signed integers?
    unsigned Opc =
        ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;

    return DAG.getNode(Opc, dl, VT, Op0, Op1,
                       DAG.getConstant(CmpMode, dl, MVT::i8));
  }

  // We are handling one of the integer comparisons here. Since SSE only has
  // GT and EQ comparisons for integer, swapping operands and multiple
  // operations may be required for some comparisons.
  unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
                                                            : X86ISD::PCMPGT;
  bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
              Cond == ISD::SETGE || Cond == ISD::SETUGE;
  bool Invert = Cond == ISD::SETNE ||
                (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));

  // If both operands are known non-negative, then an unsigned compare is the
  // same as a signed compare and there's no need to flip signbits.
  // TODO: We could check for more general simplifications here since we're
  // computing known bits.
  bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
                   !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));

  // Special case: Use min/max operations for SETULE/SETUGE
  MVT VET = VT.getVectorElementType();
  bool HasMinMax =
      (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
      (Subtarget.hasSSE2() && (VET == MVT::i8));
  bool MinMax = false;
  if (HasMinMax) {
    switch (Cond) {
    default: break;
    case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
    case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
    }

    if (MinMax)
      Swap = Invert = FlipSigns = false;
  }

  bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
  bool Subus = false;
  if (!MinMax && HasSubus) {
    // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
    // Op0 u<= Op1:
    //   t = psubus Op0, Op1
    //   pcmpeq t, <0..0>
    switch (Cond) {
    default: break;
    case ISD::SETULT: {
      // If the comparison is against a constant we can turn this into a
      // setule. With psubus, setule does not require a swap. This is
      // beneficial because the constant in the register is no longer
      // destructed as the destination so it can be hoisted out of a loop.
      // Only do this pre-AVX since vpcmp* is no longer destructive.
      if (Subtarget.hasAVX())
        break;
      if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
        Op1 = ULEOp1;
        Subus = true; Invert = false; Swap = false;
      }
      break;
    }
    // Psubus is better than flip-sign because it requires no inversion.
    case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
    case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
    }

    if (Subus) {
      Opc = X86ISD::SUBUS;
      FlipSigns = false;
    }
  }

  if (Swap)
    std::swap(Op0, Op1);

  // Check that the operation in question is available (most are plain SSE2,
  // but PCMPGTQ and PCMPEQQ have different requirements).
  if (VT == MVT::v2i64) {
    if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
      assert(Subtarget.hasSSE2() && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Since SSE has no unsigned integer comparisons, we need to flip the sign
      // bits of the inputs before performing those operations. The lower
      // compare is always unsigned.
      SDValue SB;
      if (FlipSigns) {
        SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
      } else {
        SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
        SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
        SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
      }
      Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
      Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);

      // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
      SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
      SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);

      // Create masks for only the low parts/high parts of the 64 bit integers.
      static const int MaskHi[] = { 1, 1, 3, 3 };
      static const int MaskLo[] = { 0, 0, 2, 2 };
      SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
      SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
      SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);

      SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
      Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
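
    // Worked example for the block above: for a signed v2i64 "A > B", the
    // low dwords had their sign bits flipped so they can be compared
    // unsigned, and the result
    //   (Ahi > Bhi) | ((Ahi == Bhi) & (Alo >u Blo))
    // is replicated into both dwords of each 64-bit lane by the hi/lo
    // shuffles.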

    if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
      // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
      // pcmpeqd + pshufd + pand.
      assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");

      // First cast everything to the right type.
      Op0 = DAG.getBitcast(MVT::v4i32, Op0);
      Op1 = DAG.getBitcast(MVT::v4i32, Op1);

      // Do the compare.
      SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);

      // Make sure the lower and upper halves are both all-ones.
      static const int Mask[] = { 1, 0, 3, 2 };
      SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
      Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);

      if (Invert)
        Result = DAG.getNOT(dl, Result, MVT::v4i32);

      return DAG.getBitcast(VT, Result);
    }
  }

  // Since SSE has no unsigned integer comparisons, we need to flip the sign
  // bits of the inputs before performing those operations.
  if (FlipSigns) {
    MVT EltVT = VT.getVectorElementType();
    SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
                                 VT);
    Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
    Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
  }

  SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);

  // If the logical-not of the result is required, perform that now.
  if (Invert)
    Result = DAG.getNOT(dl, Result, VT);

  if (MinMax)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);

  if (Subus)
    Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
                         getZeroVector(VT, Subtarget, DAG, dl));

  return Result;
}

SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {

  MVT VT = Op.getSimpleValueType();

  if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);

  assert((VT == MVT::i8 || (Subtarget.hasAVX512() && VT == MVT::i1)) &&
         "SetCC type must be 8-bit or 1-bit integer");
  SDValue Op0 = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDLoc dl(Op);
  ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();

  // Optimize to BT if possible.
  // Lower (X & (1 << N)) == 0 to BT(X, N).
  // Lower ((X >>u N) & 1) != 0 to BT(X, N).
  // Lower ((X >>s N) & 1) != 0 to BT(X, N).
  // Lower (trunc (X >> N) to i1) to BT(X, N).
  if (Op0.hasOneUse() && isNullConstant(Op1) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
      return NewSetCC;
    }
  }

  // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
  // these.
  if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
      (CC == ISD::SETEQ || CC == ISD::SETNE)) {

    // If the input is a setcc, then reuse the input setcc or use a new one with
    // the inverted condition.
    if (Op0.getOpcode() == X86ISD::SETCC) {
      X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
      bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
      if (!Invert)
        return Op0;

      CCode = X86::GetOppositeBranchCondition(CCode);
      SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
      if (VT == MVT::i1)
        return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
      return SetCC;
    }
  }
  if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
    if (isOneConstant(Op1)) {
      ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
      return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
    }
    if (!isNullConstant(Op1)) {
      SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
      return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
    }
  }

  bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
  X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
  if (X86CC == X86::COND_INVALID)
    return SDValue();

  SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
  EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
  SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
  if (VT == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
  return SetCC;
}

SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
  SDValue LHS = Op.getOperand(0);
  SDValue RHS = Op.getOperand(1);
  SDValue Carry = Op.getOperand(2);
  SDValue Cond = Op.getOperand(3);
  SDLoc DL(Op);

  assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
  X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());

  // Recreate the carry if needed.
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));

  SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
  SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
  SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
  if (Op.getSimpleValueType() == MVT::i1)
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
  return SetCC;
}
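
// The idea, roughly: SETCCCARRY evaluates (LHS - RHS - CF) purely for its
// flags, so multi-word compares can chain through SBB instead of first
// materializing the borrow in a register.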

/// Return true if opcode is a X86 logical comparison.
static bool isX86LogicalCmp(SDValue Op) {
  unsigned Opc = Op.getOpcode();
  if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
      Opc == X86ISD::SAHF)
    return true;
  if (Op.getResNo() == 1 &&
      (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
       Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
       Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
       Opc == X86ISD::XOR || Opc == X86ISD::AND))
    return true;

  if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
    return true;

  return false;
}

static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
  if (V.getOpcode() != ISD::TRUNCATE)
    return false;

  SDValue VOp0 = V.getOperand(0);
  unsigned InBits = VOp0.getValueSizeInBits();
  unsigned Bits = V.getValueSizeInBits();
  return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits, InBits-Bits));
}
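
// E.g. for (truncate i64 %x to i32) this returns true when bits 63:32 of %x
// are known zero, letting callers look through the truncate when the value
// feeds a flags-setting test.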

SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
  bool AddTest = true;
  SDValue Cond = Op.getOperand(0);
  SDValue Op1 = Op.getOperand(1);
  SDValue Op2 = Op.getOperand(2);
  SDLoc DL(Op);
  MVT VT = Op1.getSimpleValueType();
  SDValue CC;

  // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
  // are available or VBLENDV if AVX is available.
  // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
  if (Cond.getOpcode() == ISD::SETCC &&
      ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
       (Subtarget.hasSSE1() && VT == MVT::f32)) &&
      VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
    SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
    int SSECC = translateX86FSETCC(
        cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);

    if (Subtarget.hasAVX512()) {
      SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
                                CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
      return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
                         DL, VT, Cmp, Op1, Op2);
    }

    SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
                              DAG.getConstant(SSECC, DL, MVT::i8));

    // If we have AVX, we can use a variable vector select (VBLENDV) instead
    // of 3 logic instructions for size savings and potentially speed.
    // Unfortunately, there is no scalar form of VBLENDV.

    // If either operand is a constant, don't try this. We can expect to
    // optimize away at least one of the logic instructions later in that
    // case, so that sequence would be faster than a variable blend.

    // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
    // uses XMM0 as the selection register. That may need just as many
    // instructions as the AND/ANDN/OR sequence due to register moves, so
    // don't bother.
    if (Subtarget.hasAVX() &&
        !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {

      // Convert to vectors, do a VSELECT, and convert back to scalar.
      // All of the conversions should be optimized away.
      MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
      SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
      SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
      SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);

      MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
      VCmp = DAG.getBitcast(VCmpVT, VCmp);

      SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);

      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
                         VSel, DAG.getIntPtrConstant(0, DL));
    }
    SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
    SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
    return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
  }

  // AVX512 fallback is to lower selects of scalar floats to masked moves.
  if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
    SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
    return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
  }

  if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
    SDValue Op1Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
      Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
    else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
      Op1Scalar = Op1.getOperand(0);
    SDValue Op2Scalar;
    if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
      Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
    else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
      Op2Scalar = Op2.getOperand(0);
    if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
                                        Op1Scalar, Op2Scalar);
      if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
        return DAG.getBitcast(VT, newSelect);
      SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
                         DAG.getIntPtrConstant(0, DL));
    }
  }

  if (VT == MVT::v4i1 || VT == MVT::v2i1) {
    SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
    Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
    Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                      DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
    SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
  }

  if (Cond.getOpcode() == ISD::SETCC) {
    if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
      Cond = NewCond;
      // If the condition was updated, it's possible that the operands of the
      // select were also updated (for example, EmitTest has a RAUW). Refresh
      // the local references to the select operands in case they got stale.
      Op1 = Op.getOperand(1);
      Op2 = Op.getOperand(2);
    }
  }

  // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
  // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
  // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
  // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
  // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
  // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
  if (Cond.getOpcode() == X86ISD::SETCC &&
      Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
      isNullConstant(Cond.getOperand(1).getOperand(1))) {
    SDValue Cmp = Cond.getOperand(1);
    unsigned CondCode =
        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();

    if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
      SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
      SDValue CmpOp0 = Cmp.getOperand(0);

      // Apply further optimizations for special cases
      // (select (x != 0), -1, 0) -> neg & sbb
      // (select (x == 0), 0, -1) -> neg & sbb
      if (isNullConstant(Y) &&
          (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
        SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
        SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
        SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
        SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                  DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                  SDValue(Neg.getNode(), 1));
        return Res;
      }

      Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
                        CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
      Cmp = ConvertCmpIfNecessary(Cmp, DAG);

      SDValue Res =   // Res = 0 or -1.
          DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                      DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);

      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
        Res = DAG.getNOT(DL, Res, Res.getValueType());

      if (!isNullConstant(Op2))
        Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
      return Res;
    } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
               Cmp.getOperand(0).getOpcode() == ISD::AND &&
               isOneConstant(Cmp.getOperand(0).getOperand(1))) {
      SDValue CmpOp0 = Cmp.getOperand(0);
      SDValue Src1, Src2;
      // True if Op2 is an XOR or OR operator and one of its operands
      // is equal to Op1:
      // ( a , a op b) || ( b , a op b)
      auto isOrXorPattern = [&]() {
        if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
            (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
          Src1 =
              Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
          Src2 = Op1;
          return true;
        }
        return false;
      };

      if (isOrXorPattern()) {
        SDValue Neg;
        unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
        // We need a mask of all zeros or ones with the same size as the other
        // operands.
        if (CmpSz > VT.getSizeInBits())
          Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
        else if (CmpSz < VT.getSizeInBits())
          Neg = DAG.getNode(ISD::AND, DL, VT,
              DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
              DAG.getConstant(1, DL, VT));
        else
          Neg = CmpOp0;
        SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
                                   Neg); // -(and (x, 0x1))
        SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
        return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2);  // And Op y
      }
    }
  }

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    MVT VT = Op.getSimpleValueType();

    bool IllegalFPCMov = false;
    if (VT.isFloatingPoint() && !VT.isVector() &&
        !isScalarFPTypeInSSEReg(VT))  // FPStack?
      IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());

    if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
        Opc == X86ISD::BT) { // FIXME
      Cond = Cmp;
      AddTest = false;
    }
  } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
             CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
             ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
              Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, DL, MVT::i8);
    AddTest = false;
  }

  if (AddTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result of AND is compared against zero. Try to match
    // it to BT.
    if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        AddTest = false;
      }
    }
  }

  if (AddTest) {
    CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
    Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
  }

  // a <  b ? -1 :  0 -> RES = ~setcc_carry
  // a <  b ?  0 : -1 -> RES = setcc_carry
  // a >= b ? -1 :  0 -> RES = setcc_carry
  // a >= b ?  0 : -1 -> RES = ~setcc_carry
  if (Cond.getOpcode() == X86ISD::SUB) {
    Cond = ConvertCmpIfNecessary(Cond, DAG);
    unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();

    if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
        (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
        (isNullConstant(Op1) || isNullConstant(Op2))) {
      SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
                                DAG.getConstant(X86::COND_B, DL, MVT::i8),
                                Cond);
      if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
        return DAG.getNOT(DL, Res, Res.getValueType());
      return Res;
    }
  }

  // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
  // widen the cmov and push the truncate through. This avoids introducing a new
  // branch during isel and doesn't add any extensions.
  if (Op.getValueType() == MVT::i8 &&
      Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
    SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
    if (T1.getValueType() == T2.getValueType() &&
        // Blacklist CopyFromReg to avoid partial register stalls.
        T1.getOpcode() != ISD::CopyFromReg &&
        T2.getOpcode() != ISD::CopyFromReg) {
      SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
      SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
      return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
    }
  }

  // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
  // condition is true.
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
  SDValue Ops[] = { Op2, Op1, CC, Cond };
  return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
}
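
// For reference, the "neg & sbb" idiom used above computes "x != 0 ? -1 : 0"
// branchlessly; on IA-32 it selects to roughly
//   negl %eax         ; CF = (x != 0)
//   sbbl %eax, %eax   ; EAX = 0 - CF = all-ones or zero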

static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  MVT VTElt = VT.getVectorElementType();
  MVT InVTElt = InVT.getVectorElementType();
  SDLoc dl(Op);

  // SKX processor
  if ((InVTElt == MVT::i1) &&
      ((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) ||
       (Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  unsigned NumElts = VT.getVectorNumElements();

  if (VT.is512BitVector() && InVTElt != MVT::i1 &&
      (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
    if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
      return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
  }

  if (InVTElt != MVT::i1)
    return SDValue();

  MVT ExtVT = VT;
  if (!VT.is512BitVector() && !Subtarget.hasVLX())
    ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);

  SDValue V;
  if (Subtarget.hasDQI()) {
    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
    assert(!VT.is512BitVector() && "Unexpected vector type");
  } else {
    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
    SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
    if (ExtVT == VT)
      return V;
  }

  return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
}

// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
// For sign extend this needs to handle all vector sizes and SSE4.1 and
// non-SSE4.1 targets. For zero extend this should only handle inputs of
// MVT::v64i8 when BWI is not supported, but AVX512 is.
static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  SDValue In = Op->getOperand(0);
  MVT VT = Op->getSimpleValueType(0);
  MVT InVT = In.getSimpleValueType();
  assert(VT.getSizeInBits() == InVT.getSizeInBits());

  MVT SVT = VT.getVectorElementType();
  MVT InSVT = InVT.getVectorElementType();
  assert(SVT.getSizeInBits() > InSVT.getSizeInBits());

  if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
    return SDValue();
  if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
    return SDValue();
  if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
      !(VT.is256BitVector() && Subtarget.hasInt256()) &&
      !(VT.is512BitVector() && Subtarget.hasAVX512()))
    return SDValue();

  SDLoc dl(Op);

  // For 256-bit vectors, we only need the lower (128-bit) half of the input.
  // For 512-bit vectors, we need 128-bits or 256-bits.
  if (VT.getSizeInBits() > 128) {
    // Input needs to be at least the same number of elements as output, and
    // at least 128-bits.
    int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
    In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
  }

  assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
          InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");

  // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
  // need to be handled here for 256/512-bit results.
  if (Subtarget.hasInt256()) {
    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
                      X86ISD::VSEXT : X86ISD::VZEXT;
    return DAG.getNode(ExtOpc, dl, VT, In);
  }

  // We should only get here for sign extend.
  assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
         "Unexpected opcode!");

  // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
  SDValue Curr = In;
  MVT CurrVT = InVT;

  // As SRAI is only available on i16/i32 types, we expand only up to i32
  // and handle i64 separately.
  while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
    Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
    MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
    CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
    Curr = DAG.getBitcast(CurrVT, Curr);
  }

  SDValue SignExt = Curr;
  if (CurrVT != InVT) {
    unsigned SignExtShift =
        CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
    SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                          DAG.getConstant(SignExtShift, dl, MVT::i8));
  }

  if (CurrVT == VT)
    return SignExt;

  if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
    SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
                               DAG.getConstant(31, dl, MVT::i8));
    SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
    return DAG.getBitcast(VT, Ext);
  }

  return SDValue();
}
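
// E.g. sign_extend_vector_inreg v16i8 -> v4i32 on pre-SSE4.1: two unpackl
// steps move the source bytes into the high byte of each i32 lane, then a
// psrad by 24 shifts the sign bits down. The v2i64 case instead pairs each
// i32 lane with a lane of sign bits via the {0, 4, 1, 5} shuffle.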

static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op->getSimpleValueType(0);
  SDValue In = Op->getOperand(0);
  MVT InVT = In.getSimpleValueType();
  SDLoc dl(Op);

  if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
    return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);

  if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
      (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
      (VT != MVT::v16i16 || InVT != MVT::v16i8))
    return SDValue();

  if (Subtarget.hasInt256())
    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);

  // Optimize vectors in AVX mode
  // Sign extend v8i16 to v8i32 and
  //             v4i32 to v4i64
  //
  // Divide input vector into two parts
  // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
  // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
  // concat the vectors to original VT
  unsigned NumElems = InVT.getVectorNumElements();
  SDValue Undef = DAG.getUNDEF(InVT);

  SmallVector<int,8> ShufMask1(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask1[i] = i;

  SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);

  SmallVector<int,8> ShufMask2(NumElems, -1);
  for (unsigned i = 0; i != NumElems/2; ++i)
    ShufMask2[i] = i + NumElems/2;

  SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);

  MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                VT.getVectorNumElements() / 2);

  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
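
// E.g. on AVX1, "sext v8i16 to v8i32" becomes roughly vpmovsxwd of the low
// half and vpmovsxwd of the shuffled-up high half, re-joined with a 128-bit
// insert.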

// Lower truncating store. We need a special lowering to vXi1 vectors
static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
  SDLoc dl(St);
  EVT MemVT = St->getMemoryVT();
  assert(St->isTruncatingStore() && "We only custom lower truncating stores.");
  assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
         "Expected truncstore of i1 vector");

  SDValue Op = St->getValue();
  MVT OpVT = Op.getValueType().getSimpleVT();
  unsigned NumElts = OpVT.getVectorNumElements();
  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
      NumElts == 16) {
    // Truncate and store - everything is legal
    Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
    if (MemVT.getSizeInBits() < 8)
      Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
                       DAG.getUNDEF(MVT::v8i1), Op,
                       DAG.getIntPtrConstant(0, dl));
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  // A subset, assume that we have only AVX-512F
  if (NumElts <= 8) {
    // Extend to 8-elts vector
    MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
    Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
                     DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));

    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
    return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
                        St->getMemOperand());
  }

  assert(OpVT == MVT::v32i8 && "Unexpected operand type");
  // Divide the vector into 2 parts and store each part separately
  SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(0, dl));
  Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
  SDValue BasePtr = St->getBasePtr();
  SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
                              St->getMemOperand());
  SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
                           DAG.getIntPtrConstant(16, dl));
  Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));

  SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
                              BasePtrHi, St->getMemOperand());
  return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
}
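
// E.g. a v32i8 -> v32i1 truncstore without BWI splits into two v16i1 mask
// stores of 2 bytes each, the second at BasePtr + 2.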

static SDValue LowerExtended1BitVectorLoad(SDValue Op,
                                           const X86Subtarget &Subtarget,
                                           SelectionDAG &DAG) {

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
         "Expected i1 vector load");
  unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
    ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
  MVT VT = Op.getValueType().getSimpleVT();
  unsigned NumElts = VT.getVectorNumElements();

  if ((Subtarget.hasBWI() && NumElts >= 32) ||
      (Subtarget.hasDQI() && NumElts < 16) ||
      NumElts == 16) {
    // Load and extend - everything is legal
    if (NumElts < 8) {
      SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
                                 Ld->getBasePtr(),
                                 Ld->getMemOperand());
      // Replace chain users with the new chain.
      assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
      DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
      MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
      SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);

      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                         DAG.getIntPtrConstant(0, dl));
    }
    SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
  }

  if (NumElts <= 8) {
    // A subset, assume that we have only AVX-512F
    unsigned NumBitsToLoad = 8;
    MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
    SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
    SDValue BitVec = DAG.getBitcast(MaskVT, Load);

    if (NumElts == 8)
      return DAG.getNode(ExtOpcode, dl, VT, BitVec);

    // Handle v4i1 and v2i1 by extending to 8 elements first.
    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
    SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(VT == MVT::v32i8 && "Unexpected extload type");

  SmallVector<SDValue, 2> Chains;

  SDValue BasePtr = Ld->getBasePtr();
  SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               Ld->getBasePtr(),
                               Ld->getMemOperand());
  Chains.push_back(LoadLo.getValue(1));

  SDValue BasePtrHi =
      DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
                  DAG.getConstant(2, dl, BasePtr.getValueType()));

  SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
                               BasePtrHi,
                               Ld->getMemOperand());
  Chains.push_back(LoadHi.getValue(1));
  SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
  DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);

  SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
  SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
}

// Lower vector extended loads using a shuffle. If SSSE3 is not available we
// may emit an illegal shuffle but the expansion is still better than scalar
// code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
// we'll emit a shuffle and an arithmetic shift.
// FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
// TODO: It is possible to support ZExt by zeroing the undef values during
// the shuffle phase or after the shuffle.
static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  MVT RegVT = Op.getSimpleValueType();
  assert(RegVT.isVector() && "We only custom lower vector sext loads.");
  assert(RegVT.isInteger() &&
         "We only custom lower integer vector sext loads.");

  // Nothing useful we can do without SSE2 shuffles.
  assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");

  LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
  SDLoc dl(Ld);
  EVT MemVT = Ld->getMemoryVT();
  if (MemVT.getScalarType() == MVT::i1)
    return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  unsigned RegSz = RegVT.getSizeInBits();

  ISD::LoadExtType Ext = Ld->getExtensionType();

  assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
         && "Only anyext and sext are currently implemented.");
  assert(MemVT != RegVT && "Cannot extend to the same type");
  assert(MemVT.isVector() && "Must load a vector from memory");

  unsigned NumElems = RegVT.getVectorNumElements();
  unsigned MemSz = MemVT.getSizeInBits();
  assert(RegSz > MemSz && "Register size must be greater than the mem size");

  if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
    // The only way in which we have a legal 256-bit vector result but not the
    // integer 256-bit operations needed to directly lower a sextload is if we
    // have AVX1 but not AVX2. In that case, we can always emit a sextload to
    // a 128-bit vector and a normal sign_extend to 256-bits that should get
    // correctly legalized. We do this late to allow the canonical form of
    // sextload to persist throughout the rest of the DAG combiner -- it wants
    // to fold together any extensions it can, and so will fuse a sign_extend
    // of an sextload into a sextload targeting a wider value.
    SDValue Load;
    if (MemSz == 128) {
      // Just switch this to a normal load.
      assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
                                       "it must be a legal 128-bit vector "
                                       "type!");
      Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    } else {
      assert(MemSz < 128 &&
             "Can't extend a type wider than 128 bits to a 256 bit vector!");
      // Do an sext load to a 128-bit vector type. We want to use the same
      // number of elements, but elements half as wide. This will end up being
      // recursively lowered by this routine, but will succeed as we definitely
      // have all the necessary features if we're using AVX1.
      EVT HalfEltVT =
          EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
      EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
      Load =
          DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
                         Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
                         Ld->getMemOperand()->getFlags());
    }

    // Replace chain users with the new chain.
    assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
    DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));

    // Finally, do a normal sign-extend to the desired register.
    return DAG.getSExtOrTrunc(Load, dl, RegVT);
  }
18385 // All sizes must be a power of two.
18386 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18387 "Non-power-of-two elements are not custom lowered!");
18389 // Attempt to load the original value using scalar loads.
18390 // Find the largest scalar type that divides the total loaded size.
18391 MVT SclrLoadTy = MVT::i8;
18392 for (MVT Tp : MVT::integer_valuetypes()) {
18393 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18398 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
18399 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18401 SclrLoadTy = MVT::f64;
18403 // Calculate the number of scalar loads that we need to perform
18404 // in order to load our vector from memory.
18405 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18407 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18408 "Can only lower sext loads with a single scalar load!");
18410 unsigned loadRegZize = RegSz;
18411 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18414 // Represent our vector as a sequence of elements which are the
18415 // largest scalar that we can load.
18416 EVT LoadUnitVecVT = EVT::getVectorVT(
18417 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18419 // Represent the data using the same element type that is stored in
18420 // memory. In practice, we ''widen'' MemVT.
18422 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18423 loadRegZize / MemVT.getScalarSizeInBits());
18425 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18426 "Invalid vector type");
18428 // We can't shuffle using an illegal type.
18429 assert(TLI.isTypeLegal(WideVecVT) &&
18430 "We only lower types that form legal widened vector types");
18432 SmallVector<SDValue, 8> Chains;
18433 SDValue Ptr = Ld->getBasePtr();
18434 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18435 TLI.getPointerTy(DAG.getDataLayout()));
18436 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18438 for (unsigned i = 0; i < NumLoads; ++i) {
18439 // Perform a single load.
18440 SDValue ScalarLoad =
18441 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18442 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18443 Chains.push_back(ScalarLoad.getValue(1));
18444 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18445 // another round of DAGCombining.
18447 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18449 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18450 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18452 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18455 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18457 // Bitcast the loaded value to a vector of the original element type, in
18458 // the size of the target vector type.
18459 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18460 unsigned SizeRatio = RegSz / MemSz;
18462 if (Ext == ISD::SEXTLOAD) {
18463 // If we have SSE4.1, we can directly emit a VSEXT node.
18464 if (Subtarget.hasSSE41()) {
18465 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18466 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18470 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest
18472 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18473 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18475 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18476 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18480 // Redistribute the loaded elements into the different locations.
18481 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18482 for (unsigned i = 0; i != NumElems; ++i)
18483 ShuffleVec[i * SizeRatio] = i;
18485 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18486 DAG.getUNDEF(WideVecVT), ShuffleVec);
18488 // Bitcast to the requested type.
18489 Shuff = DAG.getBitcast(RegVT, Shuff);
18490 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
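
// Illustrative walk-through of the extload path above (exposition only, not
// a pattern asserted elsewhere in this file): on plain SSE2, an any-extending
// load such as
//   %w = load <4 x i8>, <4 x i8>* %p    ; extended to v4i32
// becomes one i32 scalar load, a SCALAR_TO_VECTOR to place it, a bitcast to
// the widened v16i8 type, and a shuffle that spreads the four loaded bytes to
// positions 0, 4, 8 and 12 before the final bitcast to the requested v4i32.
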
/// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC
/// nodes, each of which has no other use apart from the AND / OR.
static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
  Opc = Op.getOpcode();
  if (Opc != ISD::OR && Opc != ISD::AND)
    return false;
  return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(0).hasOneUse() &&
          Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
          Op.getOperand(1).hasOneUse());
}

/// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
/// SETCC node has a single use.
static bool isXor1OfSetCC(SDValue Op) {
  if (Op.getOpcode() != ISD::XOR)
    return false;
  if (isOneConstant(Op.getOperand(1)))
    return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
           Op.getOperand(0).hasOneUse();
  return false;
}
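
// For example, isXor1OfSetCC matches the DAG for a branch on '!cond' when
// 'cond' was already lowered to (xor (X86ISD::SETCC %cc, %flags), 1); the
// caller below can then branch on the opposite condition code instead of
// materializing the xor.
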
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
  bool addTest = true;
  SDValue Chain = Op.getOperand(0);
  SDValue Cond  = Op.getOperand(1);
  SDValue Dest  = Op.getOperand(2);
  SDLoc dl(Op);
  SDValue CC;
  bool Inverted = false;

  if (Cond.getOpcode() == ISD::SETCC) {
    // Check for setcc([su]{add,sub,mul}o == 0).
    if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
        isNullConstant(Cond.getOperand(1)) &&
        Cond.getOperand(0).getResNo() == 1 &&
        (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
         Cond.getOperand(0).getOpcode() == ISD::UADDO ||
         Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
         Cond.getOperand(0).getOpcode() == ISD::USUBO ||
         Cond.getOperand(0).getOpcode() == ISD::SMULO ||
         Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
      Inverted = true;
      Cond = Cond.getOperand(0);
    } else {
      if (SDValue NewCond = LowerSETCC(Cond, DAG))
        Cond = NewCond;
    }
  }
#if 0
  // FIXME: LowerXALUO doesn't handle these!!
  else if (Cond.getOpcode() == X86ISD::ADD  ||
           Cond.getOpcode() == X86ISD::SUB  ||
           Cond.getOpcode() == X86ISD::SMUL ||
           Cond.getOpcode() == X86ISD::UMUL)
    Cond = LowerXALUO(Cond, DAG);
#endif

  // Look past (and (setcc_carry (cmp ...)), 1).
  if (Cond.getOpcode() == ISD::AND &&
      Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
      isOneConstant(Cond.getOperand(1)))
    Cond = Cond.getOperand(0);

  // If condition flag is set by a X86ISD::CMP, then use it as the condition
  // setting operand in place of the X86ISD::SETCC.
  unsigned CondOpcode = Cond.getOpcode();
  if (CondOpcode == X86ISD::SETCC ||
      CondOpcode == X86ISD::SETCC_CARRY) {
    CC = Cond.getOperand(0);

    SDValue Cmp = Cond.getOperand(1);
    unsigned Opc = Cmp.getOpcode();
    // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
    if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
      Cond = Cmp;
      addTest = false;
    } else {
      switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
      default: break;
      case X86::COND_O:
      case X86::COND_B:
        // These can only come from an arithmetic instruction with overflow,
        // e.g. SADDO, UADDO.
        Cond = Cond.getOperand(1);
        addTest = false;
        break;
      }
    }
  }
  CondOpcode = Cond.getOpcode();
  if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
      CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
      ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
       Cond.getOperand(0).getValueType() != MVT::i8)) {
    SDValue LHS = Cond.getOperand(0);
    SDValue RHS = Cond.getOperand(1);
    unsigned X86Opcode;
    unsigned X86Cond;
    SDVTList VTs;
    // Keep this in sync with LowerXALUO, otherwise we might create redundant
    // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
    // X86ISD::INC).
    switch (CondOpcode) {
    case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
    case ISD::SADDO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
    case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
    case ISD::SSUBO:
      if (isOneConstant(RHS)) {
        X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
        break;
      }
      X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
    case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
    case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
    default: llvm_unreachable("unexpected overflowing operator");
    }
    if (Inverted)
      X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
    if (CondOpcode == ISD::UMULO)
      VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
                          MVT::i32);
    else
      VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);

    SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);

    if (CondOpcode == ISD::UMULO)
      Cond = X86Op.getValue(2);
    else
      Cond = X86Op.getValue(1);

    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    addTest = false;
  } else {
    unsigned CondOpc;
    if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
      SDValue Cmp = Cond.getOperand(0).getOperand(1);
      if (CondOpc == ISD::OR) {
        // Also, recognize the pattern generated by an FCMP_UNE. We can emit
        // two branches instead of an explicit OR instruction with a
        // separate test.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp)) {
          CC = Cond.getOperand(0).getOperand(0);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = Cond.getOperand(1).getOperand(0);
          Cond = Cmp;
          addTest = false;
        }
      } else { // ISD::AND
        // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
        // two branches instead of an explicit AND instruction with a
        // separate test. However, we only do this if this block doesn't
        // have a fall-through edge, because this requires an explicit
        // jmp when the condition is false.
        if (Cmp == Cond.getOperand(1).getOperand(1) &&
            isX86LogicalCmp(Cmp) &&
            Op.getNode()->hasOneUse()) {
          X86::CondCode CCode =
            (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
          CCode = X86::GetOppositeBranchCondition(CCode);
          CC = DAG.getConstant(CCode, dl, MVT::i8);
          SDNode *User = *Op.getNode()->use_begin();
          // Look for an unconditional branch following this conditional branch.
          // We need this because we need to reverse the successors in order
          // to implement FCMP_OEQ.
          if (User->getOpcode() == ISD::BR) {
            SDValue FalseBB = User->getOperand(1);
            SDNode *NewBR =
              DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
            assert(NewBR == User); (void)NewBR;
            Dest = FalseBB;

            Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                                Chain, Dest, CC, Cmp);
            X86::CondCode CCode =
              (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
            CCode = X86::GetOppositeBranchCondition(CCode);
            CC = DAG.getConstant(CCode, dl, MVT::i8);
            Cond = Cmp;
            addTest = false;
          }
        }
      }
    } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
      // Recognize the xorb (setcc), 1 pattern. The xor inverts the condition.
      // It should be transformed during dag combiner except when the condition
      // is set by an arithmetic-with-overflow node.
      X86::CondCode CCode =
        (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
      CCode = X86::GetOppositeBranchCondition(CCode);
      CC = DAG.getConstant(CCode, dl, MVT::i8);
      Cond = Cond.getOperand(0).getOperand(1);
      addTest = false;
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
      // For FCMP_OEQ, we can emit
      // two branches instead of an explicit AND instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_OEQ.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User); (void)NewBR;
          Dest = FalseBB;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
        }
      }
    } else if (Cond.getOpcode() == ISD::SETCC &&
               cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
      // For FCMP_UNE, we can emit
      // two branches instead of an explicit OR instruction with a
      // separate test. However, we only do this if this block doesn't
      // have a fall-through edge, because this requires an explicit
      // jmp when the condition is false.
      if (Op.getNode()->hasOneUse()) {
        SDNode *User = *Op.getNode()->use_begin();
        // Look for an unconditional branch following this conditional branch.
        // We need this because we need to reverse the successors in order
        // to implement FCMP_UNE.
        if (User->getOpcode() == ISD::BR) {
          SDValue FalseBB = User->getOperand(1);
          SDNode *NewBR =
            DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
          assert(NewBR == User); (void)NewBR;

          SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
                                    Cond.getOperand(0), Cond.getOperand(1));
          Cmp = ConvertCmpIfNecessary(Cmp, DAG);
          CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
          Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                              Chain, Dest, CC, Cmp);
          CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
          Cond = Cmp;
          addTest = false;
          Dest = FalseBB;
        }
      }
    }
  }

  if (addTest) {
    // Look past the truncate if the high bits are known zero.
    if (isTruncWithZeroHighBitsInput(Cond, DAG))
      Cond = Cond.getOperand(0);

    // We know the result is compared against zero. Try to match it to BT.
    if (Cond.hasOneUse()) {
      if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
        CC = NewSetCC.getOperand(0);
        Cond = NewSetCC.getOperand(1);
        addTest = false;
      }
    }
  }

  if (addTest) {
    X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
    CC = DAG.getConstant(X86Cond, dl, MVT::i8);
    Cond = EmitTest(Cond, X86Cond, dl, DAG);
  }
  Cond = ConvertCmpIfNecessary(Cond, DAG);
  return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
                     Chain, Dest, CC, Cond);
}
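
// Rough shape of the FCMP_OEQ rewrite above (assuming a ucomiss-style
// compare): 'br (setoeq %a, %b), %taken, %false' becomes
//   ucomiss %b, %a
//   jne  %false          ; ZF == 0 -> not equal
//   jp   %false          ; PF == 1 -> unordered
//   jmp  %taken
// i.e. two branches on the inverted conditions instead of AND-ing two
// flag-derived setcc results.
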
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
// Calls to _alloca are needed to probe the stack when allocating more than 4k
// bytes in one go. Touching the stack at 4K increments is necessary to ensure
// that the guard pages used by the OS virtual memory manager are allocated in
// correct sequence.
SDValue
X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  bool SplitStack = MF.shouldSplitStack();
  bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
  bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
               SplitStack || EmitStackProbe;
  SDLoc dl(Op);

  // Get the inputs.
  SDNode *Node = Op.getNode();
  SDValue Chain = Op.getOperand(0);
  SDValue Size  = Op.getOperand(1);
  unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
  EVT VT = Node->getValueType(0);

  // Chain the dynamic stack allocation so that it doesn't modify the stack
  // pointer when other instructions are using the stack.
  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);

  bool Is64Bit = Subtarget.is64Bit();
  MVT SPTy = getPointerTy(DAG.getDataLayout());

  SDValue Result;
  if (!Lower) {
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
    assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
                    " not tell us which reg is the stack pointer!");

    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
    Chain = SP.getValue(1);
    const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
    unsigned StackAlign = TFI.getStackAlignment();
    Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
    if (Align > StackAlign)
      Result = DAG.getNode(ISD::AND, dl, VT, Result,
                           DAG.getConstant(-(uint64_t)Align, dl, VT));
    Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
  } else if (SplitStack) {
    MachineRegisterInfo &MRI = MF.getRegInfo();

    if (Is64Bit) {
      // The 64-bit implementation of segmented stacks needs to clobber both
      // r10 and r11. This makes it impossible to use it along with nested
      // parameters.
      const Function *F = MF.getFunction();
      for (const auto &A : F->args()) {
        if (A.hasNestAttr())
          report_fatal_error("Cannot use segmented stacks with functions that "
                             "have nested arguments.");
      }
    }

    const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
    unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
    Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
    Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
                         DAG.getRegister(Vreg, SPTy));
  } else {
    SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
    Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
    MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);

    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned SPReg = RegInfo->getStackRegister();
    SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
    Chain = SP.getValue(1);

    if (Align) {
      SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
                       DAG.getConstant(-(uint64_t)Align, dl, VT));
      Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
    }

    Result = SP;
  }

  Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
                             DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);

  SDValue Ops[2] = {Result, Chain};
  return DAG.getMergeValues(Ops, dl);
}
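
// A rough sketch of the three strategies above for 'p = alloca(size)':
//   - generic:      Result = (SP - Size) & -Align; SP = Result
//   - split stacks: X86ISD::SEG_ALLOCA grows or switches segments for Size
//   - Windows:      X86ISD::WIN_ALLOCA probes each 4K page, then SP is
//                   re-read (and re-aligned if the alloca is over-aligned)
// The CALLSEQ_START/END pair keeps unrelated stack users off the region while
// the allocation is in flight.
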
SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  auto PtrVT = getPointerTy(MF.getDataLayout());
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();

  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  SDLoc DL(Op);

  if (!Subtarget.is64Bit() ||
      Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
    // vastart just stores the address of the VarArgsFrameIndex slot into the
    // memory location argument.
    SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
    return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
                        MachinePointerInfo(SV));
  }

  // __va_list_tag:
  //   gp_offset         (0 - 6 * 8)
  //   fp_offset         (48 - 48 + 8 * 16)
  //   overflow_arg_area (point to parameters coming in memory).
  //   reg_save_area
  SmallVector<SDValue, 8> MemOps;
  SDValue FIN = Op.getOperand(1);
  // Store gp_offset
  SDValue Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV));
  MemOps.push_back(Store);

  // Store fp_offset
  FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
  Store = DAG.getStore(
      Op.getOperand(0), DL,
      DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
      MachinePointerInfo(SV, 4));
  MemOps.push_back(Store);

  // Store ptr to overflow_arg_area
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
  SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
  Store =
      DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
  MemOps.push_back(Store);

  // Store ptr to reg_save_area.
  FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
      Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
  SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
  Store = DAG.getStore(
      Op.getOperand(0), DL, RSFIN, FIN,
      MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
  MemOps.push_back(Store);
  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
}
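
// The four stores above lay out the SysV x86-64 va_list; in C terms the
// equivalent structure is (illustrative):
//   typedef struct {
//     unsigned gp_offset;       // byte 0: next GPR slot (0..48)
//     unsigned fp_offset;       // byte 4: next XMM slot (48..176)
//     void *overflow_arg_area;  // byte 8: stack-passed arguments
//     void *reg_save_area;      // byte 16 (LP64): spilled register block
//   } __va_list_tag;
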
SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
  assert(Subtarget.is64Bit() &&
         "LowerVAARG only handles 64-bit va_arg!");
  assert(Op.getNumOperands() == 4);

  MachineFunction &MF = DAG.getMachineFunction();
  if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
    // The Win64 ABI uses char* instead of a structure.
    return DAG.expandVAArg(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue SrcPtr = Op.getOperand(1);
  const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
  unsigned Align = Op.getConstantOperandVal(3);
  SDLoc dl(Op);

  EVT ArgVT = Op.getNode()->getValueType(0);
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
  uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
  uint8_t ArgMode;

  // Decide which area this value should be read from.
  // TODO: Implement the AMD64 ABI in its entirety. This simple
  // selection mechanism works only for the basic types.
  if (ArgVT == MVT::f80) {
    llvm_unreachable("va_arg for f80 not yet implemented");
  } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
    ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
  } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
    ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
  } else {
    llvm_unreachable("Unhandled argument type in LowerVAARG");
  }

  if (ArgMode == 2) {
    // Sanity Check: Make sure using fp_offset makes sense.
    assert(!Subtarget.useSoftFloat() &&
           !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
           Subtarget.hasSSE1());
  }

  // Insert VAARG_64 node into the DAG.
  // VAARG_64 returns two values: Variable Argument Address, Chain.
  SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
                       DAG.getConstant(ArgMode, dl, MVT::i8),
                       DAG.getConstant(Align, dl, MVT::i32)};
  SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
  SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
                                          VTs, InstOps, MVT::i64,
                                          MachinePointerInfo(SV),
                                          /*Align=*/0,
                                          /*Volatile=*/false,
                                          /*ReadMem=*/true,
                                          /*WriteMem=*/true);
  Chain = VAARG.getValue(1);

  // Load the next argument and return it.
  return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
}
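
// Conceptually, VAARG_64 later expands to the usual SysV sequence (sketch):
// for a GPR-class argument, 'if (gp_offset < 48) { addr = reg_save_area +
// gp_offset; gp_offset += 8; } else { addr = overflow_arg_area;
// overflow_arg_area += 8; }', with fp_offset / 16-byte steps for the XMM case.
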
static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
  // where a va_list is still an i8*.
  assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
  if (Subtarget.isCallingConvWin64(
          DAG.getMachineFunction().getFunction()->getCallingConv()))
    // Probably a Win64 va_copy.
    return DAG.expandVACopy(Op.getNode());

  SDValue Chain = Op.getOperand(0);
  SDValue DstPtr = Op.getOperand(1);
  SDValue SrcPtr = Op.getOperand(2);
  const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
  const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  SDLoc DL(Op);

  return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
                       DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
                       /*AlwaysInline=*/false, /*isTailCall=*/false,
                       MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
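
// sizeof(__va_list_tag) is 24 on SysV x86-64 (i32 + i32 + i8* + i8*), which
// is why va_copy degenerates to the fixed 24-byte, 8-aligned memcpy built
// above.
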
/// Handle vector element shifts where the shift amount is a constant.
/// Takes the immediate version of the shift opcode as input.
static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                          SDValue SrcOp, uint64_t ShiftAmt,
                                          SelectionDAG &DAG) {
  MVT ElementType = VT.getVectorElementType();

  // Bitcast the source vector to the output type, this is mainly necessary for
  // vXi8/vXi64 shifts.
  if (VT != SrcOp.getSimpleValueType())
    SrcOp = DAG.getBitcast(VT, SrcOp);

  // Fold this packed shift into its first operand if ShiftAmt is 0.
  if (ShiftAmt == 0)
    return SrcOp;

  // Check for ShiftAmt >= element width.
  if (ShiftAmt >= ElementType.getSizeInBits()) {
    if (Opc == X86ISD::VSRAI)
      ShiftAmt = ElementType.getSizeInBits() - 1;
    else
      return DAG.getConstant(0, dl, VT);
  }

  assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
         && "Unknown target vector shift-by-constant node");

  // Fold this packed vector shift into a build vector if SrcOp is a
  // vector of Constants or UNDEFs.
  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
    SmallVector<SDValue, 8> Elts;
    unsigned NumElts = SrcOp->getNumOperands();
    ConstantSDNode *ND;

    switch(Opc) {
    default: llvm_unreachable("Unknown opcode!");
    case X86ISD::VSHLI:
      for (unsigned i=0; i!=NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRLI:
      for (unsigned i=0; i!=NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
      }
      break;
    case X86ISD::VSRAI:
      for (unsigned i=0; i!=NumElts; ++i) {
        SDValue CurrentOp = SrcOp->getOperand(i);
        if (CurrentOp->isUndef()) {
          Elts.push_back(CurrentOp);
          continue;
        }
        ND = cast<ConstantSDNode>(CurrentOp);
        const APInt &C = ND->getAPIntValue();
        Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
      }
      break;
    }

    return DAG.getBuildVector(VT, dl, Elts);
  }

  return DAG.getNode(Opc, dl, VT, SrcOp,
                     DAG.getConstant(ShiftAmt, dl, MVT::i8));
}
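
// Example of the constant fold above: a VSHLI by 4 of
//   build_vector <i16 1, i16 2, i16 undef, i16 8>
// becomes build_vector <i16 16, i16 32, i16 undef, i16 128> directly, so no
// shift instruction survives to instruction selection.
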
/// Handle vector element shifts where the shift amount may or may not be a
/// constant. Takes the immediate version of the shift opcode as input.
static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                   SDValue SrcOp, SDValue ShAmt,
                                   const X86Subtarget &Subtarget,
                                   SelectionDAG &DAG) {
  MVT SVT = ShAmt.getSimpleValueType();
  assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");

  // Catch shift-by-constant.
  if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
    return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
                                      CShAmt->getZExtValue(), DAG);

  // Change opcode to the non-immediate version.
  switch (Opc) {
  default: llvm_unreachable("Unknown target vector shift node");
  case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
  case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
  case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
  }

  // Need to build a vector containing the shift amount.
  // SSE/AVX packed shifts only use the lower 64-bit of the shift count.
  // +=================+============+=======================================+
  // | ShAmt is        | HasSSE4.1? | Construct ShAmt vector as             |
  // +=================+============+=======================================+
  // | i64             | Yes, No    | Use ShAmt as lowest elt               |
  // | i32             | Yes        | zero-extend in-reg                    |
  // | (i32 zext(i16)) | Yes        | zero-extend in-reg                    |
  // | i16/i32         | No         | v4i32 build_vector(ShAmt, 0, ud, ud)) |
  // +=================+============+=======================================+

  if (SVT == MVT::i64)
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
  else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
           ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
    ShAmt = ShAmt.getOperand(0);
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else if (Subtarget.hasSSE41() &&
             ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
    ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
  } else {
    SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
                                     DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
    ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
  }

  // The return type has to be a 128-bit type with the same element
  // type as the input type.
  MVT EltVT = VT.getVectorElementType();
  MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());

  ShAmt = DAG.getBitcast(ShVT, ShAmt);
  return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
}
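
// The zero-extension dance above exists because SSE/AVX per-element shifts
// (e.g. PSRLW) consume the entire low 64 bits of the count register: an i32
// amount placed with a plain scalar_to_vector would leave bits 32..63
// undefined, so it is widened with a zero-extend-in-reg to v2i64 first.
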
/// \brief Return Mask with the necessary casting or extending
/// for \p Mask according to \p MaskVT when lowering masking intrinsics
static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                           const X86Subtarget &Subtarget, SelectionDAG &DAG,
                           const SDLoc &dl) {

  if (isAllOnesConstant(Mask))
    return DAG.getTargetConstant(1, dl, MaskVT);
  if (X86::isZeroNode(Mask))
    return DAG.getTargetConstant(0, dl, MaskVT);

  if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
    // Mask should be extended.
    Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
                       MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
  }

  if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
    if (MaskVT == MVT::v64i1) {
      assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
      // In 32-bit mode a bitcast of i64 is illegal; split the mask instead.
      SDValue Lo, Hi;
      Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(0, dl, MVT::i32));
      Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
                       DAG.getConstant(1, dl, MVT::i32));

      Lo = DAG.getBitcast(MVT::v32i1, Lo);
      Hi = DAG.getBitcast(MVT::v32i1, Hi);

      return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
    } else {
      // MaskVT requires fewer than 64 bits. Truncate the mask (this should
      // always succeed), then bitcast.
      MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
      return DAG.getBitcast(MaskVT,
                            DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
    }
  }

  MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                   Mask.getSimpleValueType().getSizeInBits());
  // When MaskVT is v2i1 or v4i1, the low 2 or 4 elements
  // are extracted by EXTRACT_SUBVECTOR.
  return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
                     DAG.getBitcast(BitcastVT, Mask),
                     DAG.getIntPtrConstant(0, dl));
}
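
// Example: an i8 mask heading for a v2i1 MaskVT is bitcast to v8i1 and its
// low two lanes extracted; on a 32-bit target an i64 mask heading for v64i1
// cannot be bitcast directly, so it is split into two i32 halves, each
// bitcast to v32i1, and concatenated back together.
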
/// \brief Return (and \p Op, \p Mask) for compare instructions or
/// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
/// necessary casting or extending for \p Mask when lowering masking intrinsics
static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
  unsigned OpcodeSelect = ISD::VSELECT;
  SDLoc dl(Op);

  if (isAllOnesConstant(Mask))
    return Op;

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

  switch (Op.getOpcode()) {
  default: break;
  case X86ISD::PCMPEQM:
  case X86ISD::PCMPGTM:
  case X86ISD::CMPM:
  case X86ISD::CMPMU:
    return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
  case X86ISD::VFPCLASS:
  case X86ISD::VFPCLASSS:
    return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
  case X86ISD::VTRUNC:
  case X86ISD::VTRUNCS:
  case X86ISD::VTRUNCUS:
  case X86ISD::CVTPS2PH:
    // We can't use ISD::VSELECT here because it is not always "Legal"
    // for the destination type. For example, vpmovqb requires only AVX512,
    // while a vselect on byte elements requires BWI.
    OpcodeSelect = X86ISD::SELECT;
    break;
  }

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
}
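
// In other words, for a typical masked intrinsic the result computed above is
//   (vselect %vmask, (op ...), %passthru)
// with %passthru forced to the zero vector for the zero-masking ("maskz")
// variants, and an AND/OR against the mask for the compare/fpclass nodes
// whose results are already i1 vectors.
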
/// \brief Creates an SDNode for a predicated scalar operation.
/// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
/// The mask is coming as MVT::i8 and it should be transformed
/// to MVT::v1i1 while lowering masking intrinsics.
/// The main difference between ScalarMaskingNode and VectorMaskingNode is using
/// "X86select" instead of "vselect". We just can't create the "vselect" node
/// for a scalar instruction.
static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                    SDValue PreservedSrc,
                                    const X86Subtarget &Subtarget,
                                    SelectionDAG &DAG) {

  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
    if (MaskConst->getZExtValue() & 0x1)
      return Op;

  MVT VT = Op.getSimpleValueType();
  SDLoc dl(Op);

  SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
  if (Op.getOpcode() == X86ISD::FSETCCM ||
      Op.getOpcode() == X86ISD::FSETCCM_RND)
    return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
  if (Op.getOpcode() == X86ISD::VFPCLASSS)
    return DAG.getNode(ISD::OR, dl, VT, Op, IMask);

  if (PreservedSrc.isUndef())
    PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
  return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
}
static int getSEHRegistrationNodeSize(const Function *Fn) {
  if (!Fn->hasPersonalityFn())
    report_fatal_error(
        "querying registration node size for function without personality");
  // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
  // WinEHStatePass for the full struct definition.
  switch (classifyEHPersonality(Fn->getPersonalityFn())) {
  case EHPersonality::MSVC_X86SEH: return 24;
  case EHPersonality::MSVC_CXX: return 16;
  default: break;
  }
  report_fatal_error(
      "can only recover FP for 32-bit MSVC EH personality functions");
}
/// When the MSVC runtime transfers control to us, either to an outlined
/// function or when returning to a parent frame after catching an exception, we
/// recover the parent frame pointer by doing arithmetic on the incoming EBP.
/// Here's the math:
///   RegNodeBase = EntryEBP - RegNodeSize
///   ParentFP = RegNodeBase - ParentFrameOffset
/// Subtracting RegNodeSize takes us to the offset of the registration node, and
/// subtracting the offset (negative on x86) takes us back to the parent FP.
static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
                                   SDValue EntryEBP) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDLoc dl;

  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());

  // It's possible that the parent function no longer has a personality function
  // if the exceptional code was optimized away, in which case we just return
  // the incoming EBP.
  if (!Fn->hasPersonalityFn())
    return EntryEBP;

  // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
  // registration, or the .set_setframe offset.
  MCSymbol *OffsetSym =
      MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
  SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
  SDValue ParentFrameOffset =
      DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);

  // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
  // prologue to RBP in the parent function.
  const X86Subtarget &Subtarget =
      static_cast<const X86Subtarget &>(DAG.getSubtarget());
  if (Subtarget.is64Bit())
    return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);

  int RegNodeSize = getSEHRegistrationNodeSize(Fn);
  // RegNodeBase = EntryEBP - RegNodeSize
  // ParentFP = RegNodeBase - ParentFrameOffset
  SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
                                    DAG.getConstant(RegNodeSize, dl, PtrVT));
  return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
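
// Worked example for the 32-bit path (hypothetical numbers): with MSVC C++
// EH RegNodeSize == 16, EntryEBP == 0x1000 and a ParentFrameOffset of -8,
// RegNodeBase == 0xFF0 and the recovered parent FP == 0xFF8.
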
static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  // Helper to detect if the operand is CUR_DIRECTION rounding mode.
  auto isRoundModeCurDirection = [](SDValue Rnd) {
    if (!isa<ConstantSDNode>(Rnd))
      return false;

    unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
    return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
  };

  SDLoc dl(Op);
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  MVT VT = Op.getSimpleValueType();
  const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
  if (IntrData) {
    switch(IntrData->Type) {
    case INTR_TYPE_1OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
    case INTR_TYPE_2OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2));
    case INTR_TYPE_3OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3));
    case INTR_TYPE_4OP:
      return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
                         Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
    case INTR_TYPE_1OP_MASK_RM: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue RoundingMode;
      // We always add the rounding mode to the node.
      // If the rounding mode is not specified, we add the
      // "current direction" mode.
      if (Op.getNumOperands() == 4)
        RoundingMode =
          DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      else
        RoundingMode = Op.getOperand(4);
      assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              RoundingMode),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_1OP_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      //   - the RM opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue passThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, VT, Src1, Src2, Rnd),
                                      Mask, passThru, Subtarget, DAG);
      }
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, passThru, Subtarget, DAG);
    }
    case INTR_TYPE_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src0 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // There are 2 kinds of intrinsics in this group:
      //   (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
      //   (2) With rounding mode and sae - 7 operands.
      if (Op.getNumOperands() == 6) {
        SDValue Sae = Op.getOperand(5);
        return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                                Sae),
                                    Mask, Src0, Subtarget, DAG);
      }
      assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
      SDValue RoundingMode = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
                                              RoundingMode, Sae),
                                  Mask, Src0, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK:
    case INTR_TYPE_2OP_IMM8_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
        Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      // TODO: Intrinsics should have fast-math-flags to propagate.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_2OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // mode.
      // First, we check if the intrinsic has a rounding mode (6 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 6)
        Rnd = Op.getOperand(5);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_SCALAR_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      SDValue Sae = Op.getOperand(6);

      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
                                              Src2, Src3, Sae),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_MASK_RM: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Imm = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);
      // We specify 2 possible modes for intrinsics, with/without rounding
      // mode.
      // First, we check if the intrinsic has a rounding mode (7 operands);
      // if not, we set the rounding mode to "current".
      SDValue Rnd;
      if (Op.getNumOperands() == 7)
        Rnd = Op.getOperand(6);
      else
        Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Imm, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case INTR_TYPE_3OP_IMM8_MASK:
    case INTR_TYPE_3OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue PassThru = Op.getOperand(4);
      SDValue Mask = Op.getOperand(5);

      if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
        Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(6);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_2OP_MASK : {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue PassThru = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src2, Src1),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case VPERM_3OP_MASKZ:
    case VPERM_3OP_MASK:{
      MVT VT = Op.getSimpleValueType();
      // Src2 is the PassThru.
      SDValue Src1 = Op.getOperand(1);
      // PassThru needs to be the same type as the destination in order
      // to pattern match correctly.
      SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == VPERM_3OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else
        PassThru = Src2;

      // Swap Src1 and Src2 in the node creation.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src2, Src1, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_MASK3:
    case FMA_OP_MASKZ:
    case FMA_OP_MASK: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      // We specify 2 possible opcodes for intrinsics with rounding modes.
      // First, we check if the intrinsic may have a non-default rounding mode
      // (IntrData->Opc1 != 0), then we check the rounding mode operand.
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src1, Src2, Src3, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
      }
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
                                              dl, Op.getValueType(),
                                              Src1, Src2, Src3),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FMA_OP_SCALAR_MASK:
    case FMA_OP_SCALAR_MASK3:
    case FMA_OP_SCALAR_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Mask = Op.getOperand(4);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = SDValue();

      // Set the PassThru element.
      if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);
      else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
        PassThru = Src3;
      else
        PassThru = Src1;

      SDValue Rnd = Op.getOperand(5);
      return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
                                              Op.getValueType(), Src1, Src2,
                                              Src3, Rnd),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case TERLOG_OP_MASK:
    case TERLOG_OP_MASKZ: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue Src3 = Op.getOperand(3);
      SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
      SDValue Mask = Op.getOperand(5);
      MVT VT = Op.getSimpleValueType();
      SDValue PassThru = Src1;
      // Set the PassThru element.
      if (IntrData->Type == TERLOG_OP_MASKZ)
        PassThru = getZeroVector(VT, Subtarget, DAG, dl);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              Src1, Src2, Src3, Src4),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case CVTPD2PS:
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
                         DAG.getIntPtrConstant(0, dl));
    case CVTPD2PS_MASK: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      // We add the rounding mode to the node when
      //   - the RM opcode is specified and
      //   - RM is not "current direction".
      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
      if (IntrWithRoundingModeOpcode != 0) {
        SDValue Rnd = Op.getOperand(4);
        if (!isRoundModeCurDirection(Rnd)) {
          return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
                                                  dl, Op.getValueType(),
                                                  Src, Rnd),
                                      Mask, PassThru, Subtarget, DAG);
        }
      }
      assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
      // ISD::FP_ROUND has a second argument that indicates if the truncation
      // does not change the value. Set it to 0 since it can change.
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
                                              DAG.getIntPtrConstant(0, dl)),
                                  Mask, PassThru, Subtarget, DAG);
    }
    case FPCLASS: {
      // FPclass intrinsics with mask.
      SDValue Src1 = Op.getOperand(1);
      MVT VT = Src1.getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
      SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
                                                 DAG.getTargetConstant(0, dl, MaskVT),
                                                 Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), FPclassMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case FPCLASSS: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Imm = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
      SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
          DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case CMP_MASK:
    case CMP_MASK_CC: {
      // Comparison intrinsics with masks.
      // Example of transformation:
      //   (i8 (int_x86_avx512_mask_pcmpeq_q_128
      //            (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
      //   (i8 (bitcast
      //     (v8i1 (insert_subvector undef,
      //             (v2i1 (and (PCMPEQM %a, %b),
      //                        (extract_subvector
      //                           (v8i1 (bitcast %mask)), 0))), 0))))
      MVT VT = Op.getOperand(1).getSimpleValueType();
      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
      SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
      MVT BitcastVT = MVT::getVectorVT(MVT::i1,
                                       Mask.getSimpleValueType().getSizeInBits());
      SDValue Cmp;
      if (IntrData->Type == CMP_MASK_CC) {
        SDValue CC = Op.getOperand(3);
        CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
        // We specify 2 possible opcodes for intrinsics with rounding modes.
        // First, we check if the intrinsic may have a non-default rounding mode
        // (IntrData->Opc1 != 0), then we check the rounding mode operand.
        if (IntrData->Opc1 != 0) {
          SDValue Rnd = Op.getOperand(5);
          if (!isRoundModeCurDirection(Rnd))
            Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
                              Op.getOperand(2), CC, Rnd);
        }
        // Default rounding mode.
        if (!Cmp.getNode())
          Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                            Op.getOperand(2), CC);
      } else {
        assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
        Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
                          Op.getOperand(2));
      }
      SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MaskVT),
                                             Subtarget, DAG);
      SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
                                DAG.getUNDEF(BitcastVT), CmpMask,
                                DAG.getIntPtrConstant(0, dl));
      return DAG.getBitcast(Op.getValueType(), Res);
    }
    case CMP_MASK_SCALAR_CC: {
      SDValue Src1 = Op.getOperand(1);
      SDValue Src2 = Op.getOperand(2);
      SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
      SDValue Mask = Op.getOperand(4);

      SDValue Cmp;
      if (IntrData->Opc1 != 0) {
        SDValue Rnd = Op.getOperand(5);
        if (!isRoundModeCurDirection(Rnd))
          Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
      }
      // Default rounding mode.
      if (!Cmp.getNode())
        Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);

      SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
                                             DAG.getTargetConstant(0, dl,
                                                                   MVT::i1),
                                             Subtarget, DAG);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
                         DAG.getIntPtrConstant(0, dl));
    }
    case COMI: { // Comparison intrinsics
      ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
      SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
      SDValue SetCC;
      switch (CC) {
      case ISD::SETEQ: { // (ZF = 1 and PF = 0)
        SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
        SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
        break;
      }
      case ISD::SETNE: { // (ZF = 0 or PF = 1)
        SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
        SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
        SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
        break;
      }
      case ISD::SETGT: // (CF = 0 and ZF = 0)
        SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
        break;
      case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
        SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
        break;
      }
      case ISD::SETGE: // CF = 0
        SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
        break;
      case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
        SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
        break;
      default:
        llvm_unreachable("Unexpected illegal condition!");
      }
      return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
    }
    case COMI_RM: { // Comparison intrinsics with Sae
      SDValue LHS = Op.getOperand(1);
      SDValue RHS = Op.getOperand(2);
      unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
      SDValue Sae = Op.getOperand(4);

      SDValue FCmp;
      if (isRoundModeCurDirection(Sae))
        FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8));
      else
        FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
                           DAG.getConstant(CondVal, dl, MVT::i8), Sae);
      return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
                         DAG.getIntPtrConstant(0, dl));
    }
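    // For instance, the comieq_ss-style intrinsics go through the SETEQ arm
    // above: COMISS/COMISD leave ZF/PF/CF in EFLAGS, ordered-equal is "ZF set
    // and PF clear", hence the sete/setnp pair combined with AND before the
    // final zero-extension to i32.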
19841 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19842 Op.getOperand(1), Op.getOperand(2), Subtarget,
19844 case COMPRESS_EXPAND_IN_REG: {
19845 SDValue Mask = Op.getOperand(3);
19846 SDValue DataToCompress = Op.getOperand(1);
19847 SDValue PassThru = Op.getOperand(2);
19848 if (isAllOnesConstant(Mask)) // return data as is
19849 return Op.getOperand(1);
19851 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19853 Mask, PassThru, Subtarget, DAG);
19856 SDValue Mask = Op.getOperand(1);
19857 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19858 Mask.getSimpleValueType().getSizeInBits());
19859 Mask = DAG.getBitcast(MaskVT, Mask);
19860 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19863 MVT VT = Op.getSimpleValueType();
19864 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19866 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19867 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19868 // Arguments should be swapped.
19869 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19870 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19872 return DAG.getBitcast(VT, Res);
19875 MVT VT = Op.getSimpleValueType();
19876 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19878 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19879 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19880 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19881 return DAG.getBitcast(VT, Res);
19884 case FIXUPIMMS_MASKZ:
19886 case FIXUPIMM_MASKZ:{
19887 SDValue Src1 = Op.getOperand(1);
19888 SDValue Src2 = Op.getOperand(2);
19889 SDValue Src3 = Op.getOperand(3);
19890 SDValue Imm = Op.getOperand(4);
19891 SDValue Mask = Op.getOperand(5);
19892 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19893 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19894 // We specify 2 possible modes for intrinsics, with/without rounding
19896 // First, we check if the intrinsic have rounding mode (7 operands),
19897 // if not, we set rounding mode to "current".
19899 if (Op.getNumOperands() == 7)
19900 Rnd = Op.getOperand(6);
19902 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19903 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19904 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19905 Src1, Src2, Src3, Imm, Rnd),
19906 Mask, Passthru, Subtarget, DAG);
19907 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19908 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19909 Src1, Src2, Src3, Imm, Rnd),
19910 Mask, Passthru, Subtarget, DAG);
19912 case CONVERT_TO_MASK: {
19913 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19914 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19915 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19917 SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
19919 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19920 DAG.getUNDEF(BitcastVT), CvtMask,
19921 DAG.getIntPtrConstant(0, dl));
19922 return DAG.getBitcast(Op.getValueType(), Res);
    case BRCST_SUBVEC_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue Passthru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);
      EVT resVT = Passthru.getValueType();
      SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
                                   DAG.getUNDEF(resVT), Src,
                                   DAG.getIntPtrConstant(0, dl));
      SDValue immVal;
      if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
        immVal = DAG.getConstant(0x44, dl, MVT::i8);
      else
        immVal = DAG.getConstant(0, dl, MVT::i8);
      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
                                              subVec, subVec, immVal),
                                  Mask, Passthru, Subtarget, DAG);
    }
    case BRCST32x2_TO_VEC: {
      SDValue Src = Op.getOperand(1);
      SDValue PassThru = Op.getOperand(2);
      SDValue Mask = Op.getOperand(3);

      assert((VT.getScalarType() == MVT::i32 ||
              VT.getScalarType() == MVT::f32) && "Unexpected type!");
      // Bitcast Src to packed 64-bit elements.
      MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
      MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
      Src = DAG.getBitcast(BitcastVT, Src);

      return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
                                  Mask, PassThru, Subtarget, DAG);
    }
    }
  }

  switch (IntNo) {
  default: return SDValue(); // Don't custom lower most intrinsics.
  case Intrinsic::x86_avx2_permd:
  case Intrinsic::x86_avx2_permps:
    // Operands intentionally swapped. Mask is last operand to intrinsic,
    // but second operand for node/instruction.
    return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
                       Op.getOperand(2), Op.getOperand(1));
  // ptest and testp intrinsics. The intrinsics these come from are designed to
  // return an integer value, not just an instruction, so lower them to the
  // ptest or testp pattern and a setcc for the result.
  case Intrinsic::x86_sse41_ptestz:
  case Intrinsic::x86_sse41_ptestc:
  case Intrinsic::x86_sse41_ptestnzc:
  case Intrinsic::x86_avx_ptestz_256:
  case Intrinsic::x86_avx_ptestc_256:
  case Intrinsic::x86_avx_ptestnzc_256:
  case Intrinsic::x86_avx_vtestz_ps:
  case Intrinsic::x86_avx_vtestc_ps:
  case Intrinsic::x86_avx_vtestnzc_ps:
  case Intrinsic::x86_avx_vtestz_pd:
  case Intrinsic::x86_avx_vtestc_pd:
  case Intrinsic::x86_avx_vtestnzc_pd:
  case Intrinsic::x86_avx_vtestz_ps_256:
  case Intrinsic::x86_avx_vtestc_ps_256:
  case Intrinsic::x86_avx_vtestnzc_ps_256:
  case Intrinsic::x86_avx_vtestz_pd_256:
  case Intrinsic::x86_avx_vtestc_pd_256:
  case Intrinsic::x86_avx_vtestnzc_pd_256: {
    bool IsTestPacked = false;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
    case Intrinsic::x86_avx_vtestz_ps:
    case Intrinsic::x86_avx_vtestz_pd:
    case Intrinsic::x86_avx_vtestz_ps_256:
    case Intrinsic::x86_avx_vtestz_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestz:
    case Intrinsic::x86_avx_ptestz_256:
      // ZF = 1
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_avx_vtestc_ps:
    case Intrinsic::x86_avx_vtestc_pd:
    case Intrinsic::x86_avx_vtestc_ps_256:
    case Intrinsic::x86_avx_vtestc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestc:
    case Intrinsic::x86_avx_ptestc_256:
      // CF = 1
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_avx_vtestnzc_ps:
    case Intrinsic::x86_avx_vtestnzc_pd:
    case Intrinsic::x86_avx_vtestnzc_ps_256:
    case Intrinsic::x86_avx_vtestnzc_pd_256:
      IsTestPacked = true;
      LLVM_FALLTHROUGH;
    case Intrinsic::x86_sse41_ptestnzc:
    case Intrinsic::x86_avx_ptestnzc_256:
      // ZF and CF = 0
      X86CC = X86::COND_A;
      break;
    }

    SDValue LHS = Op.getOperand(1);
    SDValue RHS = Op.getOperand(2);
    unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
    SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }
  case Intrinsic::x86_avx512_kortestz_w:
  case Intrinsic::x86_avx512_kortestc_w: {
    X86::CondCode X86CC =
        (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
    SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_avx512_knot_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
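    // A v16i1 constant 1 is a splat of i1 true, i.e. an all-ones mask, so the
    // XOR below implements KNOT of the 16-bit mask.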
    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kandn_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    // Invert LHS for the not.
    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
                      DAG.getConstant(1, dl, MVT::v16i1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_avx512_kxnor_w: {
    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
    // Invert result for the not.
    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
                      DAG.getConstant(1, dl, MVT::v16i1));
    return DAG.getBitcast(MVT::i16, Res);
  }

  case Intrinsic::x86_sse42_pcmpistria128:
  case Intrinsic::x86_sse42_pcmpestria128:
  case Intrinsic::x86_sse42_pcmpistric128:
  case Intrinsic::x86_sse42_pcmpestric128:
  case Intrinsic::x86_sse42_pcmpistrio128:
  case Intrinsic::x86_sse42_pcmpestrio128:
  case Intrinsic::x86_sse42_pcmpistris128:
  case Intrinsic::x86_sse42_pcmpestris128:
  case Intrinsic::x86_sse42_pcmpistriz128:
  case Intrinsic::x86_sse42_pcmpestriz128: {
    unsigned Opcode;
    X86::CondCode X86CC;
    switch (IntNo) {
    default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
    case Intrinsic::x86_sse42_pcmpistria128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpestria128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_A;
      break;
    case Intrinsic::x86_sse42_pcmpistric128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpestric128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_B;
      break;
    case Intrinsic::x86_sse42_pcmpistrio128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpestrio128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_O;
      break;
    case Intrinsic::x86_sse42_pcmpistris128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpestris128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_S;
      break;
    case Intrinsic::x86_sse42_pcmpistriz128:
      Opcode = X86ISD::PCMPISTRI;
      X86CC = X86::COND_E;
      break;
    case Intrinsic::x86_sse42_pcmpestriz128:
      Opcode = X86ISD::PCMPESTRI;
      X86CC = X86::COND_E;
      break;
    }
    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
    SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
    return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
  }

  case Intrinsic::x86_sse42_pcmpistri128:
  case Intrinsic::x86_sse42_pcmpestri128: {
    unsigned Opcode;
    if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
      Opcode = X86ISD::PCMPISTRI;
    else
      Opcode = X86ISD::PCMPESTRI;

    SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
    SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
    return DAG.getNode(Opcode, dl, VTs, NewOps);
  }

  case Intrinsic::eh_sjlj_lsda: {
    MachineFunction &MF = DAG.getMachineFunction();
    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
    MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
    auto &Context = MF.getMMI().getContext();
    MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
                                            Twine(MF.getFunctionNumber()));
    return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
  }

  case Intrinsic::x86_seh_lsda: {
    // Compute the symbol for the LSDA. We know it'll get emitted later.
    MachineFunction &MF = DAG.getMachineFunction();
    SDValue Op1 = Op.getOperand(1);
    auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
    MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
        GlobalValue::dropLLVMManglingEscape(Fn->getName()));

    // Generate a simple absolute symbol reference. This intrinsic is only
    // supported on 32-bit Windows, which isn't PIC.
    SDValue Result = DAG.getMCSymbol(LSDASym, VT);
    return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
  }

  case Intrinsic::x86_seh_recoverfp: {
    SDValue FnOp = Op.getOperand(1);
    SDValue IncomingFPOp = Op.getOperand(2);
    GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
    auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
    if (!Fn)
      report_fatal_error(
          "llvm.x86.seh.recoverfp must take a function as the first argument");
    return recoverFramePointer(DAG, Fn, IncomingFPOp);
  }

  case Intrinsic::localaddress: {
    // Returns one of the stack, base, or frame pointer registers, depending on
    // which is used to reference local variables.
    MachineFunction &MF = DAG.getMachineFunction();
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    unsigned Reg;
    if (RegInfo->hasBasePointer(MF))
      Reg = RegInfo->getBaseRegister();
    else // This function handles the SP or FP case.
      Reg = RegInfo->getPtrSizedFrameRegister(MF);
    return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
  }
  }
}

static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                                 SDValue Src, SDValue Mask, SDValue Base,
                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
                                 const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  EVT MaskVT = Mask.getValueType();
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                             SDValue Src, SDValue Mask, SDValue Base,
                             SDValue Index, SDValue ScaleOp, SDValue Chain,
                             const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  // If source is undef or we know it won't be used, use a zero vector
  // to break register dependency.
  // TODO: use undef instead and let ExecutionDepsFix deal with it?
  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
  SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
  return DAG.getMergeValues(RetOps, dl);
}

static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                              SDValue Src, SDValue Mask, SDValue Base,
                              SDValue Index, SDValue ScaleOp, SDValue Chain,
                              const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT = MVT::getVectorVT(MVT::i1,
                                Index.getSimpleValueType().getVectorNumElements());

  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
  SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
  return SDValue(Res, 1);
}

static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Mask, SDValue Base, SDValue Index,
                               SDValue ScaleOp, SDValue Chain,
                               const X86Subtarget &Subtarget) {
  SDLoc dl(Op);
  auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
  // Scale must be constant.
  if (!C)
    return SDValue();
  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
  SDValue Segment = DAG.getRegister(0, MVT::i32);
  MVT MaskVT =
      MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
  SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
  SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
  SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
  return SDValue(Res, 0);
}

/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
                                       SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget,
                                       SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the XCR register to
  // return.
  SDValue Chain =
      DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
  SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
  Chain = SDValue(N1, 0);

  // Reads the content of XCR and returns it in registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // Merge the two 32-bit values into a 64-bit one.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read performance monitor
/// counters (x86_rdpmc).
static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
                                      SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      SmallVectorImpl<SDValue> &Results) {
  assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue LO, HI;

  // The ECX register is used to select the index of the performance counter
  // to read.
  SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
                                   N->getOperand(2));
  SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);

  // Reads the content of a 64-bit performance counter and returns it in the
  // registers EDX:EAX.
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  Chain = HI.getValue(1);

  if (Subtarget.is64Bit()) {
    // The EAX register is loaded with the low-order 32 bits. The EDX register
    // is loaded with the supported high-order bits of the counter.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

/// Handles the lowering of builtin intrinsics that read the time stamp counter
/// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
/// READCYCLECOUNTER nodes.
static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
                                    SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget,
                                    SmallVectorImpl<SDValue> &Results) {
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
  SDValue LO, HI;

  // The processor's time-stamp counter (a 64-bit MSR) is stored into the
  // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
  // and the EAX register is loaded with the low-order 32 bits.
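  // For example, a counter value of 0x0000001234ABCDEF comes back as
  // EDX = 0x00000012 and EAX = 0x34ABCDEF; the code below rebuilds the
  // 64-bit value as (HI << 32) | LO.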
  if (Subtarget.is64Bit()) {
    LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
                            LO.getValue(2));
  } else {
    LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
    HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
                            LO.getValue(2));
  }
  SDValue Chain = HI.getValue(1);

  if (Opcode == X86ISD::RDTSCP_DAG) {
    assert(N->getNumOperands() == 3 && "Unexpected number of operands!");

    // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
    // the ECX register. Add 'ecx' explicitly to the chain.
    SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
                                     HI.getValue(2));
    // Explicitly store the content of ECX at the location passed in input
    // to the 'rdtscp' intrinsic.
    Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
                         MachinePointerInfo());
  }

  if (Subtarget.is64Bit()) {
    // The EDX register is loaded with the high-order 32 bits of the MSR, and
    // the EAX register is loaded with the low-order 32 bits.
    SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
                              DAG.getConstant(32, DL, MVT::i8));
    Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
    Results.push_back(Chain);
    return;
  }

  // Use a buildpair to merge the two 32-bit values into a 64-bit one.
  SDValue Ops[] = { LO, HI };
  SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
  Results.push_back(Pair);
  Results.push_back(Chain);
}

static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
  SmallVector<SDValue, 2> Results;
  SDLoc DL(Op);
  getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
                          Results);
  return DAG.getMergeValues(Results, DL);
}

static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue RegNode = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EH registrations only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
  EHInfo->EHRegNodeFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
  MachineFunction &MF = DAG.getMachineFunction();
  SDValue Chain = Op.getOperand(0);
  SDValue EHGuard = Op.getOperand(2);
  WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
  if (!EHInfo)
    report_fatal_error("EHGuard only live in functions using WinEH");

  // Cast the operand to an alloca, and remember the frame index.
  auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
  if (!FINode)
    report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
  EHInfo->EHGuardFrameIndex = FINode->getIndex();

  // Return the chain operand without making any DAG nodes.
  return Chain;
}

/// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
                SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
                SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
  SDValue Ops[] = { Chain, Val, Ptr, Undef };
  return SignedSat ?
    DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

/// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
                      SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
                      MachineMemOperand *MMO, SelectionDAG &DAG) {
  SDVTList VTs = DAG.getVTList(MVT::Other);
  SDValue Ops[] = { Chain, Ptr, Mask, Val };
  return SignedSat ?
    DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
    DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
}

static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();

  const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
  if (!IntrData) {
    switch (IntNo) {
    case llvm::Intrinsic::x86_seh_ehregnode:
      return MarkEHRegistrationNode(Op, DAG);
    case llvm::Intrinsic::x86_seh_ehguard:
      return MarkEHGuard(Op, DAG);
    case llvm::Intrinsic::x86_flags_read_u32:
    case llvm::Intrinsic::x86_flags_read_u64:
    case llvm::Intrinsic::x86_flags_write_u32:
    case llvm::Intrinsic::x86_flags_write_u64: {
      // We need a frame pointer because this will get lowered to a PUSH/POP
      // sequence.
      MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
      MFI.setHasCopyImplyingStackAdjustment(true);
      // Don't do anything here, we will expand these intrinsics out later
      // during ExpandISelPseudos in EmitInstrWithCustomInserter.
      return SDValue();
    }
    case Intrinsic::x86_lwpins32:
    case Intrinsic::x86_lwpins64: {
      SDLoc dl(Op);
      SDValue Chain = Op->getOperand(0);
      SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
      SDValue LwpIns =
          DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
                      Op->getOperand(3), Op->getOperand(4));
      SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
      SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
      return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
                         LwpIns.getValue(1));
    }
    }
    return SDValue();
  }

  SDLoc dl(Op);
  switch(IntrData->Type) {
  default: llvm_unreachable("Unknown Intrinsic Type");
  case RDSEED:
  case RDRAND: {
    // Emit the node with the right value type.
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
    SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
    // Otherwise return the value from Rand, which is always 0, casted to i32.
    SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
                      DAG.getConstant(1, dl, Op->getValueType(1)),
                      DAG.getConstant(X86::COND_B, dl, MVT::i32),
                      SDValue(Result.getNode(), 1) };
    SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
                                  DAG.getVTList(Op->getValueType(1), MVT::Glue),
                                  Ops);

    // Return { result, isValid, chain }.
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                       SDValue(Result.getNode(), 2));
  }
  case GATHER_AVX2: {
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                             Scale, Chain, Subtarget);
  }
  case GATHER: {
    //gather(v1, mask, index, base, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Src = Op.getOperand(2);
    SDValue Base = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Mask = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
                         Chain, Subtarget);
  }
  case SCATTER: {
    //scatter(base, mask, index, v1, scale);
    SDValue Chain = Op.getOperand(0);
    SDValue Base = Op.getOperand(2);
    SDValue Mask = Op.getOperand(3);
    SDValue Index = Op.getOperand(4);
    SDValue Src = Op.getOperand(5);
    SDValue Scale = Op.getOperand(6);
    return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
                          Scale, Chain, Subtarget);
  }
  case PREFETCH: {
    SDValue Hint = Op.getOperand(6);
    unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
    assert((HintVal == 2 || HintVal == 3) &&
           "Wrong prefetch hint in intrinsic: should be 2 or 3");
    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
    SDValue Chain = Op.getOperand(0);
    SDValue Mask = Op.getOperand(2);
    SDValue Index = Op.getOperand(3);
    SDValue Base = Op.getOperand(4);
    SDValue Scale = Op.getOperand(5);
    return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
                           Subtarget);
  }
  // Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
  case RDTSC: {
    SmallVector<SDValue, 2> Results;
    getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
                            Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Read Performance Monitoring Counters.
  case RDPMC: {
    SmallVector<SDValue, 2> Results;
    getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // Get Extended Control Register.
  case XGETBV: {
    SmallVector<SDValue, 2> Results;
    getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
    return DAG.getMergeValues(Results, dl);
  }
  // XTEST intrinsics.
  case XTEST: {
    SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));

    SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
    SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
    return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
                       Ret, SDValue(InTrans.getNode(), 1));
  }
  // ADC/ADCX/SBB
  case ADX: {
    SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
    SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::Other);
    SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
                                DAG.getConstant(-1, dl, MVT::i8));
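    // Adding 0xFF (-1) to the i8 carry-in operand produces a carry-out
    // (CF = 1) exactly when the carry-in byte is nonzero, rematerializing the
    // carry flag for the adjacent ADC/SBB node below.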
    SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
                              Op.getOperand(4), GenCF.getValue(1));
    SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
                                 Op.getOperand(5), MachinePointerInfo());
    SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
    SDValue Results[] = { SetCC, Store };
    return DAG.getMergeValues(Results, dl);
  }
  case COMPRESS_TO_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToCompress = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = DataToCompress.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // return just a store
      return DAG.getStore(Chain, dl, DataToCompress, Addr,
                          MemIntr->getMemOperand());

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

    return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
                              MemIntr->getMemOperand(),
                              false /* truncating */, true /* compressing */);
  }
  case TRUNCATE_TO_MEM_VI8:
  case TRUNCATE_TO_MEM_VI16:
  case TRUNCATE_TO_MEM_VI32: {
    SDValue Mask = Op.getOperand(4);
    SDValue DataToTruncate = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    EVT MemVT = MemIntr->getMemoryVT();

    uint16_t TruncationOp = IntrData->Opc0;
    switch (TruncationOp) {
    case X86ISD::VTRUNC: {
      if (isAllOnesConstant(Mask)) // return just a truncate store
        return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
                                 MemIntr->getMemOperand());

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
                                MemIntr->getMemOperand(), true /* truncating */);
    }
    case X86ISD::VTRUNCUS:
    case X86ISD::VTRUNCS: {
      bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
      if (isAllOnesConstant(Mask))
        return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
                               MemIntr->getMemOperand(), DAG);

      MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);

      return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
                                   VMask, MemVT, MemIntr->getMemOperand(), DAG);
    }
    default:
      llvm_unreachable("Unsupported truncstore intrinsic");
    }
  }
  case EXPAND_FROM_MEM: {
    SDValue Mask = Op.getOperand(4);
    SDValue PassThru = Op.getOperand(3);
    SDValue Addr = Op.getOperand(2);
    SDValue Chain = Op.getOperand(0);
    MVT VT = Op.getSimpleValueType();

    MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
    assert(MemIntr && "Expected MemIntrinsicSDNode!");

    if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
      return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
    if (X86::isZeroNode(Mask))
      return DAG.getUNDEF(VT);

    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
    SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
    return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
                             MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
                             true /* expanding */);
  }
  }
}

SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
                                           SelectionDAG &DAG) const {
  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
  MFI.setReturnAddressIsTaken(true);

  if (verifyReturnAddressArgumentIsConstant(Op, DAG))
    return SDValue();

  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  SDLoc dl(Op);
  EVT PtrVT = getPointerTy(DAG.getDataLayout());

  if (Depth > 0) {
    SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
    const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
    SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
    return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
                       DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
                       MachinePointerInfo());
  }

  // Just load the return address.
  SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
  return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
                     MachinePointerInfo());
}

SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
                                                 SelectionDAG &DAG) const {
  DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
  return getReturnAddressFrameIndex(DAG);
}

SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
  MachineFunction &MF = DAG.getMachineFunction();
  MachineFrameInfo &MFI = MF.getFrameInfo();
  X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  EVT VT = Op.getValueType();

  MFI.setFrameAddressIsTaken(true);

  if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
    // Depth > 0 makes no sense on targets which use Windows unwind codes. It
    // is not possible to crawl up the stack without looking at the unwind codes
    // simultaneously.
    int FrameAddrIndex = FuncInfo->getFAIndex();
    if (!FrameAddrIndex) {
      // Set up a frame object for the return address.
      unsigned SlotSize = RegInfo->getSlotSize();
      FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
          SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
      FuncInfo->setFAIndex(FrameAddrIndex);
    }
    return DAG.getFrameIndex(FrameAddrIndex, VT);
  }

  unsigned FrameReg =
      RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
  SDLoc dl(Op);  // FIXME probably not meaningful
  unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
  assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
          (FrameReg == X86::EBP && VT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
  while (Depth--)
    FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
                            MachinePointerInfo());
  return FrameAddr;
}

// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
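// For example, an IR-level read such as
//   %sp = call i64 @llvm.read_register.i64(metadata !"rsp")
// reaches this hook with RegName == "rsp".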
unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                              SelectionDAG &DAG) const {
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  const MachineFunction &MF = DAG.getMachineFunction();

  unsigned Reg = StringSwitch<unsigned>(RegName)
                       .Case("esp", X86::ESP)
                       .Case("rsp", X86::RSP)
                       .Case("ebp", X86::EBP)
                       .Case("rbp", X86::RBP)
                       .Default(0);

  if (Reg == X86::EBP || Reg == X86::RBP) {
    if (!TFI.hasFP(MF))
      report_fatal_error("register " + StringRef(RegName) +
                         " is allocatable: function has no frame pointer");
#ifndef NDEBUG
    else {
      const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
      unsigned FrameReg =
          RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
      assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
             "Invalid Frame Register!");
    }
#endif
  }

  if (Reg)
    return Reg;

  report_fatal_error("Invalid register name global variable");
}

SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
                                                     SelectionDAG &DAG) const {
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}

unsigned X86TargetLowering::getExceptionPointerRegister(
    const Constant *PersonalityFn) const {
  if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
    return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;

  return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}

unsigned X86TargetLowering::getExceptionSelectorRegister(
    const Constant *PersonalityFn) const {
  // Funclet personalities don't use selectors (the runtime does the selection).
  assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
  return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
}

bool X86TargetLowering::needsFixedCatchObjects() const {
  return Subtarget.isTargetWin64();
}

SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
  SDValue Chain = Op.getOperand(0);
  SDValue Offset = Op.getOperand(1);
  SDValue Handler = Op.getOperand(2);
  SDLoc dl(Op);

  EVT PtrVT = getPointerTy(DAG.getDataLayout());
  const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
  unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
  assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
          (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
         "Invalid Frame Register!");
  SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
  unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;

  SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
                                  DAG.getIntPtrConstant(RegInfo->getSlotSize(),
                                                        dl));
  StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
  Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
  Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);

  return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
                     DAG.getRegister(StoreAddrReg, PtrVT));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
                                               SelectionDAG &DAG) const {
  SDLoc DL(Op);
  // If the subtarget is not 64bit, we may need the global base reg
  // after isel expand pseudo, i.e., after CGBR pass ran.
  // Therefore, ask for the GlobalBaseReg now, so that the pass
  // inserts the code for us in case we need it.
  // Otherwise, we will end up in a situation where we will
  // reference a virtual register that is not defined!
  if (!Subtarget.is64Bit()) {
    const X86InstrInfo *TII = Subtarget.getInstrInfo();
    (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
  }
  return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
                     DAG.getVTList(MVT::i32, MVT::Other),
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
                     Op.getOperand(0), Op.getOperand(1));
}

SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
                                                       SelectionDAG &DAG) const {
  SDLoc DL(Op);
  return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
                     Op.getOperand(0));
}

static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
  return Op.getOperand(0);
}

SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
                                                SelectionDAG &DAG) const {
  SDValue Root = Op.getOperand(0);
  SDValue Trmp = Op.getOperand(1); // trampoline
  SDValue FPtr = Op.getOperand(2); // nested function
  SDValue Nest = Op.getOperand(3); // 'nest' parameter value
  SDLoc dl(Op);

  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();

  if (Subtarget.is64Bit()) {
    SDValue OutChains[6];

    // Large code-model.
    const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
    const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.

    const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
    const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;

    const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
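    // The stores below emit the trampoline, byte for byte:
    //   49 BB <imm64>   movabsq $<nested function>, %r11
    //   49 BA <imm64>   movabsq $<nest value>, %r10
    //   49 FF E3        jmpq *%r11
    // The i16 opcode stores are little-endian, so the REX byte (0x49) is
    // written first.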
    // Load the pointer to the nested function into R11.
    unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
    SDValue Addr = Trmp;
    OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(2, dl, MVT::i64));
    OutChains[1] =
        DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
                     /* Alignment = */ 2);

    // Load the 'nest' parameter value into R10.
    // R10 is specified in X86CallingConv.td
    OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(10, dl, MVT::i64));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 10));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(12, dl, MVT::i64));
    OutChains[3] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
                     /* Alignment = */ 2);

    // Jump to the nested function.
    OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(20, dl, MVT::i64));
    OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
                                Addr, MachinePointerInfo(TrmpAddr, 20));

    unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
                       DAG.getConstant(22, dl, MVT::i64));
    OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 22));

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  } else {
    const Function *Func =
      cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
    CallingConv::ID CC = Func->getCallingConv();
    unsigned NestReg;

    switch (CC) {
    default:
      llvm_unreachable("Unsupported calling convention");
    case CallingConv::C:
    case CallingConv::X86_StdCall: {
      // Pass 'nest' parameter in ECX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::ECX;

      // Check that ECX wasn't needed by an 'inreg' parameter.
      FunctionType *FTy = Func->getFunctionType();
      const AttributeList &Attrs = Func->getAttributes();

      if (!Attrs.isEmpty() && !Func->isVarArg()) {
        unsigned InRegCount = 0;
        unsigned Idx = 1;

        for (FunctionType::param_iterator I = FTy->param_begin(),
             E = FTy->param_end(); I != E; ++I, ++Idx)
          if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
            auto &DL = DAG.getDataLayout();
            // FIXME: should only count parameters that are lowered to integers.
            InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
          }

        if (InRegCount > 2) {
          report_fatal_error("Nest register in use - reduce number of inreg"
                             " parameters!");
        }
      }
      break;
    }
    case CallingConv::X86_FastCall:
    case CallingConv::X86_ThisCall:
    case CallingConv::Fast:
      // Pass 'nest' parameter in EAX.
      // Must be kept in sync with X86CallingConv.td
      NestReg = X86::EAX;
      break;
    }

    SDValue OutChains[4];
    SDValue Addr, Disp;

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(10, dl, MVT::i32));
    Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);

    // This is storing the opcode for MOV32ri.
    const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
    const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
    OutChains[0] =
        DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
                     Trmp, MachinePointerInfo(TrmpAddr));

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(1, dl, MVT::i32));
    OutChains[1] =
        DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
                     /* Alignment = */ 1);

    const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(5, dl, MVT::i32));
    OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
                                Addr, MachinePointerInfo(TrmpAddr, 5),
                                /* Alignment = */ 1);

    Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
                       DAG.getConstant(6, dl, MVT::i32));
    OutChains[3] =
        DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
                     /* Alignment = */ 1);

    return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
  }
}

SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
                                            SelectionDAG &DAG) const {
  /*
   The rounding mode is in bits 11:10 of FPSR, and has the following
   settings:
     00 Round to nearest
     01 Round to -inf
     10 Round to +inf
     11 Round to 0

  FLT_ROUNDS, on the other hand, expects the following:
    -1 Undefined
     0 Round to 0
     1 Round to nearest
     2 Round to +inf
     3 Round to -inf

  To perform the conversion, we do:
    (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
  */
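
  // Worked example: FPSR bits 11:10 = 01 (round to -inf) gives
  // ((FPSR & 0x800) >> 11) = 0 and ((FPSR & 0x400) >> 9) = 2, so
  // ((0 | 2) + 1) & 3 = 3, FLT_ROUNDS' encoding of round to -inf.
  // Likewise bits 11 (round to 0) give ((1 | 2) + 1) & 3 = 0.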
  MachineFunction &MF = DAG.getMachineFunction();
  const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
  unsigned StackAlignment = TFI.getStackAlignment();
  MVT VT = Op.getSimpleValueType();
  SDLoc DL(Op);

  // Save FP Control Word to stack slot
  int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
  SDValue StackSlot =
      DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));

  MachineMemOperand *MMO =
      MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
                              MachineMemOperand::MOStore, 2, 2);

  SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
  SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
                                          DAG.getVTList(MVT::Other),
                                          Ops, MVT::i16, MMO);

  // Load FP Control Word from stack slot
  SDValue CWD =
      DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());

  // Transform as necessary
  SDValue CWD1 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x800, DL, MVT::i16)),
                  DAG.getConstant(11, DL, MVT::i8));
  SDValue CWD2 =
      DAG.getNode(ISD::SRL, DL, MVT::i16,
                  DAG.getNode(ISD::AND, DL, MVT::i16,
                              CWD, DAG.getConstant(0x400, DL, MVT::i16)),
                  DAG.getConstant(9, DL, MVT::i8));

  SDValue RetVal =
      DAG.getNode(ISD::AND, DL, MVT::i16,
                  DAG.getNode(ISD::ADD, DL, MVT::i16,
                              DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
                              DAG.getConstant(1, DL, MVT::i16)),
                  DAG.getConstant(3, DL, MVT::i16));

  return DAG.getNode((VT.getSizeInBits() < 16 ?
                      ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
}

// Split a unary integer op into 2 half sized ops.
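// For example, a v32i8 CTLZ is lowered as two v16i8 CTLZ nodes on the
// extracted halves, with the results concatenated back together.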
static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumElems = VT.getVectorNumElements();
  unsigned SizeInBits = VT.getSizeInBits();

  // Extract the Lo/Hi vectors
  SDLoc dl(Op);
  SDValue Src = Op.getOperand(0);
  SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
  SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
}

// Decompose 256-bit ops into smaller 128-bit ops.
static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is256BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 256-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

// Decompose 512-bit ops into smaller 256-bit ops.
static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().is512BitVector() &&
         Op.getSimpleValueType().isInteger() &&
         "Only handle AVX 512-bit vector integer operation");
  return LowerVectorIntUnary(Op, DAG);
}

/// \brief Lower a vector CTLZ using the natively supported vector CTLZ
/// instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts, and
// concatenate the results.
static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
  assert(Op.getOpcode() == ISD::CTLZ);
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned NumElems = VT.getVectorNumElements();

  assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
         "Unsupported element type");

  // Split the vector; its Lo and Hi parts will be handled in the next
  // iteration.
  if (16 < NumElems)
    return LowerVectorIntUnary(Op, DAG);

  MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
  assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
         "Unsupported value type for operation");

  // Use the natively supported vector instruction vplzcntd.
  Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
  SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
  SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
  SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
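  // For i8 elements, for instance, the zero-extension to i32 prepends 24 zero
  // bits, so the dword lzcnt result is 24 + ctlz(x); subtracting
  // Delta = 32 - 8 = 24 after truncation recovers the i8 leading-zero count.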
  return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
}

// Lower CTLZ using a PSHUFB lookup table implementation.
static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  int NumElts = VT.getVectorNumElements();
  int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
  MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);

  // Per-nibble leading zero PSHUFB lookup table.
  const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
                       /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
                       /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
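  // For example, byte 0x2F has hi nibble 2 and lo nibble 0xF: the hi nibble is
  // nonzero, so the result is LUT[2] = 2 leading zeros. For byte 0x05 the hi
  // nibble is zero, so LUT[0] + LUT[5] = 4 + 1 = 5 leading zeros.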

  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumBytes; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);

  // Begin by bitcasting the input to a byte vector, then split those bytes
  // into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
  // If the hi input nibble is zero then we add both results together, otherwise
  // we just take the hi result (by masking the lo result to zero before the
  // add).
  SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
  SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);

  SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
  SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
  SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);

  Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
  Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
  SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);

  // Merge the result from vXi8 back to VT, working on the lo/hi halves
  // of the current vector width in the same way we did for the nibbles.
  // If the upper half of the input element is zero then add the halves'
  // leading zero counts together, otherwise just use the upper half's.
  // Double the width of the result until we are at target width.
  while (CurrVT != VT) {
    int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
    int CurrNumElts = CurrVT.getVectorNumElements();
    MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
    MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
    SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);

    // Check if the upper half of the input element is zero.
    SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
                               DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
    HiZ = DAG.getBitcast(NextVT, HiZ);

    // Move the upper/lower halves to the lower bits as we'll be extending to
    // NextVT. Mask the lower result to zero if HiZ is true and add the results
    // together.
    SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
    SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
    SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
    R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
    Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
    CurrVT = NextVT;
  }

  return Res;
}

static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
                               const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  if (Subtarget.hasCDI())
    return LowerVectorCTLZ_AVX512CDI(Op, DAG);

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
  return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
}

static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
                         SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT OpVT = VT;
  unsigned NumBits = VT.getSizeInBits();
  SDLoc dl(Op);
  unsigned Opc = Op.getOpcode();

  if (VT.isVector())
    return LowerVectorCTLZ(Op, dl, Subtarget, DAG);

  Op = Op.getOperand(0);
  if (VT == MVT::i8) {
    // Zero extend to i32 since there is not an i8 bsr.
    OpVT = MVT::i32;
    Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
  }

  // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);

  if (Opc == ISD::CTLZ) {
    // If src is zero (i.e. bsr sets ZF), returns NumBits.
    SDValue Ops[] = {
      Op,
      DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
      DAG.getConstant(X86::COND_E, dl, MVT::i8),
      Op.getValue(1)
    };
    Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
  }

  // Finally xor with NumBits-1.
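  // BSR returns the index of the highest set bit, and for a 32-bit value
  // ctlz = 31 - index = 31 ^ index, since 31 is all-ones in the low five bits.
  // The zero-input CMOV value 2*NumBits-1 also becomes NumBits here
  // (e.g. 63 ^ 31 == 32).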
  Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
                   DAG.getConstant(NumBits - 1, dl, OpVT));

  if (VT == MVT::i8)
    Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
  return Op;
}

static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  unsigned NumBits = VT.getScalarSizeInBits();
  SDLoc dl(Op);

  if (VT.isVector()) {
    SDValue N0 = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, VT);

    // lsb(x) = (x & -x)
    SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
                              DAG.getNode(ISD::SUB, dl, VT, Zero, N0));

    // cttz_undef(x) = (width - 1) - ctlz(lsb)
    if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
      SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
      return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
                         DAG.getNode(ISD::CTLZ, dl, VT, LSB));
    }

    // cttz(x) = ctpop(lsb - 1)
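    // For example, x = 0b01100: lsb = x & -x = 0b00100, lsb - 1 = 0b00011,
    // and ctpop(0b00011) = 2 == cttz(x).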
    SDValue One = DAG.getConstant(1, dl, VT);
    return DAG.getNode(ISD::CTPOP, dl, VT,
                       DAG.getNode(ISD::SUB, dl, VT, LSB, One));
  }

  assert(Op.getOpcode() == ISD::CTTZ &&
         "Only scalar CTTZ requires custom lowering");

  // Issue a bsf (scan bits forward) which also sets EFLAGS.
  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));

  // If src is zero (i.e. bsf sets ZF), returns NumBits.
  SDValue Ops[] = {
    Op,
    DAG.getConstant(NumBits, dl, VT),
    DAG.getConstant(X86::COND_E, dl, MVT::i8),
    Op.getValue(1)
  };
  return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}

/// Break a 256-bit integer operation into two new 128-bit ones and then
/// concatenate the result back.
static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();

  assert(VT.is256BitVector() && VT.isInteger() &&
         "Unsupported value type for operation");

  unsigned NumElems = VT.getVectorNumElements();
  SDLoc dl(Op);

  // Extract the LHS vectors
  SDValue LHS = Op.getOperand(0);
  SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
  SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);

  // Extract the RHS vectors
  SDValue RHS = Op.getOperand(1);
  SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
  SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);

  MVT EltVT = VT.getVectorElementType();
  MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);

  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
                     DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
}

21435 /// Break a 512-bit integer operation into two new 256-bit ones and then
21436 /// concatenate the result back.
21437 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21438 MVT VT = Op.getSimpleValueType();
21440 assert(VT.is512BitVector() && VT.isInteger() &&
21441 "Unsupported value type for operation");
21443 unsigned NumElems = VT.getVectorNumElements();
21446 // Extract the LHS vectors
21447 SDValue LHS = Op.getOperand(0);
21448 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21449 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21451 // Extract the RHS vectors
21452 SDValue RHS = Op.getOperand(1);
21453 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21454 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21456 MVT EltVT = VT.getVectorElementType();
21457 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21459 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21460 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21461 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21462 }
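// Editorial sketch (not part of the original source) of what the two
// splitting helpers above produce, e.g. for (add v8i64 A, B) when only
// 256-bit integer ops are available:
//   LHS1/RHS1 = lanes [0..3], LHS2/RHS2 = lanes [4..7]
//   result    = concat_vectors(add(LHS1, RHS1), add(LHS2, RHS2))
// The original opcode is simply re-issued at the half-width type.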
21464 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21465 MVT VT = Op.getSimpleValueType();
21466 if (VT.getScalarType() == MVT::i1)
21467 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21468 Op.getOperand(0), Op.getOperand(1));
21469 assert(Op.getSimpleValueType().is256BitVector() &&
21470 Op.getSimpleValueType().isInteger() &&
21471 "Only handle AVX 256-bit vector integer operation");
21472 return Lower256IntArith(Op, DAG);
21473 }
21475 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21476 assert(Op.getSimpleValueType().is256BitVector() &&
21477 Op.getSimpleValueType().isInteger() &&
21478 "Only handle AVX 256-bit vector integer operation");
21479 return Lower256IntUnary(Op, DAG);
21480 }
21482 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21483 assert(Op.getSimpleValueType().is256BitVector() &&
21484 Op.getSimpleValueType().isInteger() &&
21485 "Only handle AVX 256-bit vector integer operation");
21486 return Lower256IntArith(Op, DAG);
21487 }
21489 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21490 SelectionDAG &DAG) {
21491 SDLoc dl(Op);
21492 MVT VT = Op.getSimpleValueType();
21494 if (VT.getScalarType() == MVT::i1)
21495 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21497 // Decompose 256-bit ops into smaller 128-bit ops.
21498 if (VT.is256BitVector() && !Subtarget.hasInt256())
21499 return Lower256IntArith(Op, DAG);
21501 SDValue A = Op.getOperand(0);
21502 SDValue B = Op.getOperand(1);
21504 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21505 // vector pairs, multiply and truncate.
21506 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21507 if (Subtarget.hasInt256()) {
21508 // For 512-bit vectors, split into 256-bit vectors to allow the
21509 // sign-extension to occur.
21510 if (VT == MVT::v64i8)
21511 return Lower512IntArith(Op, DAG);
21513 // For 256-bit vectors, split into 128-bit vectors to allow the
21514 // sign-extension to occur. We don't need this on AVX512BW as we can
21515 // safely sign-extend to v32i16.
21516 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21517 return Lower256IntArith(Op, DAG);
21519 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21520 return DAG.getNode(
21521 ISD::TRUNCATE, dl, VT,
21522 DAG.getNode(ISD::MUL, dl, ExVT,
21523 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21524 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21525 }
21527 assert(VT == MVT::v16i8 &&
21528 "Pre-AVX2 support only supports v16i8 multiplication");
21529 MVT ExVT = MVT::v8i16;
21531 // Extract the lo parts and sign extend to i16
21532 SDValue ALo, BLo;
21533 if (Subtarget.hasSSE41()) {
21534 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21535 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21536 } else {
21537 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21538 -1, 4, -1, 5, -1, 6, -1, 7};
21539 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21540 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21541 ALo = DAG.getBitcast(ExVT, ALo);
21542 BLo = DAG.getBitcast(ExVT, BLo);
21543 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21544 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21545 }
21547 // Extract the hi parts and sign extend to i16
21548 SDValue AHi, BHi;
21549 if (Subtarget.hasSSE41()) {
21550 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21551 -1, -1, -1, -1, -1, -1, -1, -1};
21552 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21553 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21554 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21555 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21556 } else {
21557 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21558 -1, 12, -1, 13, -1, 14, -1, 15};
21559 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21560 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21561 AHi = DAG.getBitcast(ExVT, AHi);
21562 BHi = DAG.getBitcast(ExVT, BHi);
21563 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21564 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21565 }
21567 // Multiply, mask the lower 8 bits of the lo/hi results and pack
21568 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21569 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21570 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21571 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21572 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21573 }
21575 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21576 if (VT == MVT::v4i32) {
21577 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21578 "Should not custom lower when pmuldq is available!");
21580 // Extract the odd parts.
21581 static const int UnpackMask[] = { 1, -1, 3, -1 };
21582 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21583 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21585 // Multiply the even parts.
21586 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21587 // Now multiply odd parts.
21588 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21590 Evens = DAG.getBitcast(VT, Evens);
21591 Odds = DAG.getBitcast(VT, Odds);
21593 // Merge the two vectors back together with a shuffle. This expands into 2
21594 // shuffles.
21595 static const int ShufMask[] = { 0, 4, 2, 6 };
21596 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21597 }
21599 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21600 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21602 // 32-bit vector types used for MULDQ/MULUDQ.
21603 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21605 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21606 // 32-bits. We can lower with this if the sign bits stretch that far.
21607 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21608 DAG.ComputeNumSignBits(B) > 32) {
21609 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21610 DAG.getBitcast(MulVT, B));
21611 }
21613 // Ahi = psrlqi(a, 32);
21614 // Bhi = psrlqi(b, 32);
21616 // AloBlo = pmuludq(a, b);
21617 // AloBhi = pmuludq(a, Bhi);
21618 // AhiBlo = pmuludq(Ahi, b);
21620 // Hi = psllqi(AloBhi + AhiBlo, 32);
21621 // return AloBlo + Hi;
21622 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21623 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21624 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21626 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21627 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21628 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21630 // Bit cast to 32-bit vectors for MULUDQ.
21631 SDValue Alo = DAG.getBitcast(MulVT, A);
21632 SDValue Blo = DAG.getBitcast(MulVT, B);
21634 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21636 // Only multiply lo/hi halves that aren't known to be zero.
21637 SDValue AloBlo = Zero;
21638 if (!ALoIsZero && !BLoIsZero)
21639 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21641 SDValue AloBhi = Zero;
21642 if (!ALoIsZero && !BHiIsZero) {
21643 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21644 Bhi = DAG.getBitcast(MulVT, Bhi);
21645 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21646 }
21648 SDValue AhiBlo = Zero;
21649 if (!AHiIsZero && !BLoIsZero) {
21650 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21651 Ahi = DAG.getBitcast(MulVT, Ahi);
21652 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21653 }
21655 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21656 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21658 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21659 }
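// Worked example (editorial sketch, not part of the original source) of
// the 64-bit multiply decomposition above. Writing each lane as
// a = Ahi*2^32 + Alo and b = Bhi*2^32 + Blo, with Ahi=3, Alo=2, Bhi=5,
// Blo=4:
//   AloBlo = 2*4 = 8,  AloBhi = 2*5 = 10,  AhiBlo = 3*4 = 12
//   result = AloBlo + ((AloBhi + AhiBlo) << 32) = 8 + (22 << 32)
// which equals (3*2^32+2)*(5*2^32+4) modulo 2^64. The Ahi*Bhi term is
// shifted out entirely, so it is never computed, and the
// MaskedValueIsZero checks let known-zero halves skip their PMULUDQ.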
21661 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21662 SelectionDAG &DAG) {
21663 SDLoc dl(Op);
21664 MVT VT = Op.getSimpleValueType();
21666 // Decompose 256-bit ops into smaller 128-bit ops.
21667 if (VT.is256BitVector() && !Subtarget.hasInt256())
21668 return Lower256IntArith(Op, DAG);
21670 // Only i8 vectors should need custom lowering after this.
21671 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21672 "Unsupported vector type");
21674 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21675 // logical shift down the upper half and pack back to i8.
21676 SDValue A = Op.getOperand(0);
21677 SDValue B = Op.getOperand(1);
21679 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21680 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21681 unsigned Opcode = Op.getOpcode();
21682 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21683 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21685 // AVX2 implementations - extend xmm subvectors to ymm.
21686 if (Subtarget.hasInt256()) {
21687 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21688 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21690 if (VT == MVT::v32i8) {
21691 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21692 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21693 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21694 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21695 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21696 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21697 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21698 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21699 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21700 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21701 DAG.getConstant(8, dl, MVT::v16i16));
21702 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21703 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21704 DAG.getConstant(8, dl, MVT::v16i16));
21705 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21706 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21707 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21708 16, 17, 18, 19, 20, 21, 22, 23};
21709 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21710 24, 25, 26, 27, 28, 29, 30, 31};
21711 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21712 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21713 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21714 }
21716 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21717 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21718 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21719 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21720 DAG.getConstant(8, dl, MVT::v16i16));
21721 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21722 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21723 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21724 }
21726 assert(VT == MVT::v16i8 &&
21727 "Pre-AVX2 support only supports v16i8 multiplication");
21728 MVT ExVT = MVT::v8i16;
21730 // Extract the lo parts and zero/sign extend to i16.
21731 SDValue ALo, BLo;
21732 if (Subtarget.hasSSE41()) {
21733 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21734 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21735 } else {
21736 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21737 -1, 4, -1, 5, -1, 6, -1, 7};
21738 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21739 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21740 ALo = DAG.getBitcast(ExVT, ALo);
21741 BLo = DAG.getBitcast(ExVT, BLo);
21742 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21743 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21744 }
21746 // Extract the hi parts and zero/sign extend to i16.
21747 SDValue AHi, BHi;
21748 if (Subtarget.hasSSE41()) {
21749 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21750 -1, -1, -1, -1, -1, -1, -1, -1};
21751 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21752 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21753 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21754 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21755 } else {
21756 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21757 -1, 12, -1, 13, -1, 14, -1, 15};
21758 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21759 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21760 AHi = DAG.getBitcast(ExVT, AHi);
21761 BHi = DAG.getBitcast(ExVT, BHi);
21762 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21763 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21764 }
21766 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
21767 // pack back to v16i8.
21768 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21769 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21770 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21771 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21772 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21773 }
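// Worked example (editorial sketch, not part of the original source) for
// the widen-multiply-shift-pack scheme above. For i8 lanes a = 200 and
// b = 37 under MULHU:
//   zero-extend to i16 and multiply: 200 * 37 = 7400 = 0x1CE8
//   logical shift right by 8:        0x1C = 28 = mulhu(a, b)
// Under MULHS the inputs are sign-extended instead (a reads as -56), so
// the product is -56 * 37 = -2072 = 0xF7E8 and the high byte 0xF7 = -9.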
21775 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21776 assert(Subtarget.isTargetWin64() && "Unexpected target");
21777 EVT VT = Op.getValueType();
21778 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21779 "Unexpected return type for lowering");
21781 RTLIB::Libcall LC;
21782 bool isSigned;
21783 switch (Op->getOpcode()) {
21784 default: llvm_unreachable("Unexpected request for libcall!");
21785 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21786 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21787 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21788 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21789 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21790 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21791 }
21793 SDLoc dl(Op);
21794 SDValue InChain = DAG.getEntryNode();
21796 TargetLowering::ArgListTy Args;
21797 TargetLowering::ArgListEntry Entry;
21798 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21799 EVT ArgVT = Op->getOperand(i).getValueType();
21800 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21801 "Unexpected argument type for lowering");
21802 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21803 Entry.Node = StackPtr;
21804 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21805 MachinePointerInfo(), /* Alignment = */ 16);
21806 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21807 Entry.Ty = PointerType::get(ArgTy,0);
21808 Entry.IsSExt = false;
21809 Entry.IsZExt = false;
21810 Args.push_back(Entry);
21811 }
21813 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21814 getPointerTy(DAG.getDataLayout()));
21816 TargetLowering::CallLoweringInfo CLI(DAG);
21817 CLI.setDebugLoc(dl)
21818 .setChain(InChain)
21819 .setLibCallee(
21820 getLibcallCallingConv(LC),
21821 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21822 std::move(Args))
21823 .setInRegister()
21824 .setSExtResult(isSigned)
21825 .setZExtResult(!isSigned);
21827 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21828 return DAG.getBitcast(VT, CallInfo.first);
21829 }
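// Editorial sketch (hedged, not from the original source): this lowering
// leans on the Win64 convention that the i128 division/remainder libcalls
// (e.g. __divti3 for RTLIB::SDIV_I128) cannot receive i128 in integer
// registers, so each operand is spilled to a 16-byte-aligned stack slot
// and passed by pointer, while the result is assumed to come back in
// XMM0; that is why the call is typed as v2i64 and the returned value is
// bitcast back to the original 128-bit integer type.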
21831 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21832 SelectionDAG &DAG) {
21833 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21834 MVT VT = Op0.getSimpleValueType();
21835 SDLoc dl(Op);
21837 // Decompose 256-bit ops into smaller 128-bit ops.
21838 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21839 unsigned Opcode = Op.getOpcode();
21840 unsigned NumElems = VT.getVectorNumElements();
21841 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21842 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21843 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21844 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21845 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21846 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21847 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21848 SDValue Ops[] = {
21849 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21850 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21851 };
21852 return DAG.getMergeValues(Ops, dl);
21853 }
21855 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21856 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21858 // PMULxD operations multiply each even value (starting at 0) of LHS with
21859 // the related value of RHS and produce a widened result.
21860 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21861 // => <2 x i64> <ae|cg>
21863 // In other words, to have all the results, we need to perform two PMULxD:
21864 // 1. one with the even values.
21865 // 2. one with the odd values.
21866 // To achieve #2, we need to place the odd values at an even position.
21868 // Place the odd value at an even position (basically, shift all values 1
21869 // step to the left):
21870 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21871 // <a|b|c|d> => <b|undef|d|undef>
21872 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21873 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21874 // <e|f|g|h> => <f|undef|h|undef>
21875 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21876 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21878 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21879 // ints.
21880 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21881 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21882 unsigned Opcode =
21883 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21884 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21885 // => <2 x i64> <ae|cg>
21886 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21887 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21888 // => <2 x i64> <bf|dh>
21889 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21891 // Shuffle it back into the right order.
21892 SDValue Highs, Lows;
21893 if (VT == MVT::v8i32) {
21894 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21895 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21896 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21897 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21898 } else {
21899 const int HighMask[] = {1, 5, 3, 7};
21900 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21901 const int LowMask[] = {0, 4, 2, 6};
21902 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21903 }
21905 // If we have a signed multiply but no PMULDQ, fix up the high parts of an
21906 // unsigned multiply.
21907 if (IsSigned && !Subtarget.hasSSE41()) {
21908 SDValue ShAmt = DAG.getConstant(
21909 31, dl,
21910 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21911 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21912 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21913 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21914 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21916 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21917 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21918 }
21920 // The first result of MUL_LOHI is actually the low value, followed by the
21921 // high value.
21922 SDValue Ops[] = {Lows, Highs};
21923 return DAG.getMergeValues(Ops, dl);
21924 }
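// Worked example (editorial sketch, not part of the original source) of
// the even/odd PMULxD scheme above for v4i32 <a|b|c|d> * <e|f|g|h>:
//   Mul1 = pmuludq(<a|b|c|d>, <e|f|g|h>)  -> <ae.lo|ae.hi|cg.lo|cg.hi>
//   Mul2 = pmuludq(<b|u|d|u>, <f|u|h|u>)  -> <bf.lo|bf.hi|dh.lo|dh.hi>
//   Lows  = shuffle {0,4,2,6}             -> <ae.lo|bf.lo|cg.lo|dh.lo>
//   Highs = shuffle {1,5,3,7}             -> <ae.hi|bf.hi|cg.hi|dh.hi>
// The signed fixup realizes mulhs(a,b) = mulhu(a,b) - (a<0 ? b : 0)
// - (b<0 ? a : 0), computing the corrections as (a >>s 31) & b and
// (b >>s 31) & a.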
21926 // Return true if the required (according to Opcode) shift-imm form is natively
21927 // supported by the Subtarget
21928 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21929 unsigned Opcode) {
21930 if (VT.getScalarSizeInBits() < 16)
21931 return false;
21933 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21934 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21935 return true;
21937 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21938 (VT.is256BitVector() && Subtarget.hasInt256());
21940 bool AShift = LShift && (Subtarget.hasAVX512() ||
21941 (VT != MVT::v2i64 && VT != MVT::v4i64));
21942 return (Opcode == ISD::SRA) ? AShift : LShift;
21943 }
21945 // The shift amount is a variable, but it is the same for all vector lanes.
21946 // These instructions are defined together with shift-immediate.
21947 static
21948 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21949 unsigned Opcode) {
21950 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21951 }
21953 // Return true if the required (according to Opcode) variable-shift form is
21954 // natively supported by the Subtarget
21955 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21956 unsigned Opcode) {
21958 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21959 return false;
21961 // vXi16 supported only on AVX-512, BWI
21962 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21963 return false;
21965 if (Subtarget.hasAVX512())
21966 return true;
21968 bool LShift = VT.is128BitVector() || VT.is256BitVector();
21969 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
21970 return (Opcode == ISD::SRA) ? AShift : LShift;
21971 }
21973 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
21974 const X86Subtarget &Subtarget) {
21975 MVT VT = Op.getSimpleValueType();
21976 SDLoc dl(Op);
21977 SDValue R = Op.getOperand(0);
21978 SDValue Amt = Op.getOperand(1);
21980 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
21981 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
21983 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
21984 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
21985 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
21986 SDValue Ex = DAG.getBitcast(ExVT, R);
21988 // ashr(R, 63) === cmp_slt(R, 0)
21989 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
21990 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
21991 "Unsupported PCMPGT op");
21992 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
21993 getZeroVector(VT, Subtarget, DAG, dl), R);
21994 }
21996 if (ShiftAmt >= 32) {
21997 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
21998 SDValue Upper =
21999 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22000 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22001 ShiftAmt - 32, DAG);
22002 if (VT == MVT::v2i64)
22003 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22004 if (VT == MVT::v4i64)
22005 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22006 {9, 1, 11, 3, 13, 5, 15, 7});
22007 } else {
22008 // SRA upper i32, SHL whole i64 and select lower i32.
22009 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22010 ShiftAmt, DAG);
22011 SDValue Lower =
22012 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22013 Lower = DAG.getBitcast(ExVT, Lower);
22014 if (VT == MVT::v2i64)
22015 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22016 if (VT == MVT::v4i64)
22017 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22018 {8, 1, 10, 3, 12, 5, 14, 7});
22019 }
22020 return DAG.getBitcast(VT, Ex);
22021 };
22023 // Optimize shl/srl/sra with constant shift amount.
22024 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22025 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22026 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22028 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22029 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22031 // i64 SRA needs to be performed as partial shifts.
22032 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22033 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22034 Op.getOpcode() == ISD::SRA)
22035 return ArithmeticShiftRight64(ShiftAmt);
22037 if (VT == MVT::v16i8 ||
22038 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22039 VT == MVT::v64i8) {
22040 unsigned NumElts = VT.getVectorNumElements();
22041 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22043 // Simple i8 add case
22044 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22045 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22047 // ashr(R, 7) === cmp_slt(R, 0)
22048 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22049 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22050 if (VT.is512BitVector()) {
22051 assert(VT == MVT::v64i8 && "Unexpected element type!");
22052 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22053 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22054 }
22055 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22056 }
22058 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22059 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22060 return SDValue();
22062 if (Op.getOpcode() == ISD::SHL) {
22063 // Make a large shift.
22064 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22065 R, ShiftAmt, DAG);
22066 SHL = DAG.getBitcast(VT, SHL);
22067 // Zero out the rightmost bits.
22068 return DAG.getNode(ISD::AND, dl, VT, SHL,
22069 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22070 }
22071 if (Op.getOpcode() == ISD::SRL) {
22072 // Make a large shift.
22073 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22074 R, ShiftAmt, DAG);
22075 SRL = DAG.getBitcast(VT, SRL);
22076 // Zero out the leftmost bits.
22077 return DAG.getNode(ISD::AND, dl, VT, SRL,
22078 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22079 }
22080 if (Op.getOpcode() == ISD::SRA) {
22081 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
22082 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22084 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22085 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22086 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22087 return Res;
22088 }
22089 llvm_unreachable("Unknown shift opcode.");
22090 }
22091 }
22092 }
22094 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22095 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22096 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22097 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22098 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22100 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22101 unsigned SubVectorScale = 1;
22102 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22103 SubVectorScale =
22104 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22105 Amt = Amt.getOperand(0);
22106 }
22108 // Peek through any splat that was introduced for i64 shift vectorization.
22109 int SplatIndex = -1;
22110 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22111 if (SVN->isSplat()) {
22112 SplatIndex = SVN->getSplatIndex();
22113 Amt = Amt.getOperand(0);
22114 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22115 "Splat shuffle referencing second operand");
22116 }
22118 if (Amt.getOpcode() != ISD::BITCAST ||
22119 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22120 return SDValue();
22122 Amt = Amt.getOperand(0);
22123 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22124 (SubVectorScale * VT.getVectorNumElements());
22125 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22126 uint64_t ShiftAmt = 0;
22127 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22128 for (unsigned i = 0; i != Ratio; ++i) {
22129 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22130 if (!C)
22131 return SDValue();
22133 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22134 }
22136 // Check remaining shift amounts (if not a splat).
22137 if (SplatIndex < 0) {
22138 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22139 uint64_t ShAmt = 0;
22140 for (unsigned j = 0; j != Ratio; ++j) {
22141 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22142 if (!C)
22143 return SDValue();
22145 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22146 }
22147 if (ShAmt != ShiftAmt)
22148 return SDValue();
22149 }
22150 }
22152 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22153 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22155 if (Op.getOpcode() == ISD::SRA)
22156 return ArithmeticShiftRight64(ShiftAmt);
22157 }
22159 return SDValue();
22160 }
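// Worked example (editorial sketch, not part of the original source) of
// the SRA-from-SRL identity used above. For an i8 lane R = 0xF0 (-16)
// and ShiftAmt = 2:
//   lshr(R, 2)      = 0x3C
//   Mask = 128 >> 2 = 0x20        (the shifted-down sign bit)
//   (0x3C ^ 0x20) - 0x20 = 0x1C - 0x20 = -4 = ashr(-16, 2)
// XORing and then subtracting the shifted sign-bit mask re-propagates the
// sign into the vacated upper bits.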
22162 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22163 const X86Subtarget &Subtarget) {
22164 MVT VT = Op.getSimpleValueType();
22165 SDLoc dl(Op);
22166 SDValue R = Op.getOperand(0);
22167 SDValue Amt = Op.getOperand(1);
22169 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22170 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22172 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22173 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22175 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22176 SDValue BaseShAmt;
22177 MVT EltVT = VT.getVectorElementType();
22179 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22180 // Check if this build_vector node is doing a splat.
22181 // If so, then set BaseShAmt equal to the splat value.
22182 BaseShAmt = BV->getSplatValue();
22183 if (BaseShAmt && BaseShAmt.isUndef())
22184 BaseShAmt = SDValue();
22185 } else {
22186 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22187 Amt = Amt.getOperand(0);
22189 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22190 if (SVN && SVN->isSplat()) {
22191 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22192 SDValue InVec = Amt.getOperand(0);
22193 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22194 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22195 "Unexpected shuffle index found!");
22196 BaseShAmt = InVec.getOperand(SplatIdx);
22197 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22198 if (ConstantSDNode *C =
22199 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22200 if (C->getZExtValue() == SplatIdx)
22201 BaseShAmt = InVec.getOperand(1);
22202 }
22203 }
22205 if (!BaseShAmt)
22206 // Avoid introducing an extract element from a shuffle.
22207 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22208 DAG.getIntPtrConstant(SplatIdx, dl));
22209 }
22210 }
22212 if (BaseShAmt.getNode()) {
22213 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22214 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22215 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22216 else if (EltVT.bitsLT(MVT::i32))
22217 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22219 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22220 }
22221 }
22223 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22224 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22225 Amt.getOpcode() == ISD::BITCAST &&
22226 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22227 Amt = Amt.getOperand(0);
22228 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22229 VT.getVectorNumElements();
22230 std::vector<SDValue> Vals(Ratio);
22231 for (unsigned i = 0; i != Ratio; ++i)
22232 Vals[i] = Amt.getOperand(i);
22233 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22234 for (unsigned j = 0; j != Ratio; ++j)
22235 if (Vals[j] != Amt.getOperand(i + j))
22236 return SDValue();
22237 }
22239 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22240 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22241 }
22243 return SDValue();
22244 }
22245 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22246 SelectionDAG &DAG) {
22247 MVT VT = Op.getSimpleValueType();
22248 SDLoc dl(Op);
22249 SDValue R = Op.getOperand(0);
22250 SDValue Amt = Op.getOperand(1);
22251 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22253 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22254 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22256 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22257 return V;
22259 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22260 return V;
22262 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22263 return Op;
22265 // XOP has 128-bit variable logical/arithmetic shifts.
22266 // +ve/-ve Amt = shift left/right.
22267 if (Subtarget.hasXOP() &&
22268 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22269 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22270 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22271 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22272 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22273 }
22274 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22275 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22276 if (Op.getOpcode() == ISD::SRA)
22277 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22278 }
22280 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22281 // shifts per-lane and then shuffle the partial results back together.
22282 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22283 // Splat the shift amounts so the scalar shifts above will catch it.
22284 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22285 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22286 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22287 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22288 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22289 }
22291 // i64 vector arithmetic shift can be emulated with the transform:
22292 // M = lshr(SIGN_MASK, Amt)
22293 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
22294 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22295 Op.getOpcode() == ISD::SRA) {
22296 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22297 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22298 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22299 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22300 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22301 return R;
22302 }
22304 // If possible, lower this packed shift into a vector multiply instead of
22305 // expanding it into a sequence of scalar shifts.
22306 // Do this only if the vector shift count is a constant build_vector.
22307 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22308 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22309 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22310 SmallVector<SDValue, 8> Elts;
22311 MVT SVT = VT.getVectorElementType();
22312 unsigned SVTBits = SVT.getSizeInBits();
22313 APInt One(SVTBits, 1);
22314 unsigned NumElems = VT.getVectorNumElements();
22316 for (unsigned i=0; i !=NumElems; ++i) {
22317 SDValue Op = Amt->getOperand(i);
22318 if (Op->isUndef()) {
22319 Elts.push_back(Op);
22320 continue;
22321 }
22323 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22324 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22325 uint64_t ShAmt = C.getZExtValue();
22326 if (ShAmt >= SVTBits) {
22327 Elts.push_back(DAG.getUNDEF(SVT));
22328 continue;
22329 }
22330 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22331 }
22332 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22333 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22334 }
22336 // Lower SHL with variable shift amount.
22337 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22338 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22340 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22341 DAG.getConstant(0x3f800000U, dl, VT));
22342 Op = DAG.getBitcast(MVT::v4f32, Op);
22343 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22344 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22345 }
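// Worked example (editorial sketch, not part of the original source) of
// the float-exponent trick above. Each lane materializes 2^Amt as an IEEE
// single: for Amt = 5, (5 << 23) + 0x3f800000 = 0x42000000 = 32.0f, and
// FP_TO_SINT recovers 32 = 1 << 5, so the final MUL realizes shl R, 5 as
// R * 32 without any variable vector shift instruction.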
22347 // If possible, lower this shift as a sequence of two shifts by
22348 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22350 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22352 // Could be rewritten as:
22353 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22355 // The advantage is that the two shifts from the example would be
22356 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22357 // the vector shift into four scalar shifts plus four pairs of vector
22358 // insert/extract.
22359 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22360 unsigned TargetOpcode = X86ISD::MOVSS;
22361 bool CanBeSimplified;
22362 // The splat value for the first packed shift (the 'X' from the example).
22363 SDValue Amt1 = Amt->getOperand(0);
22364 // The splat value for the second packed shift (the 'Y' from the example).
22365 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22367 // See if it is possible to replace this node with a sequence of
22368 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22369 if (VT == MVT::v4i32) {
22370 // Check if it is legal to use a MOVSS.
22371 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22372 Amt2 == Amt->getOperand(3);
22373 if (!CanBeSimplified) {
22374 // Otherwise, check if we can still simplify this node using a MOVSD.
22375 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22376 Amt->getOperand(2) == Amt->getOperand(3);
22377 TargetOpcode = X86ISD::MOVSD;
22378 Amt2 = Amt->getOperand(2);
22379 }
22380 } else {
22381 // Do similar checks for the case where the machine value type
22382 // is MVT::v8i16.
22383 CanBeSimplified = Amt1 == Amt->getOperand(1);
22384 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22385 CanBeSimplified = Amt2 == Amt->getOperand(i);
22387 if (!CanBeSimplified) {
22388 TargetOpcode = X86ISD::MOVSD;
22389 CanBeSimplified = true;
22390 Amt2 = Amt->getOperand(4);
22391 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22392 CanBeSimplified = Amt1 == Amt->getOperand(i);
22393 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22394 CanBeSimplified = Amt2 == Amt->getOperand(j);
22395 }
22396 }
22398 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22399 isa<ConstantSDNode>(Amt2)) {
22400 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22401 MVT CastVT = MVT::v4i32;
22402 SDValue Splat1 =
22403 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22404 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22405 SDValue Splat2 =
22406 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22407 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22408 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22409 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22410 if (TargetOpcode == X86ISD::MOVSD)
22411 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22412 BitCast2, {0, 1, 6, 7}));
22413 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22414 BitCast2, {0, 5, 6, 7}));
22415 }
22416 }
22418 // v4i32 Non Uniform Shifts.
22419 // If the shift amount is constant we can shift each lane using the SSE2
22420 // immediate shifts, else we need to zero-extend each lane to the lower i64
22421 // and shift using the SSE2 variable shifts.
22422 // The separate results can then be blended together.
22423 if (VT == MVT::v4i32) {
22424 unsigned Opc = Op.getOpcode();
22425 SDValue Amt0, Amt1, Amt2, Amt3;
22426 if (ConstantAmt) {
22427 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22428 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22429 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22430 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22431 } else {
22432 // ISD::SHL is handled above but we include it here for completeness.
22433 switch (Opc) {
22434 default:
22435 llvm_unreachable("Unknown target vector shift node");
22436 case ISD::SHL:
22437 Opc = X86ISD::VSHL;
22438 break;
22439 case ISD::SRL:
22440 Opc = X86ISD::VSRL;
22441 break;
22442 case ISD::SRA:
22443 Opc = X86ISD::VSRA;
22444 break;
22445 }
22446 // The SSE2 shifts use the lower i64 as the same shift amount for
22447 // all lanes and the upper i64 is ignored. These shuffle masks
22448 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22449 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22450 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22451 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22452 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22453 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22454 }
22456 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22457 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22458 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22459 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22460 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22461 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22462 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22463 }
22465 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22466 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22467 // make the existing SSE solution better.
22468 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22469 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22470 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22471 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22472 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22473 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22474 unsigned ExtOpc =
22475 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22476 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22477 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22478 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22479 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22480 }
22482 if (VT == MVT::v16i8 ||
22483 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22484 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22485 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22486 unsigned ShiftOpcode = Op->getOpcode();
22488 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22489 if (VT.is512BitVector()) {
22490 // On AVX512BW targets we make use of the fact that VSELECT lowers
22491 // to a masked blend which selects bytes based just on the sign bit
22492 // extracted to a mask.
22493 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22494 V0 = DAG.getBitcast(VT, V0);
22495 V1 = DAG.getBitcast(VT, V1);
22496 Sel = DAG.getBitcast(VT, Sel);
22497 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22498 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22499 } else if (Subtarget.hasSSE41()) {
22500 // On SSE41 targets we make use of the fact that VSELECT lowers
22501 // to PBLENDVB which selects bytes based just on the sign bit.
22502 V0 = DAG.getBitcast(VT, V0);
22503 V1 = DAG.getBitcast(VT, V1);
22504 Sel = DAG.getBitcast(VT, Sel);
22505 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22506 }
22507 // On pre-SSE41 targets we test for the sign bit by comparing to
22508 // zero - a negative value will set all bits of the lanes to true
22509 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22510 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22511 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22512 return DAG.getSelect(dl, SelVT, C, V0, V1);
22513 };
22515 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22516 // We can safely do this using i16 shifts as we're only interested in
22517 // the 3 lower bits of each byte.
22518 Amt = DAG.getBitcast(ExtVT, Amt);
22519 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22520 Amt = DAG.getBitcast(VT, Amt);
22522 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22523 // r = VSELECT(r, shift(r, 4), a);
22524 SDValue M =
22525 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22526 R = SignBitSelect(VT, Amt, M, R);
22528 // a += a
22529 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22531 // r = VSELECT(r, shift(r, 2), a);
22532 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22533 R = SignBitSelect(VT, Amt, M, R);
22535 // a += a
22536 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22538 // return VSELECT(r, shift(r, 1), a);
22539 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22540 R = SignBitSelect(VT, Amt, M, R);
22541 return R;
22542 }
22544 if (Op->getOpcode() == ISD::SRA) {
22545 // For SRA we need to unpack each byte to the higher byte of an i16 vector
22546 // so we can correctly sign extend. We don't care what happens to the
22547 // lower byte.
22548 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22549 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22550 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22551 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22552 ALo = DAG.getBitcast(ExtVT, ALo);
22553 AHi = DAG.getBitcast(ExtVT, AHi);
22554 RLo = DAG.getBitcast(ExtVT, RLo);
22555 RHi = DAG.getBitcast(ExtVT, RHi);
22557 // r = VSELECT(r, shift(r, 4), a);
22558 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22559 DAG.getConstant(4, dl, ExtVT));
22560 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22561 DAG.getConstant(4, dl, ExtVT));
22562 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22563 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22565 // a += a
22566 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22567 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22569 // r = VSELECT(r, shift(r, 2), a);
22570 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22571 DAG.getConstant(2, dl, ExtVT));
22572 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22573 DAG.getConstant(2, dl, ExtVT));
22574 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22575 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22577 // a += a
22578 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22579 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22581 // r = VSELECT(r, shift(r, 1), a);
22582 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22583 DAG.getConstant(1, dl, ExtVT));
22584 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22585 DAG.getConstant(1, dl, ExtVT));
22586 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22587 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22589 // Logical shift the result back to the lower byte, leaving a zero upper
22590 // byte
22591 // meaning that we can safely pack with PACKUSWB.
22592 RLo =
22593 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22594 RHi =
22595 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22596 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22597 }
22598 }
22600 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22601 MVT ExtVT = MVT::v8i32;
22602 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22603 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22604 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22605 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22606 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22607 ALo = DAG.getBitcast(ExtVT, ALo);
22608 AHi = DAG.getBitcast(ExtVT, AHi);
22609 RLo = DAG.getBitcast(ExtVT, RLo);
22610 RHi = DAG.getBitcast(ExtVT, RHi);
22611 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22612 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22613 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22614 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22615 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22616 }
22618 if (VT == MVT::v8i16) {
22619 unsigned ShiftOpcode = Op->getOpcode();
22621 // If we have a constant shift amount, the non-SSE41 path is best as
22622 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22623 bool UseSSE41 = Subtarget.hasSSE41() &&
22624 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22626 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22627 // On SSE41 targets we make use of the fact that VSELECT lowers
22628 // to PBLENDVB which selects bytes based just on the sign bit.
22629 if (UseSSE41) {
22630 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22631 V0 = DAG.getBitcast(ExtVT, V0);
22632 V1 = DAG.getBitcast(ExtVT, V1);
22633 Sel = DAG.getBitcast(ExtVT, Sel);
22634 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22635 }
22636 // On pre-SSE41 targets we splat the sign bit - a negative value will
22637 // set all bits of the lanes to true and VSELECT uses that in
22638 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22639 SDValue C =
22640 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22641 return DAG.getSelect(dl, VT, C, V0, V1);
22642 };
22644 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22645 if (UseSSE41) {
22646 // On SSE41 targets we need to replicate the shift mask in both
22647 // bytes for PBLENDVB.
22648 Amt = DAG.getNode(
22649 ISD::OR, dl, VT,
22650 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22651 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22652 } else {
22653 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22654 }
22656 // r = VSELECT(r, shift(r, 8), a);
22657 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22658 R = SignBitSelect(Amt, M, R);
22660 // a += a
22661 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22663 // r = VSELECT(r, shift(r, 4), a);
22664 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22665 R = SignBitSelect(Amt, M, R);
22667 // a += a
22668 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22670 // r = VSELECT(r, shift(r, 2), a);
22671 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22672 R = SignBitSelect(Amt, M, R);
22674 // a += a
22675 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22677 // return VSELECT(r, shift(r, 1), a);
22678 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22679 R = SignBitSelect(Amt, M, R);
22680 return R;
22681 }
22683 // Decompose 256-bit shifts into smaller 128-bit shifts.
22684 if (VT.is256BitVector())
22685 return Lower256IntArith(Op, DAG);
22687 return SDValue();
22688 }
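// Worked example (editorial sketch, not part of the original source) of
// the sign-bit-select ladder above. For one v16i8 lane with amt = 5
// (0b101), 'a' is pre-shifted left by 5 so bit 2 of amt sits in the
// byte's sign bit:
//   step 1 (shift by 4): sign set   -> r <<= 4;  a += a
//   step 2 (shift by 2): sign clear -> keep r;   a += a
//   step 3 (shift by 1): sign set   -> r <<= 1
// for r << 5 overall; each PBLENDVB/PCMPGT select consumes one bit of the
// shift amount, most significant first.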
22690 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22691 SelectionDAG &DAG) {
22692 MVT VT = Op.getSimpleValueType();
22693 SDLoc DL(Op);
22694 SDValue R = Op.getOperand(0);
22695 SDValue Amt = Op.getOperand(1);
22696 unsigned Opcode = Op.getOpcode();
22697 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22699 if (Subtarget.hasAVX512()) {
22700 // Attempt to rotate by immediate.
22701 APInt UndefElts;
22702 SmallVector<APInt, 16> EltBits;
22703 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22704 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22705 return EltBits[0] == V;
22706 })) {
22707 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22708 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22709 return DAG.getNode(Op, DL, VT, R,
22710 DAG.getConstant(RotateAmt, DL, MVT::i8));
22711 }
22712 }
22714 // Else, fall-back on VPROLV/VPRORV.
22715 return Op;
22716 }
22718 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22719 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22720 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22722 // XOP has 128-bit vector variable + immediate rotates.
22723 // +ve/-ve Amt = rotate left/right.
22725 // Split 256-bit integers.
22726 if (VT.is256BitVector())
22727 return Lower256IntArith(Op, DAG);
22729 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22731 // Attempt to rotate by immediate.
22732 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22733 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22734 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22735 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22736 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22737 DAG.getConstant(RotateAmt, DL, MVT::i8));
22738 }
22739 }
22741 // Use general rotate by variable (per-element).
22742 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22743 }
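// Worked example (editorial sketch, not part of the original source) for
// the rotate lowerings above: a left rotate by a uniform immediate, e.g.
// rotl(0x12345678, 8) = 0x34567812 on an i32 lane, maps directly to
// X86ISD::VROTLI (AVX-512) or X86ISD::VPROTI (XOP). In the AVX-512 path
// the urem against EltSizeInBits makes an oversized constant such as 40
// equivalent to 40 % 32 = 8.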
22745 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22746 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22747 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22748 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22749 // has only one use.
22750 SDNode *N = Op.getNode();
22751 SDValue LHS = N->getOperand(0);
22752 SDValue RHS = N->getOperand(1);
22753 unsigned BaseOp = 0;
22754 X86::CondCode Cond;
22755 SDLoc DL(Op);
22756 switch (Op.getOpcode()) {
22757 default: llvm_unreachable("Unknown ovf instruction!");
22758 case ISD::SADDO:
22759 // An add of one will be selected as an INC. Note that INC doesn't
22760 // set CF, so we can't do this for UADDO.
22761 if (isOneConstant(RHS)) {
22762 BaseOp = X86ISD::INC;
22763 Cond = X86::COND_O;
22764 break;
22765 }
22766 BaseOp = X86ISD::ADD;
22767 Cond = X86::COND_O;
22768 break;
22769 case ISD::UADDO:
22770 BaseOp = X86ISD::ADD;
22771 Cond = X86::COND_B;
22772 break;
22773 case ISD::SSUBO:
22774 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22775 // set CF, so we can't do this for USUBO.
22776 if (isOneConstant(RHS)) {
22777 BaseOp = X86ISD::DEC;
22778 Cond = X86::COND_O;
22779 break;
22780 }
22781 BaseOp = X86ISD::SUB;
22782 Cond = X86::COND_O;
22783 break;
22784 case ISD::USUBO:
22785 BaseOp = X86ISD::SUB;
22786 Cond = X86::COND_B;
22787 break;
22788 case ISD::SMULO:
22789 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22790 Cond = X86::COND_O;
22791 break;
22792 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22793 if (N->getValueType(0) == MVT::i8) {
22794 BaseOp = X86ISD::UMUL8;
22795 Cond = X86::COND_O;
22796 break;
22797 }
22798 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22799 MVT::i32);
22800 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22802 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22804 if (N->getValueType(1) == MVT::i1)
22805 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22807 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22808 }
22809 }
22811 // Also sets EFLAGS.
22812 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22813 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22815 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22817 if (N->getValueType(1) == MVT::i1)
22818 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22820 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22821 }
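// Worked example (editorial sketch, not part of the original source) of
// the overflow lowering above: for (i8 uaddo 0xFF, 1), X86ISD::ADD yields
// the sum 0x00 plus EFLAGS, CF is set, and the X86::COND_B setcc produces
// the overflow bit. The signed forms test OF via X86::COND_O instead, and
// INC/DEC are only usable there because they leave CF untouched.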
22823 /// Returns true if the operand type is exactly twice the native width, and
22824 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22825 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22826 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
22827 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22828 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22830 if (OpWidth == 64)
22831 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22832 else if (OpWidth == 128)
22833 return Subtarget.hasCmpxchg16b();
22835 return false;
22836 }
22838 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22839 return needsCmpXchgNb(SI->getValueOperand()->getType());
22840 }
22842 // Note: this turns large loads into lock cmpxchg8b/16b.
22843 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22844 TargetLowering::AtomicExpansionKind
22845 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22846 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22847 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22848 : AtomicExpansionKind::None;
22849 }
22851 TargetLowering::AtomicExpansionKind
22852 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22853 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22854 Type *MemType = AI->getType();
22856 // If the operand is too big, we must see if cmpxchg8/16b is available
22857 // and default to library calls otherwise.
22858 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22859 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22860 : AtomicExpansionKind::None;
22861 }
22863 AtomicRMWInst::BinOp Op = AI->getOperation();
22864 switch (Op) {
22865 default:
22866 llvm_unreachable("Unknown atomic operation");
22867 case AtomicRMWInst::Xchg:
22868 case AtomicRMWInst::Add:
22869 case AtomicRMWInst::Sub:
22870 // It's better to use xadd, xsub or xchg for these in all cases.
22871 return AtomicExpansionKind::None;
22872 case AtomicRMWInst::Or:
22873 case AtomicRMWInst::And:
22874 case AtomicRMWInst::Xor:
22875 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22876 // prefix to a normal instruction for these operations.
22877 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22878 : AtomicExpansionKind::None;
22879 case AtomicRMWInst::Nand:
22880 case AtomicRMWInst::Max:
22881 case AtomicRMWInst::Min:
22882 case AtomicRMWInst::UMax:
22883 case AtomicRMWInst::UMin:
22884 // These always require a non-trivial set of data operations on x86. We must
22885 // use a cmpxchg loop.
22886 return AtomicExpansionKind::CmpXChg;
22887 }
22888 }
22890 LoadInst *
22891 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22892 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22893 Type *MemType = AI->getType();
22894 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22895 // there is no benefit in turning such RMWs into loads, and it is actually
22896 // harmful as it introduces a mfence.
22897 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22898 return nullptr;
22900 auto Builder = IRBuilder<>(AI);
22901 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22902 auto SSID = AI->getSyncScopeID();
22903 // We must restrict the ordering to avoid generating loads with Release or
22904 // ReleaseAcquire orderings.
22905 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22906 auto Ptr = AI->getPointerOperand();
22908 // Before the load we need a fence. Here is an example lifted from
22909 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22910 // is required:
22911 // Thread 0:
22912 // x.store(1, relaxed);
22913 // r1 = y.fetch_add(0, release);
22914 // Thread 1:
22915 // y.fetch_add(42, acquire);
22916 // r2 = x.load(relaxed);
22917 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22918 // lowered to just a load without a fence. A mfence flushes the store buffer,
22919 // making the optimization clearly correct.
22920 // FIXME: it is required if isReleaseOrStronger(Order) but it is not clear
22921 // otherwise, we might be able to be more aggressive on relaxed idempotent
22922 // rmw. In practice, they do not look useful, so we don't try to be
22923 // especially clever.
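  //
  // As a sketch of the overall transformation (illustrative IR, not from a
  // specific test): an idempotent RMW such as
  //   %old = atomicrmw add i32* %p, i32 0 acquire
  // becomes
  //   call void @llvm.x86.sse2.mfence()
  //   %old = load atomic i32, i32* %p acquire, align 4
  // which avoids a LOCK-prefixed instruction on the cache line.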
  if (SSID == SyncScope::SingleThread)
    // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
    // the IR level, so we must wrap it in an intrinsic.
    return nullptr;

  if (!Subtarget.hasMFence())
    // FIXME: it might make sense to use a locked operation here but on a
    // different cache-line to prevent cache-line bouncing. In practice it
    // is probably a small win, and x86 processors without mfence are rare
    // enough that we do not bother.
    return nullptr;

  Function *MFence =
      llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
  Builder.CreateCall(MFence, {});

  // Finally we can emit the atomic load.
  LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
      AI->getType()->getPrimitiveSizeInBits());
  Loaded->setAtomic(Order, SSID);
  AI->replaceAllUsesWith(Loaded);
  AI->eraseFromParent();
  return Loaded;
}
static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
                                 SelectionDAG &DAG) {
  SDLoc dl(Op);
  AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
      cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
  SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
      cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());

  // The only fence that needs an instruction is a sequentially-consistent
  // cross-thread fence.
  if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
      FenceSSID == SyncScope::System) {
    if (Subtarget.hasMFence())
      return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));

    SDValue Chain = Op.getOperand(0);
    SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
    SDValue Ops[] = {
      DAG.getRegister(X86::ESP, MVT::i32),    // Base
      DAG.getTargetConstant(1, dl, MVT::i8),  // Scale
      DAG.getRegister(0, MVT::i32),           // Index
      DAG.getTargetConstant(0, dl, MVT::i32), // Disp
      DAG.getRegister(0, MVT::i32),           // Segment.
      Zero,
      Chain
    };
    SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
    return SDValue(Res, 0);
  }

  // MEMBARRIER is a compiler barrier; it codegens to a no-op.
  return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
}
static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  MVT T = Op.getSimpleValueType();
  SDLoc DL(Op);
  unsigned Reg = 0;
  unsigned size = 0;
  switch(T.SimpleTy) {
  default: llvm_unreachable("Invalid value type!");
  case MVT::i8:  Reg = X86::AL;  size = 1; break;
  case MVT::i16: Reg = X86::AX;  size = 2; break;
  case MVT::i32: Reg = X86::EAX; size = 4; break;
  case MVT::i64:
    assert(Subtarget.is64Bit() && "Node not type legal!");
    Reg = X86::RAX; size = 8;
    break;
  }
  SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
                                  Op.getOperand(2), SDValue());
  SDValue Ops[] = { cpIn.getValue(0),
                    Op.getOperand(1),
                    Op.getOperand(3),
                    DAG.getTargetConstant(size, DL, MVT::i8),
                    cpIn.getValue(1) };
  SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
  MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
  SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
                                           Ops, T, MMO);

  SDValue cpOut =
      DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
  SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
                                      MVT::i32, cpOut.getValue(2));
  SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);

  DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
  DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
  return SDValue();
}
static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  MVT SrcVT = Op.getOperand(0).getSimpleValueType();
  MVT DstVT = Op.getSimpleValueType();

  if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
      SrcVT == MVT::i64) {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    if (DstVT != MVT::f64)
      // This conversion needs to be expanded.
      return SDValue();

    SDValue Op0 = Op->getOperand(0);
    SmallVector<SDValue, 16> Elts;
    SDLoc dl(Op);
    unsigned NumElts;
    MVT SVT;
    if (SrcVT.isVector()) {
      NumElts = SrcVT.getVectorNumElements();
      SVT = SrcVT.getVectorElementType();

      // Widen the input vector in the case of MVT::v2i32.
      // Example: from MVT::v2i32 to MVT::v4i32.
      for (unsigned i = 0, e = NumElts; i != e; ++i)
        Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
                                   DAG.getIntPtrConstant(i, dl)));
    } else {
      assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
             "Unexpected source type in LowerBITCAST");
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(0, dl)));
      Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
                                 DAG.getIntPtrConstant(1, dl)));
      NumElts = 2;
      SVT = MVT::i32;
    }
    // Explicitly mark the extra elements as Undef.
    Elts.append(NumElts, DAG.getUNDEF(SVT));

    EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
    SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
                       DAG.getIntPtrConstant(0, dl));
  }

  assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
         Subtarget.hasMMX() && "Unexpected custom BITCAST");
  assert((DstVT == MVT::i64 ||
          (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
         "Unexpected custom BITCAST");
  // i64 <=> MMX conversions are Legal.
  if (SrcVT==MVT::i64 && DstVT.isVector())
    return Op;
  if (DstVT==MVT::i64 && SrcVT.isVector())
    return Op;
  // MMX <=> MMX conversions are Legal.
  if (SrcVT.isVector() && DstVT.isVector())
    return Op;
  // All other conversions need to be expanded.
  return SDValue();
}
/// Compute the horizontal sum of bytes in V for the elements of VT.
///
/// Requires V to be a byte vector and VT to be an integer vector type with
/// wider elements than V's type. The width of the elements of VT determines
/// how many bytes of V are summed horizontally to produce each element of the
/// result.
static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
                                      const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
  SDLoc DL(V);
  MVT ByteVecVT = V.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
         "Expected value to have byte element type.");
  assert(EltVT != MVT::i8 &&
         "Horizontal byte sum only makes sense for wider elements!");
  unsigned VecSize = VT.getSizeInBits();
  assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");

  // The PSADBW instruction horizontally adds all bytes and leaves the result
  // in i64 chunks, thus directly computing the pop count for v2i64 and v4i64.
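  // (PSADBW computes the sum of absolute differences against the second
  // operand per 8-byte group; against a zero vector this degenerates into a
  // plain byte sum, deposited in the low 16 bits of each i64 lane.)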
  if (EltVT == MVT::i64) {
    SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
    return DAG.getBitcast(VT, V);
  }

  if (EltVT == MVT::i32) {
    // We unpack the low half and high half into i32s interleaved with zeros so
    // that we can use PSADBW to horizontally sum them. The most useful part of
    // this is that it lines up the results of two PSADBW instructions to be
    // two v2i64 vectors which concatenated are the 4 population counts. We can
    // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
    SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
    SDValue V32 = DAG.getBitcast(VT, V);
    SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
    SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);

    // Do the horizontal sums into two v2i64s.
    Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
    MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
    Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                      DAG.getBitcast(ByteVecVT, Low), Zeros);
    High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
                       DAG.getBitcast(ByteVecVT, High), Zeros);

    // Merge them together.
    MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
    V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
                    DAG.getBitcast(ShortVecVT, Low),
                    DAG.getBitcast(ShortVecVT, High));

    return DAG.getBitcast(VT, V);
  }

  // The only element type left is i16.
  assert(EltVT == MVT::i16 && "Unknown how to handle type");

  // To obtain the pop count for each i16 element starting from the pop count
  // for i8 elements, shift the i16s left by 8, sum as i8s, and then shift as
  // i16s right by 8. It is important to shift as i16s as i8 vector shift isn't
  // directly supported.
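  // Worked illustration for one i16 lane: with byte pop counts [hi=a, lo=b],
  // the lane holds (a << 8) | b. Shifting left by 8 gives [hi=b, lo=0];
  // adding byte-wise yields [hi=a+b, lo=b]; the final i16 shift right by 8
  // leaves a+b, the lane's total pop count, in the low byte.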
  SDValue ShifterV = DAG.getConstant(8, DL, VT);
  SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
  V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
                  DAG.getBitcast(ByteVecVT, V));
  return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
}
static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  MVT EltVT = VT.getVectorElementType();
  unsigned VecSize = VT.getSizeInBits();

  // Implement a lookup table in register by using an algorithm based on:
  // http://wm.ite.pl/articles/sse-popcount.html
  //
  // The general idea is that every nibble of every byte in the input vector
  // is an index into an in-register pre-computed pop count table. We then
  // split up the input vector into two new ones: (1) a vector with only the
  // shifted-right higher nibbles for each byte and (2) a vector with the lower
  // nibbles (and masked out higher ones) for each byte. PSHUFB is used
  // separately with both to index the in-register table. Next, both are added
  // and the result is an i8 vector where each element contains the pop count
  // for the input byte.
  //
  // To obtain the pop count for elements != i8, we follow up with the same
  // approach and use additional tricks as described below.
  //
  const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
                       /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
                       /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
                       /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
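  // For example, the byte 0xe2 = 0b11100010 splits into high nibble 0xe and
  // low nibble 0x2, so its pop count is LUT[0xe] + LUT[0x2] = 3 + 1 = 4.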
  int NumByteElts = VecSize / 8;
  MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
  SDValue In = DAG.getBitcast(ByteVecVT, Op);
  SmallVector<SDValue, 64> LUTVec;
  for (int i = 0; i < NumByteElts; ++i)
    LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
  SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
  SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);

  // High nibbles
  SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
  SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);

  // Low nibbles
  SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);

  // The input vector is used as the shuffle mask that indexes elements into
  // the LUT. After counting low and high nibbles, add the vector to obtain the
  // final pop count per i8 element.
  SDValue HighPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
  SDValue LowPopCnt =
      DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
  SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);

  if (EltVT == MVT::i8)
    return PopCnt;

  return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
}
static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
                                       const X86Subtarget &Subtarget,
                                       SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitmath lowering supported.");

  int VecSize = VT.getSizeInBits();
  MVT EltVT = VT.getVectorElementType();
  int Len = EltVT.getSizeInBits();

  // This is the vectorized version of the "best" algorithm from
  // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
  // with a minor tweak to use a series of adds + shifts instead of vector
  // multiplications. Implemented for all integer vector types. We only use
  // this when we don't have SSSE3 which allows a LUT-based lowering that is
  // much faster, even faster than using native popcnt instructions.
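  //
  // As a scalar illustration of the three steps below on one byte
  // v = 0b01101100 (pop count 4):
  //   v = v - ((v >> 1) & 0x55)          -> 0b01011000 (2-bit pair counts)
  //   v = (v & 0x33) + ((v >> 2) & 0x33) -> 0b00100010 (nibble counts 2, 2)
  //   v = (v + (v >> 4)) & 0x0F          -> 4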
  auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
    MVT VT = V.getSimpleValueType();
    SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
    return DAG.getNode(OpCode, DL, VT, V, ShifterV);
  };
  auto GetMask = [&](SDValue V, APInt Mask) {
    MVT VT = V.getSimpleValueType();
    SDValue MaskV = DAG.getConstant(Mask, DL, VT);
    return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
  };

  // We don't want to incur the implicit masks required to SRL vNi8 vectors on
  // x86, so set the SRL type to have elements at least i16 wide. This is
  // correct because all of our SRLs are followed immediately by a mask anyways
  // that handles any bits that sneak into the high bits of the byte elements.
  MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);

  SDValue V = Op;

  // v = v - ((v >> 1) & 0x55555555...)
  SDValue Srl =
      DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
  SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
  V = DAG.getNode(ISD::SUB, DL, VT, V, And);

  // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
  SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
  SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
  V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);

  // v = (v + (v >> 4)) & 0x0F0F0F0F...
  Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
  SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
  V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));

  // At this point, V contains the byte-wise population count, and we are
  // merely doing a horizontal sum if necessary to get the wider element
  // type.
  if (EltVT == MVT::i8)
    return V;

  return LowerHorizontalByteSum(
      DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
      DAG);
}
// Please ensure that any codegen change from LowerVectorCTPOP is reflected in
// updated cost models in X86TTIImpl::getIntrinsicInstrCost.
static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                                SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
         "Unknown CTPOP type to handle");
  SDLoc DL(Op.getNode());
  SDValue Op0 = Op.getOperand(0);

  // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
  if (Subtarget.hasVPOPCNTDQ()) {
    if (VT == MVT::v8i16) {
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
    if (VT == MVT::v16i8 || VT == MVT::v16i16) {
      Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
      Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
      return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
    }
  }

  if (!Subtarget.hasSSSE3()) {
    // We can't use the fast LUT approach, so fall back on vectorized bitmath.
    assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
    return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
  }

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Decompose 512-bit ops into smaller 256-bit ops.
  if (VT.is512BitVector() && !Subtarget.hasBWI())
    return Lower512IntUnary(Op, DAG);

  return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
}
static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  assert(Op.getSimpleValueType().isVector() &&
         "We only do custom lowering for vector population count.");
  return LowerVectorCTPOP(Op, Subtarget, DAG);
}
static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  // For scalars, it's still beneficial to transfer to/from the SIMD unit to
  // perform the BITREVERSE.
  if (!VT.isVector()) {
    MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
    SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
    Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }

  int NumElts = VT.getVectorNumElements();
  int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;

  // Decompose 256-bit ops into smaller 128-bit ops.
  if (VT.is256BitVector())
    return Lower256IntUnary(Op, DAG);

  assert(VT.is128BitVector() &&
         "Only 128-bit vector bitreverse lowering supported.");

  // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
  // perform the BSWAP in the shuffle.
  // It's best to shuffle using the second operand as this will implicitly
  // allow memory folding for multiple vectors.
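  // (Each VPPERM control byte selects a source byte in its low 5 bits and an
  // operation in its top 3 bits; operation 2, i.e. 2 << 5, emits the selected
  // byte bit-reversed, so the bit reversal and the byte swap compose into a
  // single shuffle.)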
  SmallVector<SDValue, 16> MaskElts;
  for (int i = 0; i != NumElts; ++i) {
    for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
      int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
      int PermuteByte = SourceByte | (2 << 5);
      MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
    }
  }

  SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
  SDValue Res = DAG.getBitcast(MVT::v16i8, In);
  Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
                    Res, Mask);
  return DAG.getBitcast(VT, Res);
}
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
                               SelectionDAG &DAG) {
  if (Subtarget.hasXOP())
    return LowerBITREVERSE_XOP(Op, DAG);

  assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");

  MVT VT = Op.getSimpleValueType();
  SDValue In = Op.getOperand(0);
  SDLoc DL(Op);

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarType() == MVT::i8 &&
         "Only byte vector BITREVERSE supported");

  // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
  if (VT.is256BitVector() && !Subtarget.hasInt256())
    return Lower256IntUnary(Op, DAG);

  // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
  // two nibbles and a PSHUFB lookup to find the bitreverse of each
  // 0-15 value (moved to the other nibble).
  SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
  SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
  SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));

  const int LoLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
      /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
      /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
      /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
  const int HiLUT[16] = {
      /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
      /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
      /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
      /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
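  // For example, bitreverse(0x6c): lo nibble 0xc, hi nibble 0x6, and
  // LoLUT[0xc] | HiLUT[0x6] = 0x30 | 0x06 = 0x36, i.e. 0b01101100 reversed
  // to 0b00110110.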
  SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
  for (unsigned i = 0; i < NumElts; ++i) {
    LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
    HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
  }

  SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
  SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
  Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
  Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
  return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
  unsigned NewOpc = 0;
  switch (N->getOpcode()) {
  case ISD::ATOMIC_LOAD_ADD:
    NewOpc = X86ISD::LADD;
    break;
  case ISD::ATOMIC_LOAD_SUB:
    NewOpc = X86ISD::LSUB;
    break;
  case ISD::ATOMIC_LOAD_OR:
    NewOpc = X86ISD::LOR;
    break;
  case ISD::ATOMIC_LOAD_XOR:
    NewOpc = X86ISD::LXOR;
    break;
  case ISD::ATOMIC_LOAD_AND:
    NewOpc = X86ISD::LAND;
    break;
  default:
    llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
  }

  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(
      NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
      {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
      /*MemVT=*/N->getSimpleValueType(0), MMO);
}
/// Lower atomic_load_ops into LOCK-prefixed operations.
static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
                                const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  unsigned Opc = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
  // can only be lowered when the result is unused. They should have already
  // been transformed into a cmpxchg loop in AtomicExpand.
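  // (Concretely: an atomicrmw add whose result is used selects "lock xadd",
  // while an unused add/sub/or/and/xor becomes a plain memory-destination
  // instruction with a "lock" prefix via the X86ISD::L* nodes built below.)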
  if (N->hasAnyUseOfValue(0)) {
    // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
    // select LXADD if LOCK_SUB can't be selected.
    if (Opc == ISD::ATOMIC_LOAD_SUB) {
      AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
      RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
      return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
                           RHS, AN->getMemOperand());
    }
    assert(Opc == ISD::ATOMIC_LOAD_ADD &&
           "Used AtomicRMW ops other than Add should have been expanded!");
    return N;
  }

  SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
  // RAUW the chain, but don't worry about the result, as it's unused.
  assert(!N->hasAnyUseOfValue(0));
  DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
  return SDValue();
}
static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
  SDNode *Node = Op.getNode();
  SDLoc dl(Node);
  EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();

  // Convert seq_cst store -> xchg
  // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
  // FIXME: On 32-bit, store -> fist or movq would be more efficient
  //        (The only way to get a 16-byte store is cmpxchg16b)
  // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
  if (cast<AtomicSDNode>(Node)->getOrdering() ==
          AtomicOrdering::SequentiallyConsistent ||
      !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
    SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
                                 cast<AtomicSDNode>(Node)->getMemoryVT(),
                                 Node->getOperand(0),
                                 Node->getOperand(1), Node->getOperand(2),
                                 cast<AtomicSDNode>(Node)->getMemOperand());
    return Swap.getValue(1);
  }
  // Other atomic stores have a simple pattern.
  return Op;
}
static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
  SDNode *N = Op.getNode();
  MVT VT = N->getSimpleValueType(0);

  // Let legalize expand this if it isn't a legal type yet.
  if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
    return SDValue();

  SDVTList VTs = DAG.getVTList(VT, MVT::i32);
  SDLoc DL(N);

  // Set the carry flag.
  SDValue Carry = Op.getOperand(2);
  EVT CarryVT = Carry.getValueType();
  APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
  Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
                      Carry, DAG.getConstant(NegOne, DL, CarryVT));
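  // (Adding all-ones, i.e. -1, to the incoming carry value sets the hardware
  // carry flag exactly when that value is nonzero, materializing the boolean
  // operand as CF for the ADC/SBB emitted below.)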
  unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
  SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
                            Op.getOperand(1), Carry.getValue(1));

  SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
  if (N->getValueType(1) == MVT::i1)
    SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);

  return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
}
static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());

  // For MacOSX, we want to call an alternative entry point: __sincos_stret,
  // which returns the values as { float, float } (in XMM0) or
  // { double, double } (which is returned in XMM0, XMM1).
  SDLoc dl(Op);
  SDValue Arg = Op.getOperand(0);
  EVT ArgVT = Arg.getValueType();
  Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());

  TargetLowering::ArgListTy Args;
  TargetLowering::ArgListEntry Entry;

  Entry.Node = Arg;
  Entry.Ty = ArgTy;
  Entry.IsSExt = false;
  Entry.IsZExt = false;
  Args.push_back(Entry);

  bool isF64 = ArgVT == MVT::f64;
  // Only optimize x86_64 for now. i386 is a bit messy. For f32,
  // the small struct {f32, f32} is returned in (eax, edx). For f64,
  // the results are returned via SRet in memory.
  const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  SDValue Callee =
      DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));

  Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
                      : (Type *)VectorType::get(ArgTy, 4);

  TargetLowering::CallLoweringInfo CLI(DAG);
  CLI.setDebugLoc(dl)
      .setChain(DAG.getEntryNode())
      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));

  std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);

  if (isF64)
    // Returned in xmm0 and xmm1.
    return CallResult.first;

  // Returned in bits 0:31 and 32:63 of xmm0.
  SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(0, dl));
  SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
                               CallResult.first, DAG.getIntPtrConstant(1, dl));
  SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
  return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
}
/// Widen a vector input to a vector of NVT. The
/// input vector must have the same element type as NVT.
static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
                            bool FillWithZeroes = false) {
  // Check if InOp already has the right width.
  MVT InVT = InOp.getSimpleValueType();
  if (InVT == NVT)
    return InOp;

  if (InOp.isUndef())
    return DAG.getUNDEF(NVT);

  assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
         "input and widen element type must match");

  unsigned InNumElts = InVT.getVectorNumElements();
  unsigned WidenNumElts = NVT.getVectorNumElements();
  assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
         "Unexpected request for vector widening");

  SDLoc dl(InOp);
  if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
      InOp.getNumOperands() == 2) {
    SDValue N1 = InOp.getOperand(1);
    if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
        N1.isUndef()) {
      InOp = InOp.getOperand(0);
      InVT = InOp.getSimpleValueType();
      InNumElts = InVT.getVectorNumElements();
    }
  }
  if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
      ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
    SmallVector<SDValue, 16> Ops;
    for (unsigned i = 0; i < InNumElts; ++i)
      Ops.push_back(InOp.getOperand(i));

    EVT EltVT = InOp.getOperand(0).getValueType();

    SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
                                       DAG.getUNDEF(EltVT);
    for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
      Ops.push_back(FillVal);
    return DAG.getBuildVector(NVT, dl, Ops);
  }
  SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
                                     DAG.getUNDEF(NVT);
  return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
                     InOp, DAG.getIntPtrConstant(0, dl));
}
static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
                             SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  // X86 scatter kills the mask register, so its type should be added to
  // the list of return values.
  // If the "scatter" has 2 return values, it is already handled.
  if (Op.getNode()->getNumValues() == 2)
    return Op;

  MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
  SDValue Src = N->getValue();
  MVT VT = Src.getSimpleValueType();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
  SDLoc dl(Op);

  SDValue NewScatter;
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Chain = N->getChain();
  SDValue BasePtr = N->getBasePtr();
  MVT MemVT = N->getMemoryVT().getSimpleVT();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
    // The v2i32 value was promoted to v2i64.
    // Now we "redo" the type legalizer's work and widen the original
    // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
    // with indices 0 and 2.
    assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
           "Unexpected memory type");
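    // (The {0, 2, -1, -1} shuffle below picks the low dword of each qword;
    // the two widened lanes stay undefined and are masked off by the
    // zero-extended mask.)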
    int ShuffleMask[] = {0, 2, -1, -1};
    Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
                               DAG.getUNDEF(MVT::v4i32), ShuffleMask);
    // Now we have 4 elements instead of 2.
    // Expand the index.
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
    Index = ExtendToType(Index, NewIndexVT, DAG);

    // Expand the mask with zeroes.
    // Mask may be <2 x i64> or <2 x i1> at this moment.
    assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
           "Unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
  }

  unsigned NumElts = VT.getVectorNumElements();
  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors, so either the data or the index
    // must be 512 bits wide. If both index and data are 256-bit but the
    // vector contains 8 elements, we just sign-extend the index.
    if (IndexVT == MVT::v8i32)
      // Just extend the index.
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
    else {
      // The minimal number of elts in a scatter is 8.
      NumElts = 8;
      // Index
      MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
      // Use the original index here, do not modify the index twice.
      Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
      if (IndexVT.getScalarType() == MVT::i32)
        Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

      // Mask
      // At this point we have a promoted mask operand.
      assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
      MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
      // Use the original mask here, do not modify the mask twice.
      Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);

      // The value that should be stored.
      MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
      Src = ExtendToType(Src, NewVT, DAG);
    }
  }

  // If the mask is "wide" at this point - truncate it to an i1 vector.
  MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
  Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);

  // The mask is killed by scatter, add it to the values.
  SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
  SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
  NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
                                    N->getMemOperand());
  DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
  return SDValue(NewScatter.getNode(), 1);
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
                          SelectionDAG &DAG) {
  MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
         "Expanding masked load is supported on AVX-512 target only!");

  assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
         "Expanding masked load is supported for 32 and 64-bit types only!");

  // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless
  // of VLX; expanding loads of these types are handled here.
  if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked load op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked load op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
  SDValue Src0 = N->getSrc0();
  Src0 = ExtendToType(Src0, WideDataVT, DAG);

  // The mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
                                      N->getBasePtr(), Mask, Src0,
                                      N->getMemoryVT(), N->getMemOperand(),
                                      N->getExtensionType(),
                                      N->isExpandingLoad());

  SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                NewLoad.getValue(0),
                                DAG.getIntPtrConstant(0, dl));
  SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
  return DAG.getMergeValues(RetOps, dl);
}
static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
                           SelectionDAG &DAG) {
  MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
  SDValue DataToStore = N->getValue();
  MVT VT = DataToStore.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();
  SDValue Mask = N->getMask();
  SDLoc dl(Op);

  assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
         "Compressing masked store is supported on AVX-512 target only!");

  assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
         "Compressing masked store is supported for 32 and 64-bit types only!");

  // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of
  // VLX.
  if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
    return Op;

  assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
         "Cannot lower masked store op.");

  assert((ScalarVT.getSizeInBits() >= 32 ||
          (Subtarget.hasBWI() &&
           (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
         "Unsupported masked store op.");

  // This operation is legal for targets with VLX, but without
  // VLX the vector should be widened to 512 bits.
  unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
  MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);

  // The mask element has to be i1.
  MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
  assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
         "We handle 4x32, 4x64 and 2x64 vectors only in this case");

  MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);

  DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
  Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
  if (MaskEltTy != MVT::i1)
    Mask = DAG.getNode(ISD::TRUNCATE, dl,
                       MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
  return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
                            Mask, N->getMemoryVT(), N->getMemOperand(),
                            N->isTruncatingStore(), N->isCompressingStore());
}
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
                            SelectionDAG &DAG) {
  assert(Subtarget.hasAVX512() &&
         "MGATHER/MSCATTER are supported on AVX-512 arch only");

  MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
  SDLoc dl(Op);
  MVT VT = Op.getSimpleValueType();
  SDValue Index = N->getIndex();
  SDValue Mask = N->getMask();
  SDValue Src0 = N->getValue();
  MVT IndexVT = Index.getSimpleValueType();
  MVT MaskVT = Mask.getSimpleValueType();

  unsigned NumElts = VT.getVectorNumElements();
  assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");

  if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
      !Index.getSimpleValueType().is512BitVector()) {
    // AVX512F supports only 512-bit vectors, so either the data or the index
    // must be 512 bits wide. If both index and data are 256-bit but the
    // vector contains 8 elements, we just sign-extend the index.
    if (NumElts == 8) {
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
      SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                        N->getOperand(3), Index };
      DAG.UpdateNodeOperands(N, Ops);
      return Op;
    }

    // The minimal number of elements in a gather is 8.
    NumElts = 8;
    // Index
    MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
    Index = ExtendToType(Index, NewIndexVT, DAG);
    if (IndexVT.getScalarType() == MVT::i32)
      Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);

    // Mask
    MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
    // At this point we have a promoted mask operand.
    assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
    MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
    Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
    Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);

    // The pass-through value.
    MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
    Src0 = ExtendToType(Src0, NewVT, DAG);

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
                                            N->getMemoryVT(), dl, Ops,
                                            N->getMemOperand());
    SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
                                  NewGather.getValue(0),
                                  DAG.getIntPtrConstant(0, dl));
    SDValue RetOps[] = {Extract, NewGather.getValue(1)};
    return DAG.getMergeValues(RetOps, dl);
  }
  if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
    // There is a special case when the return type v2i32 is illegal and
    // the type legalizer extended it to v2i64. Without this conversion we end
    // up with VPGATHERQQ (reading q-words from the memory) instead of
    // VPGATHERQD. In order to avoid this situation, we'll build an X86
    // specific Gather node with index v2i64 and value type v4i32.
    assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
           "Unexpected type in masked gather");
    Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
                                DAG.getBitcast(MVT::v4i32, Src0),
                                DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
    // The mask should match the destination type. Extending the mask with
    // zeroes is not necessary since the instruction itself reads only two
    // values from memory.
    Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
        N->getMemOperand());

    SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
                                  NewGather.getValue(0), DAG);
    SDValue RetOps[] = { Sext, NewGather.getValue(1) };
    return DAG.getMergeValues(RetOps, dl);
  }
  if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
    // This transformation is for optimization only.
    // The type legalizer extended the mask and the index to 4-element vectors
    // to match the requirements of the common gather node - the same vector
    // width of index and value. The X86 Gather node allows a mismatch of
    // vector widths in order to select a more optimal instruction at the
    // lowering phase.
    assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
           "Unexpected type in masked gather");
    if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
        ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
        Index.getOpcode() == ISD::CONCAT_VECTORS &&
        Index.getOperand(1).isUndef()) {
      Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
      Index = Index.getOperand(0);
    }

    SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
    SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
        DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
        N->getMemOperand());

    SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
    return DAG.getMergeValues(RetOps, dl);
  }

  return Op;
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
                                                    SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}
SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
                                                  SelectionDAG &DAG) const {
  // TODO: Eventually, the lowering of these nodes should be informed by or
  // deferred to the GC strategy for the function in which they appear. For
  // now, however, they must be lowered to something. Since they are logically
  // no-ops in the case of a null GC strategy (or a GC strategy which does not
  // require special handling for these nodes), lower them as literal NOOPs for
  // the time being.
  SmallVector<SDValue, 2> Ops;

  Ops.push_back(Op.getOperand(0));
  if (Op->getGluedNode())
    Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));

  SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
  SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);

  return NOOP;
}
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
  switch (Op.getOpcode()) {
  default: llvm_unreachable("Should not custom lower this!");
  case ISD::ATOMIC_FENCE:       return LowerATOMIC_FENCE(Op, Subtarget, DAG);
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
    return LowerCMP_SWAP(Op, Subtarget, DAG);
  case ISD::CTPOP:              return LowerCTPOP(Op, Subtarget, DAG);
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_AND:    return lowerAtomicArith(Op, DAG, Subtarget);
  case ISD::ATOMIC_STORE:       return LowerATOMIC_STORE(Op, DAG);
  case ISD::BITREVERSE:         return LowerBITREVERSE(Op, Subtarget, DAG);
  case ISD::BUILD_VECTOR:       return LowerBUILD_VECTOR(Op, DAG);
  case ISD::CONCAT_VECTORS:     return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
  case ISD::VECTOR_SHUFFLE:     return lowerVectorShuffle(Op, Subtarget, DAG);
  case ISD::VSELECT:            return LowerVSELECT(Op, DAG);
  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
  case ISD::INSERT_VECTOR_ELT:  return LowerINSERT_VECTOR_ELT(Op, DAG);
  case ISD::EXTRACT_SUBVECTOR:  return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
  case ISD::INSERT_SUBVECTOR:   return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
  case ISD::SCALAR_TO_VECTOR:   return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
  case ISD::ConstantPool:       return LowerConstantPool(Op, DAG);
  case ISD::GlobalAddress:      return LowerGlobalAddress(Op, DAG);
  case ISD::GlobalTLSAddress:   return LowerGlobalTLSAddress(Op, DAG);
  case ISD::ExternalSymbol:     return LowerExternalSymbol(Op, DAG);
  case ISD::BlockAddress:       return LowerBlockAddress(Op, DAG);
  case ISD::SHL_PARTS:
  case ISD::SRA_PARTS:
  case ISD::SRL_PARTS:          return LowerShiftParts(Op, DAG);
  case ISD::SINT_TO_FP:         return LowerSINT_TO_FP(Op, DAG);
  case ISD::UINT_TO_FP:         return LowerUINT_TO_FP(Op, DAG);
  case ISD::TRUNCATE:           return LowerTRUNCATE(Op, DAG);
  case ISD::ZERO_EXTEND:        return LowerZERO_EXTEND(Op, Subtarget, DAG);
  case ISD::SIGN_EXTEND:        return LowerSIGN_EXTEND(Op, Subtarget, DAG);
  case ISD::ANY_EXTEND:         return LowerANY_EXTEND(Op, Subtarget, DAG);
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case ISD::SIGN_EXTEND_VECTOR_INREG:
    return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
  case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
  case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
  case ISD::FABS:
  case ISD::FNEG:               return LowerFABSorFNEG(Op, DAG);
  case ISD::FCOPYSIGN:          return LowerFCOPYSIGN(Op, DAG);
  case ISD::FGETSIGN:           return LowerFGETSIGN(Op, DAG);
  case ISD::SETCC:              return LowerSETCC(Op, DAG);
  case ISD::SETCCCARRY:         return LowerSETCCCARRY(Op, DAG);
  case ISD::SELECT:             return LowerSELECT(Op, DAG);
  case ISD::BRCOND:             return LowerBRCOND(Op, DAG);
  case ISD::JumpTable:          return LowerJumpTable(Op, DAG);
  case ISD::VASTART:            return LowerVASTART(Op, DAG);
  case ISD::VAARG:              return LowerVAARG(Op, DAG);
  case ISD::VACOPY:             return LowerVACOPY(Op, Subtarget, DAG);
  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
  case ISD::INTRINSIC_VOID:
  case ISD::INTRINSIC_W_CHAIN:  return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
  case ISD::RETURNADDR:         return LowerRETURNADDR(Op, DAG);
  case ISD::ADDROFRETURNADDR:   return LowerADDROFRETURNADDR(Op, DAG);
  case ISD::FRAMEADDR:          return LowerFRAMEADDR(Op, DAG);
  case ISD::FRAME_TO_ARGS_OFFSET:
    return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
  case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
  case ISD::EH_RETURN:          return LowerEH_RETURN(Op, DAG);
  case ISD::EH_SJLJ_SETJMP:     return lowerEH_SJLJ_SETJMP(Op, DAG);
  case ISD::EH_SJLJ_LONGJMP:    return lowerEH_SJLJ_LONGJMP(Op, DAG);
  case ISD::EH_SJLJ_SETUP_DISPATCH:
    return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
  case ISD::INIT_TRAMPOLINE:    return LowerINIT_TRAMPOLINE(Op, DAG);
  case ISD::ADJUST_TRAMPOLINE:  return LowerADJUST_TRAMPOLINE(Op, DAG);
  case ISD::FLT_ROUNDS_:        return LowerFLT_ROUNDS_(Op, DAG);
  case ISD::CTLZ:
  case ISD::CTLZ_ZERO_UNDEF:    return LowerCTLZ(Op, Subtarget, DAG);
  case ISD::CTTZ:
  case ISD::CTTZ_ZERO_UNDEF:    return LowerCTTZ(Op, DAG);
  case ISD::MUL:                return LowerMUL(Op, Subtarget, DAG);
  case ISD::MULHS:
  case ISD::MULHU:              return LowerMULH(Op, Subtarget, DAG);
  case ISD::UMUL_LOHI:
  case ISD::SMUL_LOHI:          return LowerMUL_LOHI(Op, Subtarget, DAG);
  case ISD::ROTL:
  case ISD::ROTR:               return LowerRotate(Op, Subtarget, DAG);
  case ISD::SRA:
  case ISD::SRL:
  case ISD::SHL:                return LowerShift(Op, Subtarget, DAG);
  case ISD::SADDO:
  case ISD::UADDO:
  case ISD::SSUBO:
  case ISD::USUBO:
  case ISD::SMULO:
  case ISD::UMULO:              return LowerXALUO(Op, DAG);
  case ISD::READCYCLECOUNTER:   return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
  case ISD::BITCAST:            return LowerBITCAST(Op, Subtarget, DAG);
  case ISD::ADDCARRY:
  case ISD::SUBCARRY:           return LowerADDSUBCARRY(Op, DAG);
  case ISD::ADD:
  case ISD::SUB:                return LowerADD_SUB(Op, DAG);
  case ISD::SMAX:
  case ISD::SMIN:
  case ISD::UMAX:
  case ISD::UMIN:               return LowerMINMAX(Op, DAG);
  case ISD::ABS:                return LowerABS(Op, DAG);
  case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
  case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
  case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
  case ISD::MGATHER:            return LowerMGATHER(Op, Subtarget, DAG);
  case ISD::MSCATTER:           return LowerMSCATTER(Op, Subtarget, DAG);
  case ISD::GC_TRANSITION_START:
    return LowerGC_TRANSITION_START(Op, DAG);
  case ISD::GC_TRANSITION_END:  return LowerGC_TRANSITION_END(Op, DAG);
  case ISD::STORE:              return LowerTruncatingStore(Op, Subtarget, DAG);
  }
}
/// Places new result values for the node in Results (their number
/// and types must exactly match those of the original return values of
/// the node), or leaves Results empty, which indicates that the node is not
/// to be custom lowered after all.
void X86TargetLowering::LowerOperationWrapper(SDNode *N,
                                              SmallVectorImpl<SDValue> &Results,
                                              SelectionDAG &DAG) const {
  SDValue Res = LowerOperation(SDValue(N, 0), DAG);

  if (!Res.getNode())
    return;

  assert((N->getNumValues() <= Res->getNumValues()) &&
         "Lowering returned the wrong number of results!");

  // Place new result values based on the N result number.
  // In some cases (e.g. LowerSINT_TO_FP) Res has more result values than the
  // original node; the extra chain (the last value) should be dropped.
  for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
    Results.push_back(Res.getValue(I));
}
/// Replace a node with an illegal result type with a new node built out of
/// custom code.
void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                           SmallVectorImpl<SDValue> &Results,
                                           SelectionDAG &DAG) const {
  SDLoc dl(N);
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  switch (N->getOpcode()) {
  default:
    llvm_unreachable("Do not know how to custom type legalize this operation!");
  case X86ISD::AVG: {
    // Legalize types for X86ISD::AVG by expanding vectors.
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");

    auto InVT = N->getValueType(0);
    auto InVTSize = InVT.getSizeInBits();
    const unsigned RegSize =
        (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
    assert((Subtarget.hasBWI() || RegSize < 512) &&
           "512-bit vector requires AVX512BW");
    assert((Subtarget.hasAVX2() || RegSize < 256) &&
           "256-bit vector requires AVX2");

    auto ElemVT = InVT.getVectorElementType();
    auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
                                  RegSize / ElemVT.getSizeInBits());
    assert(RegSize % InVT.getSizeInBits() == 0);
    unsigned NumConcat = RegSize / InVT.getSizeInBits();

    SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
    Ops[0] = N->getOperand(0);
    SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
    Ops[0] = N->getOperand(1);
    SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);

    SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
    Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
                                  DAG.getIntPtrConstant(0, dl)));
    return;
  }
  // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
  case X86ISD::FMINC:
  case X86ISD::FMIN:
  case X86ISD::FMAXC:
  case X86ISD::FMAX: {
    EVT VT = N->getValueType(0);
    assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
    SDValue UNDEF = DAG.getUNDEF(VT);
    SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(0), UNDEF);
    SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
                              N->getOperand(1), UNDEF);
    Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
    return;
  }
  case ISD::SDIV:
  case ISD::UDIV:
  case ISD::SREM:
  case ISD::UREM:
  case ISD::SDIVREM:
  case ISD::UDIVREM: {
    SDValue V = LowerWin64_i128OP(SDValue(N, 0), DAG);
    Results.push_back(V);
    return;
  }
  case ISD::FP_TO_SINT:
  case ISD::FP_TO_UINT: {
    bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;

    if (N->getValueType(0) == MVT::v2i32) {
      assert((IsSigned || Subtarget.hasAVX512()) &&
             "Can only handle signed conversion without AVX512");
      assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
      SDValue Src = N->getOperand(0);
      if (Src.getValueType() == MVT::v2f64) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
                                           : X86ISD::CVTTP2UI,
                                  dl, MVT::v4i32, Src);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }
      if (Src.getValueType() == MVT::v2f32) {
        SDValue Idx = DAG.getIntPtrConstant(0, dl);
        SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                  DAG.getUNDEF(MVT::v2f32));
        Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
                                   : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
        Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
        Results.push_back(Res);
        return;
      }

      // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
      // so early out here.
      return;
    }

    std::pair<SDValue,SDValue> Vals =
        FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
    SDValue FIST = Vals.first, StackSlot = Vals.second;
    if (FIST.getNode()) {
      EVT VT = N->getValueType(0);
      // Return a load from the stack slot.
      if (StackSlot.getNode())
        Results.push_back(
            DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
      else
        Results.push_back(FIST);
    }
    return;
  }
  case ISD::SINT_TO_FP: {
    assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
    SDValue Src = N->getOperand(0);
    if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
      return;
    Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
    return;
  }
  case ISD::UINT_TO_FP: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT VT = N->getValueType(0);
    if (VT != MVT::v2f32)
      return;
    SDValue Src = N->getOperand(0);
    EVT SrcVT = Src.getValueType();
    if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
      Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
      return;
    }
    if (SrcVT != MVT::v2i32)
      return;
    SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
    SDValue VBias =
        DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
    SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
                             DAG.getBitcast(MVT::v2i64, VBias));
    Or = DAG.getBitcast(MVT::v2f64, Or);
    // TODO: Are there any fast-math-flags to propagate here?
    SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
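    // (0x4330000000000000 is the double 2^52. OR-ing a zero-extended 32-bit
    // integer into the mantissa of 2^52 produces the exact double 2^52 + x,
    // so the FSUB of 2^52 recovers x converted to double.)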
24280 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
  case ISD::FP_ROUND: {
    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
      return;
    SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
    Results.push_back(V);
    return;
  }
  case ISD::FP_EXTEND: {
    // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
    // No other ValueType for FP_EXTEND should reach this point.
    assert(N->getValueType(0) == MVT::v2f32 &&
           "Do not know how to legalize this Node");
    return;
  }
  case ISD::INTRINSIC_W_CHAIN: {
    unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
    switch (IntNo) {
    default: llvm_unreachable("Do not know how to custom type "
                              "legalize this intrinsic operation!");
    case Intrinsic::x86_rdtsc:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdtscp:
      return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
                                     Results);
    case Intrinsic::x86_rdpmc:
      return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);

    case Intrinsic::x86_xgetbv:
      return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
    }
  }
  case ISD::INTRINSIC_WO_CHAIN: {
    if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
      Results.push_back(V);
    return;
  }
  case ISD::READCYCLECOUNTER: {
    return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
                                   Results);
  }
  case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
    EVT T = N->getValueType(0);
    assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
    bool Regs64bit = T == MVT::i128;
    MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
    SDValue cpInL, cpInH;
    cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(0, dl, HalfT));
    cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
                        DAG.getConstant(1, dl, HalfT));
    cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
                             Regs64bit ? X86::RAX : X86::EAX,
                             cpInL, SDValue());
    cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
                             Regs64bit ? X86::RDX : X86::EDX,
                             cpInH, cpInL.getValue(1));
    SDValue swapInL, swapInH;
    swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(0, dl, HalfT));
    swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
                          DAG.getConstant(1, dl, HalfT));
    swapInH =
        DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
                         swapInH, cpInH.getValue(1));
    // If the current function needs the base pointer, RBX,
    // we shouldn't use cmpxchg directly.
    // Indeed the lowering of that instruction will clobber
    // that register and since RBX will be a reserved register
    // the register allocator will not make sure its value will
    // be properly saved and restored around this live-range.
    const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
    SDValue Result;
    SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
    unsigned BasePtr = TRI->getBaseRegister();
    MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
    if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
        (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
      // ISel prefers the LCMPXCHG64 variant.
      // If that assert breaks, that means it is not the case anymore,
      // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
      // not just EBX. This is a matter of accepting i64 input for that
      // pseudo, and restoring into the register of the right width
      // in the expand pseudo. Everything else should just work.
      assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
             "Saving only half of the RBX");
      unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
                                  : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
      SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
                                           Regs64bit ? X86::RBX : X86::EBX,
                                           HalfT, swapInH.getValue(1));
      SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
                       RBXSave,
                       /*Glue*/ RBXSave.getValue(2)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    } else {
      unsigned Opcode =
          Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
      swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
                                 Regs64bit ? X86::RBX : X86::EBX, swapInL,
                                 swapInH.getValue(1));
      SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
                       swapInL.getValue(1)};
      Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
    }
    SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
                                        Regs64bit ? X86::RAX : X86::EAX,
                                        HalfT, Result.getValue(1));
    SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
                                        Regs64bit ? X86::RDX : X86::EDX,
                                        HalfT, cpOutL.getValue(2));
    SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};

    SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
                                        MVT::i32, cpOutH.getValue(2));
    SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
    Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));

    Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
    Results.push_back(Success);
    Results.push_back(EFLAGS.getValue(1));
    return;
  }
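  // For reference, the register pinning above follows the hardware contract
  // of CMPXCHG8B/CMPXCHG16B: the expected value is taken from EDX:EAX
  // (RDX:RAX), the replacement from ECX:EBX (RCX:RBX), and on success ZF is
  // set, which is why the halves are routed through CopyToReg/CopyFromReg and
  // success is materialized with a SETCC on COND_E.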
  case ISD::ATOMIC_SWAP:
  case ISD::ATOMIC_LOAD_ADD:
  case ISD::ATOMIC_LOAD_SUB:
  case ISD::ATOMIC_LOAD_AND:
  case ISD::ATOMIC_LOAD_OR:
  case ISD::ATOMIC_LOAD_XOR:
  case ISD::ATOMIC_LOAD_NAND:
  case ISD::ATOMIC_LOAD_MIN:
  case ISD::ATOMIC_LOAD_MAX:
  case ISD::ATOMIC_LOAD_UMIN:
  case ISD::ATOMIC_LOAD_UMAX:
  case ISD::ATOMIC_LOAD: {
    // Delegate to generic TypeLegalization. Situations we can really handle
    // should have already been dealt with by AtomicExpandPass.cpp.
    break;
  }
  case ISD::BITCAST: {
    assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
    EVT DstVT = N->getValueType(0);
    EVT SrcVT = N->getOperand(0)->getValueType(0);

    if (SrcVT != MVT::f64 ||
        (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
      return;

    unsigned NumElts = DstVT.getVectorNumElements();
    EVT SVT = DstVT.getVectorElementType();
    EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
    SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
                                   MVT::v2f64, N->getOperand(0));
    SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);

    if (ExperimentalVectorWideningLegalization) {
      // If we are legalizing vectors by widening, we already have the desired
      // legal vector type, just return it.
      Results.push_back(ToVecInt);
      return;
    }

    SmallVector<SDValue, 8> Elts;
    for (unsigned i = 0, e = NumElts; i != e; ++i)
      Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
                                 ToVecInt, DAG.getIntPtrConstant(i, dl)));

    Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
  }
  }
}
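// Illustrative trace of the BITCAST path above: (v2i32)bitcast(f64) becomes a
// SCALAR_TO_VECTOR into lane 0 of a v2f64, a bitcast to the wider legal type
// v4i32, and then (without widening legalization) extracts of lanes 0 and 1
// rebuilt into the requested v2i32 BUILD_VECTOR.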
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
  switch ((X86ISD::NodeType)Opcode) {
  case X86ISD::FIRST_NUMBER: break;
  case X86ISD::BSF: return "X86ISD::BSF";
  case X86ISD::BSR: return "X86ISD::BSR";
  case X86ISD::SHLD: return "X86ISD::SHLD";
  case X86ISD::SHRD: return "X86ISD::SHRD";
  case X86ISD::FAND: return "X86ISD::FAND";
  case X86ISD::FANDN: return "X86ISD::FANDN";
  case X86ISD::FOR: return "X86ISD::FOR";
  case X86ISD::FXOR: return "X86ISD::FXOR";
  case X86ISD::FILD: return "X86ISD::FILD";
  case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
  case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
  case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
  case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
  case X86ISD::FLD: return "X86ISD::FLD";
  case X86ISD::FST: return "X86ISD::FST";
  case X86ISD::CALL: return "X86ISD::CALL";
  case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
  case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
  case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
  case X86ISD::BT: return "X86ISD::BT";
  case X86ISD::CMP: return "X86ISD::CMP";
  case X86ISD::COMI: return "X86ISD::COMI";
  case X86ISD::UCOMI: return "X86ISD::UCOMI";
  case X86ISD::CMPM: return "X86ISD::CMPM";
  case X86ISD::CMPMU: return "X86ISD::CMPMU";
  case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
  case X86ISD::SETCC: return "X86ISD::SETCC";
  case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
  case X86ISD::FSETCC: return "X86ISD::FSETCC";
  case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
  case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
  case X86ISD::CMOV: return "X86ISD::CMOV";
  case X86ISD::BRCOND: return "X86ISD::BRCOND";
  case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
  case X86ISD::IRET: return "X86ISD::IRET";
  case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
  case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
  case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
  case X86ISD::Wrapper: return "X86ISD::Wrapper";
  case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
  case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
  case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
  case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
  case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
  case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
  case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
  case X86ISD::PINSRB: return "X86ISD::PINSRB";
  case X86ISD::PINSRW: return "X86ISD::PINSRW";
  case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
  case X86ISD::ANDNP: return "X86ISD::ANDNP";
  case X86ISD::BLENDI: return "X86ISD::BLENDI";
  case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
  case X86ISD::ADDUS: return "X86ISD::ADDUS";
  case X86ISD::SUBUS: return "X86ISD::SUBUS";
  case X86ISD::HADD: return "X86ISD::HADD";
  case X86ISD::HSUB: return "X86ISD::HSUB";
  case X86ISD::FHADD: return "X86ISD::FHADD";
  case X86ISD::FHSUB: return "X86ISD::FHSUB";
  case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
  case X86ISD::FMAX: return "X86ISD::FMAX";
  case X86ISD::FMAXS: return "X86ISD::FMAXS";
  case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
  case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
  case X86ISD::FMIN: return "X86ISD::FMIN";
  case X86ISD::FMINS: return "X86ISD::FMINS";
  case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
  case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
  case X86ISD::FMAXC: return "X86ISD::FMAXC";
  case X86ISD::FMINC: return "X86ISD::FMINC";
  case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
  case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
  case X86ISD::FRCP: return "X86ISD::FRCP";
  case X86ISD::FRCPS: return "X86ISD::FRCPS";
  case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
  case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
  case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
  case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
  case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
  case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
  case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
  case X86ISD::EH_SJLJ_SETUP_DISPATCH:
    return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
  case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
  case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
  case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
  case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
  case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
  case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
  case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
  case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
    return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
  case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
    return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
  case X86ISD::LADD: return "X86ISD::LADD";
  case X86ISD::LSUB: return "X86ISD::LSUB";
  case X86ISD::LOR: return "X86ISD::LOR";
  case X86ISD::LXOR: return "X86ISD::LXOR";
  case X86ISD::LAND: return "X86ISD::LAND";
  case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
  case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
  case X86ISD::VZEXT: return "X86ISD::VZEXT";
  case X86ISD::VSEXT: return "X86ISD::VSEXT";
  case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
  case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
  case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
  case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
  case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
  case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
  case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
  case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
  case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
  case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
  case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
  case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
  case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
  case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
  case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
  case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
  case X86ISD::VSHL: return "X86ISD::VSHL";
  case X86ISD::VSRL: return "X86ISD::VSRL";
  case X86ISD::VSRA: return "X86ISD::VSRA";
  case X86ISD::VSHLI: return "X86ISD::VSHLI";
  case X86ISD::VSRLI: return "X86ISD::VSRLI";
  case X86ISD::VSRAI: return "X86ISD::VSRAI";
  case X86ISD::VSRAV: return "X86ISD::VSRAV";
  case X86ISD::VROTLI: return "X86ISD::VROTLI";
  case X86ISD::VROTRI: return "X86ISD::VROTRI";
  case X86ISD::VPPERM: return "X86ISD::VPPERM";
  case X86ISD::CMPP: return "X86ISD::CMPP";
  case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
  case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
  case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
  case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
  case X86ISD::ADD: return "X86ISD::ADD";
  case X86ISD::SUB: return "X86ISD::SUB";
  case X86ISD::ADC: return "X86ISD::ADC";
  case X86ISD::SBB: return "X86ISD::SBB";
  case X86ISD::SMUL: return "X86ISD::SMUL";
  case X86ISD::UMUL: return "X86ISD::UMUL";
  case X86ISD::SMUL8: return "X86ISD::SMUL8";
  case X86ISD::UMUL8: return "X86ISD::UMUL8";
  case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
  case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
  case X86ISD::INC: return "X86ISD::INC";
  case X86ISD::DEC: return "X86ISD::DEC";
  case X86ISD::OR: return "X86ISD::OR";
  case X86ISD::XOR: return "X86ISD::XOR";
  case X86ISD::AND: return "X86ISD::AND";
  case X86ISD::BEXTR: return "X86ISD::BEXTR";
  case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
  case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
  case X86ISD::PTEST: return "X86ISD::PTEST";
  case X86ISD::TESTP: return "X86ISD::TESTP";
  case X86ISD::TESTM: return "X86ISD::TESTM";
  case X86ISD::TESTNM: return "X86ISD::TESTNM";
  case X86ISD::KORTEST: return "X86ISD::KORTEST";
  case X86ISD::KTEST: return "X86ISD::KTEST";
  case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
  case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
  case X86ISD::PACKSS: return "X86ISD::PACKSS";
  case X86ISD::PACKUS: return "X86ISD::PACKUS";
  case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
  case X86ISD::VALIGN: return "X86ISD::VALIGN";
  case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
  case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
  case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
  case X86ISD::SHUFP: return "X86ISD::SHUFP";
  case X86ISD::SHUF128: return "X86ISD::SHUF128";
  case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
  case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
  case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
  case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
  case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
  case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
  case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
  case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
  case X86ISD::MOVSD: return "X86ISD::MOVSD";
  case X86ISD::MOVSS: return "X86ISD::MOVSS";
  case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
  case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
  case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
  case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
  case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
  case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
  case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
  case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
  case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
  case X86ISD::VPERMV: return "X86ISD::VPERMV";
  case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
  case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
  case X86ISD::VPERMI: return "X86ISD::VPERMI";
  case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
  case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
  case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
  case X86ISD::VRANGE: return "X86ISD::VRANGE";
  case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
  case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
  case X86ISD::PSADBW: return "X86ISD::PSADBW";
  case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
  case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
  case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
  case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
  case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
  case X86ISD::MFENCE: return "X86ISD::MFENCE";
  case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
  case X86ISD::SAHF: return "X86ISD::SAHF";
  case X86ISD::RDRAND: return "X86ISD::RDRAND";
  case X86ISD::RDSEED: return "X86ISD::RDSEED";
  case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
  case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
  case X86ISD::VPROT: return "X86ISD::VPROT";
  case X86ISD::VPROTI: return "X86ISD::VPROTI";
  case X86ISD::VPSHA: return "X86ISD::VPSHA";
  case X86ISD::VPSHL: return "X86ISD::VPSHL";
  case X86ISD::VPCOM: return "X86ISD::VPCOM";
  case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
  case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
  case X86ISD::FMADD: return "X86ISD::FMADD";
  case X86ISD::FMSUB: return "X86ISD::FMSUB";
  case X86ISD::FNMADD: return "X86ISD::FNMADD";
  case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
  case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
  case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
  case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
  case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
  case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
  case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
  case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
  case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
  case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
  case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
  case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
  case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
  case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
  case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
  case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
  case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
  case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
  case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
  case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
  case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
  case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
  case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
  case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
  case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
  case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
  case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
  case X86ISD::XTEST: return "X86ISD::XTEST";
  case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
  case X86ISD::EXPAND: return "X86ISD::EXPAND";
  case X86ISD::SELECT: return "X86ISD::SELECT";
  case X86ISD::SELECTS: return "X86ISD::SELECTS";
  case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
  case X86ISD::RCP28: return "X86ISD::RCP28";
  case X86ISD::RCP28S: return "X86ISD::RCP28S";
  case X86ISD::EXP2: return "X86ISD::EXP2";
  case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
  case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
  case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
  case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
  case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
  case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
  case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
  case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
  case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
  case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
  case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
  case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
  case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
  case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
  case X86ISD::SCALEF: return "X86ISD::SCALEF";
  case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
  case X86ISD::ADDS: return "X86ISD::ADDS";
  case X86ISD::SUBS: return "X86ISD::SUBS";
  case X86ISD::AVG: return "X86ISD::AVG";
  case X86ISD::MULHRS: return "X86ISD::MULHRS";
  case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
  case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
  case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
  case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
  case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
  case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
  case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
  case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
  case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
  case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
  case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
  case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
  case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
  case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
  case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
  case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
  case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
  case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
  case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
  case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
  case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
  case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
  case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
  case X86ISD::LWPINS: return "X86ISD::LWPINS";
  case X86ISD::MGATHER: return "X86ISD::MGATHER";
  }
  return nullptr;
}
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
                                              const AddrMode &AM, Type *Ty,
                                              unsigned AS) const {
  // X86 supports extremely general addressing modes.
  CodeModel::Model M = getTargetMachine().getCodeModel();

  // X86 allows a sign-extended 32-bit immediate field as a displacement.
  if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
    return false;

  if (AM.BaseGV) {
    unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);

    // If a reference to this global requires an extra load, we can't fold it.
    if (isGlobalStubReference(GVFlags))
      return false;

    // If BaseGV requires a register for the PIC base, we cannot also have a
    // BaseReg specified.
    if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
      return false;

    // If lower 4G is not available, then we must use rip-relative addressing.
    if ((M != CodeModel::Small || isPositionIndependent()) &&
        Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
      return false;
  }

  switch (AM.Scale) {
  case 0:
  case 1:
  case 2:
  case 4:
  case 8:
    // These scales always work.
    break;
  case 3:
  case 5:
  case 9:
    // These scales are formed with basereg+scalereg. Only accept if there is
    // no basereg yet.
    if (AM.HasBaseReg)
      return false;
    break;
  default: // Other stuff never works.
    return false;
  }

  return true;
}
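// Examples of addressing modes accepted above (illustrative asm only):
//   movl 16(%rdi,%rcx,4), %eax  ; base + index*scale + disp, scale in {1,2,4,8}
//   leaq (%rax,%rax,2), %rdx    ; scale 3 folded as basereg + scalereg*2
// Scales of 3, 5 and 9 are only representable by reusing the scaled register
// as the base register, which is why they are rejected once AM.HasBaseReg is
// already set.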
bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
  unsigned Bits = Ty->getScalarSizeInBits();

  // 8-bit shifts are always expensive, but versions with a scalar amount
  // aren't particularly cheaper than those without.
  if (Bits == 8)
    return false;

  // On AVX2 there are new vpsllv[dq] instructions (and other shifts) that
  // make variable shifts just as cheap as scalar ones.
  if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
    return false;

  // Otherwise, it's significantly cheaper to shift by a scalar amount than by
  // a fully general vector.
  return true;
}
bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;
  unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
  unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
  return NumBits1 > NumBits2;
}

bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
  if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
    return false;

  if (!isTypeLegal(EVT::getEVT(Ty1)))
    return false;

  assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");

  // Assuming the caller doesn't have a zeroext or signext return parameter,
  // truncation all the way down to i1 is valid.
  return true;
}

bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
  return isInt<32>(Imm);
}

bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
  // Can also use sub to handle negated immediates.
  return isInt<32>(Imm);
}

bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
  if (!VT1.isInteger() || !VT2.isInteger())
    return false;
  unsigned NumBits1 = VT1.getSizeInBits();
  unsigned NumBits2 = VT2.getSizeInBits();
  return NumBits1 > NumBits2;
}
bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
}

bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
  // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
  return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
}
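// Example of the implicit zero-extension relied upon above: any 32-bit write,
// e.g. "movl %ecx, %eax", clears bits 63:32 of RAX, so a subsequent
// i32 -> i64 zext costs no extra instruction.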
bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
  EVT VT1 = Val.getValueType();
  if (isZExtFree(VT1, VT2))
    return true;

  if (Val.getOpcode() != ISD::LOAD)
    return false;

  if (!VT1.isSimple() || !VT1.isInteger() ||
      !VT2.isSimple() || !VT2.isInteger())
    return false;

  switch (VT1.getSimpleVT().SimpleTy) {
  default: break;
  case MVT::i8:
  case MVT::i16:
  case MVT::i32:
    // X86 has 8, 16, and 32-bit zero-extending loads.
    return true;
  }

  return false;
}

bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }

bool
X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
  if (!Subtarget.hasAnyFMA())
    return false;

  VT = VT.getScalarType();

  if (!VT.isSimple())
    return false;

  switch (VT.getSimpleVT().SimpleTy) {
  case MVT::f32:
  case MVT::f64:
    return true;
  default:
    break;
  }

  return false;
}
bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
  // i16 instructions are longer (0x66 prefix) and potentially slower.
  return !(VT1 == MVT::i32 && VT2 == MVT::i16);
}
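// Example: "addw $1234, %ax" carries a 0x66 operand-size prefix that the
// 32-bit form does not, and such length-changing prefixes can stall the
// decoders on a number of Intel cores, which is why narrowing i32 ops to i16
// is reported as unprofitable here.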
/// Targets can use this to indicate that they only support *some*
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
bool
X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
                                      EVT VT) const {
  if (!VT.isSimple())
    return false;

  // Not for i1 vectors
  if (VT.getSimpleVT().getScalarType() == MVT::i1)
    return false;

  // Very little shuffling can be done for 64-bit vectors right now.
  if (VT.getSimpleVT().getSizeInBits() == 64)
    return false;

  // We only care that the types being shuffled are legal. The lowering can
  // handle any possible shuffle mask that results.
  return isTypeLegal(VT.getSimpleVT());
}

bool
X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
                                          EVT VT) const {
  // Just delegate to the generic legality, clear masks aren't special.
  return isShuffleMaskLegal(Mask, VT);
}
//===----------------------------------------------------------------------===//
//                           X86 Scheduler Hooks
//===----------------------------------------------------------------------===//

/// Utility function to emit xbegin specifying the start of an RTM region.
static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
                                     const TargetInstrInfo *TII) {
  DebugLoc DL = MI.getDebugLoc();

  const BasicBlock *BB = MBB->getBasicBlock();
  MachineFunction::iterator I = ++MBB->getIterator();

  // For the v = xbegin(), we generate
  //
  // thisMBB:
  //  xbegin fallMBB
  //
  // mainMBB:
  //  s0 = -1
  //
  // fallMBB:
  //  eax = # XABORT_DEF
  //  s1 = eax
  //
  // sinkMBB:
  //  v = phi(s0/mainBB, s1/fallBB)

  MachineBasicBlock *thisMBB = MBB;
  MachineFunction *MF = MBB->getParent();
  MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
  MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
  MF->insert(I, mainMBB);
  MF->insert(I, fallMBB);
  MF->insert(I, sinkMBB);

  // Transfer the remainder of BB and its successor edges to sinkMBB.
  sinkMBB->splice(sinkMBB->begin(), MBB,
                  std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);

  MachineRegisterInfo &MRI = MF->getRegInfo();
  unsigned DstReg = MI.getOperand(0).getReg();
  const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
  unsigned mainDstReg = MRI.createVirtualRegister(RC);
  unsigned fallDstReg = MRI.createVirtualRegister(RC);

  // thisMBB:
  //  xbegin fallMBB
  //  # fallthrough to mainMBB
  //  # abort lands in fallMBB
  BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
  thisMBB->addSuccessor(mainMBB);
  thisMBB->addSuccessor(fallMBB);

  // mainMBB:
  //  mainDstReg := -1
  BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
  BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
  mainMBB->addSuccessor(sinkMBB);

  // fallMBB:
  //  ; pseudo instruction to model hardware's definition from XABORT
  //  EAX := XABORT_DEF
  //  fallDstReg := EAX
  BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
  BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
      .addReg(X86::EAX);
  fallMBB->addSuccessor(sinkMBB);

  // sinkMBB:
  //  DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
  BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
      .addReg(mainDstReg).addMBB(mainMBB)
      .addReg(fallDstReg).addMBB(fallMBB);

  MI.eraseFromParent();
  return sinkMBB;
}
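// Usage note: the block structure above mirrors the _xbegin() intrinsic
// contract. A successfully started transaction leaves -1 (_XBEGIN_STARTED)
// in the result via mainMBB; on an abort the hardware deposits the abort
// status in EAX, which is modeled by the XABORT_DEF pseudo before being
// copied into fallDstReg.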
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
  case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
  case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
  case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
  case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
  case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
  case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
  case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
  }

  DebugLoc dl = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI.getNumOperands();
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
  if (MI.hasOneMemOperand())
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::XMM0);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
// FIXME: Custom handling because TableGen doesn't support multiple implicit
// defs in an instruction pattern
static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
                                       const TargetInstrInfo *TII) {
  unsigned Opc;
  switch (MI.getOpcode()) {
  default: llvm_unreachable("illegal opcode!");
  case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
  case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
  case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
  case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
  case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
  case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
  case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
  case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
  }

  DebugLoc dl = MI.getDebugLoc();
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));

  unsigned NumArgs = MI.getNumOperands(); // remove the results
  for (unsigned i = 1; i < NumArgs; ++i) {
    MachineOperand &Op = MI.getOperand(i);
    if (!(Op.isReg() && Op.isImplicit()))
      MIB.add(Op);
  }
  if (MI.hasOneMemOperand())
    MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());

  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::ECX);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // insert input VAL into EAX
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
      .addReg(MI.getOperand(0).getReg());
  // insert zero to ECX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

  // insert zero to EDX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);

  // insert WRPKRU instruction
  BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
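// Architectural note on the zeroing above: WRPKRU loads PKRU from EAX and
// raises #GP unless both ECX and EDX are zero, hence the two MOV32r0
// instructions emitted ahead of it.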
static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();

  // insert zero to ECX
  BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);

  // insert RDPKRU instruction
  BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
      .addReg(X86::EAX);

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
                                      const X86Subtarget &Subtarget,
                                      unsigned Opc) {
  DebugLoc dl = MI.getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX, other two args into ECX, EDX.
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI.getOperand(i));

  unsigned ValOps = X86::AddrNumOperands;
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
      .addReg(MI.getOperand(ValOps).getReg());
  BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
      .addReg(MI.getOperand(ValOps + 1).getReg());

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(Opc));

  MI.eraseFromParent(); // The pseudo is gone now.
  return BB;
}
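// Register contract satisfied above (for reference): MONITOR expects the
// linear address in RAX/EAX, extension flags in ECX and hints in EDX, so the
// pseudo's operands are simply pinned to those physical registers before the
// real instruction is emitted.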
static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
                                     const X86Subtarget &Subtarget) {
  DebugLoc dl = MI->getDebugLoc();
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  // Address into RAX/EAX
  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
  for (int i = 0; i < X86::AddrNumOperands; ++i)
    MIB.add(MI->getOperand(i));

  // The instruction doesn't actually take any operands though.
  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));

  MI->eraseFromParent(); // The pseudo is gone now.
  return BB;
}
MachineBasicBlock *
X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                 MachineBasicBlock *MBB) const {
  // Emit va_arg instruction on X86-64.

  // Operands to this pseudo-instruction:
  // 0  ) Output        : destination address (reg)
  // 1-5) Input         : va_list address (addr, i64mem)
  // 6  ) ArgSize       : Size (in bytes) of vararg type
  // 7  ) ArgMode       : 0=overflow only, 1=use gp_offset, 2=use fp_offset
  // 8  ) Align         : Alignment of type
  // 9  ) EFLAGS (implicit-def)

  assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
  static_assert(X86::AddrNumOperands == 5,
                "VAARG_64 assumes 5 address operands");

  unsigned DestReg = MI.getOperand(0).getReg();
  MachineOperand &Base = MI.getOperand(1);
  MachineOperand &Scale = MI.getOperand(2);
  MachineOperand &Index = MI.getOperand(3);
  MachineOperand &Disp = MI.getOperand(4);
  MachineOperand &Segment = MI.getOperand(5);
  unsigned ArgSize = MI.getOperand(6).getImm();
  unsigned ArgMode = MI.getOperand(7).getImm();
  unsigned Align = MI.getOperand(8).getImm();

  // Memory Reference
  assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
  MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
  MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();

  // Machine Information
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
  const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
  const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
  DebugLoc DL = MI.getDebugLoc();

  // struct va_list {
  //   i32   gp_offset
  //   i32   fp_offset
  //   i64   overflow_area (address)
  //   i64   reg_save_area (address)
  // }
  // sizeof(va_list) = 24
  // alignment(va_list) = 8

  unsigned TotalNumIntRegs = 6;
  unsigned TotalNumXMMRegs = 8;
  bool UseGPOffset = (ArgMode == 1);
  bool UseFPOffset = (ArgMode == 2);
  unsigned MaxOffset = TotalNumIntRegs * 8 +
                       (UseFPOffset ? TotalNumXMMRegs * 16 : 0);

  /* Align ArgSize to a multiple of 8 */
  unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
  bool NeedsAlign = (Align > 8);

  MachineBasicBlock *thisMBB = MBB;
  MachineBasicBlock *overflowMBB;
  MachineBasicBlock *offsetMBB;
  MachineBasicBlock *endMBB;

  unsigned OffsetDestReg = 0;   // Argument address computed by offsetMBB
  unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
  unsigned OffsetReg = 0;

  if (!UseGPOffset && !UseFPOffset) {
    // If we only pull from the overflow region, we don't create a branch.
    // We don't need to alter control flow.
    OffsetDestReg = 0; // unused
    OverflowDestReg = DestReg;

    offsetMBB = nullptr;
    overflowMBB = thisMBB;
    endMBB = thisMBB;
  } else {
    // First emit code to check if gp_offset (or fp_offset) is below the bound.
    // If so, pull the argument from reg_save_area. (branch to offsetMBB)
    // If not, pull from overflow_area. (branch to overflowMBB)
    //
    // thisMBB
    //   |     .
    //   |        .
    //   offsetMBB   overflowMBB
    //   |        .
    //   |     .
    //   endMBB

    // Registers for the PHI in endMBB
    OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
    OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);

    const BasicBlock *LLVM_BB = MBB->getBasicBlock();
    MachineFunction *MF = MBB->getParent();
    overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
    endMBB = MF->CreateMachineBasicBlock(LLVM_BB);

    MachineFunction::iterator MBBIter = ++MBB->getIterator();

    // Insert the new basic blocks
    MF->insert(MBBIter, offsetMBB);
    MF->insert(MBBIter, overflowMBB);
    MF->insert(MBBIter, endMBB);

    // Transfer the remainder of MBB and its successor edges to endMBB.
    endMBB->splice(endMBB->begin(), thisMBB,
                   std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
    endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);

    // Make offsetMBB and overflowMBB successors of thisMBB
    thisMBB->addSuccessor(offsetMBB);
    thisMBB->addSuccessor(overflowMBB);

    // endMBB is a successor of both offsetMBB and overflowMBB
    offsetMBB->addSuccessor(endMBB);
    overflowMBB->addSuccessor(endMBB);

    // Load the offset value into a register
    OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Check if there is enough room left to pull this argument.
    BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
        .addReg(OffsetReg)
        .addImm(MaxOffset + 8 - ArgSizeA8);

    // Branch to "overflowMBB" if offset >= max
    // Fall through to "offsetMBB" otherwise
    BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
        .addMBB(overflowMBB);
  }

  // In offsetMBB, emit code to use the reg_save_area.
  if (offsetMBB) {
    assert(OffsetReg != 0);

    // Read the reg_save_area address.
    unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, 16)
        .add(Segment)
        .setMemRefs(MMOBegin, MMOEnd);

    // Zero-extend the offset
    unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
        .addImm(0)
        .addReg(OffsetReg)
        .addImm(X86::sub_32bit);

    // Add the offset to the reg_save_area to get the final address.
    BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
        .addReg(OffsetReg64)
        .addReg(RegSaveReg);

    // Compute the offset for the next argument
    unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
    BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
        .addReg(OffsetReg)
        .addImm(UseFPOffset ? 16 : 8);

    // Store it back into the va_list.
    BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
        .add(Base)
        .add(Scale)
        .add(Index)
        .addDisp(Disp, UseFPOffset ? 4 : 0)
        .add(Segment)
        .addReg(NextOffsetReg)
        .setMemRefs(MMOBegin, MMOEnd);

    // Jump to endMBB
    BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
        .addMBB(endMBB);
  }

  //
  // Emit code to use overflow area
  //

  // Load the overflow_area address into a register.
  unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we need to align it, do so. Otherwise, just copy the address
  // to OverflowDestReg.
  if (NeedsAlign) {
    // Align the overflow address
    assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
    unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);

    // aligned_addr = (addr + (align-1)) & ~(align-1)
    BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
        .addReg(OverflowAddrReg)
        .addImm(Align-1);

    BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
        .addReg(TmpReg)
        .addImm(~(uint64_t)(Align-1));
  } else {
    BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
        .addReg(OverflowAddrReg);
  }

  // Compute the next overflow address after this argument.
  // (the overflow address should be kept 8-byte aligned)
  unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
  BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
      .addReg(OverflowDestReg)
      .addImm(ArgSizeA8);

  // Store the new overflow address.
  BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
      .add(Base)
      .add(Scale)
      .add(Index)
      .addDisp(Disp, 8)
      .add(Segment)
      .addReg(NextAddrReg)
      .setMemRefs(MMOBegin, MMOEnd);

  // If we branched, emit the PHI to the front of endMBB.
  if (offsetMBB) {
    BuildMI(*endMBB, endMBB->begin(), DL,
            TII->get(X86::PHI), DestReg)
        .addReg(OffsetDestReg).addMBB(offsetMBB)
        .addReg(OverflowDestReg).addMBB(overflowMBB);
  }

  // Erase the pseudo instruction
  MI.eraseFromParent();

  return endMBB;
}
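// Worked example of the bound check above: for an i32 argument, ArgMode is 1,
// so MaxOffset = 6*8 = 48 and ArgSizeA8 = 8, making the compare immediate
// 48 + 8 - 8 = 48. gp_offset advances by 8 per GP register consumed, so once
// all six integer registers are used the JAE branch sends the load to the
// overflow area, matching the SysV x86-64 va_arg algorithm.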
MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
    MachineInstr &MI, MachineBasicBlock *MBB) const {
  // Emit code to save XMM registers to the stack. The ABI says that the
  // number of registers to save is given in %al, so it's theoretically
  // possible to do an indirect jump trick to avoid saving all of them,
  // however this code takes a simpler approach and just executes all
  // of the stores if %al is non-zero. It's less code, and it's probably
  // easier on the hardware branch predictor, and stores aren't all that
  // expensive anyway.

  // Create the new basic blocks. One block contains all the XMM stores,
  // and one block is the final destination regardless of whether any
  // stores were performed.
  const BasicBlock *LLVM_BB = MBB->getBasicBlock();
  MachineFunction *F = MBB->getParent();
  MachineFunction::iterator MBBIter = ++MBB->getIterator();
  MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
  MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
  F->insert(MBBIter, XMMSaveMBB);
  F->insert(MBBIter, EndMBB);

  // Transfer the remainder of MBB and its successor edges to EndMBB.
  EndMBB->splice(EndMBB->begin(), MBB,
                 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
  EndMBB->transferSuccessorsAndUpdatePHIs(MBB);

  // The original block will now fall through to the XMM save block.
  MBB->addSuccessor(XMMSaveMBB);
  // The XMMSaveMBB will fall through to the end block.
  XMMSaveMBB->addSuccessor(EndMBB);

  // Now add the instructions.
  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  DebugLoc DL = MI.getDebugLoc();

  unsigned CountReg = MI.getOperand(0).getReg();
  int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
  int64_t VarArgsFPOffset = MI.getOperand(2).getImm();

  if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
    // If %al is 0, branch around the XMM save block.
    BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
    BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
    MBB->addSuccessor(EndMBB);
  }

  // Make sure the last operand is EFLAGS, which gets clobbered by the branch
  // that was just emitted, but clearly shouldn't be "saved".
  assert((MI.getNumOperands() <= 3 ||
          !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
          MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
         "Expected last argument to be EFLAGS");
  unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
  // In the XMM save block, save all the XMM argument registers.
  for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
    int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
    MachineMemOperand *MMO = F->getMachineMemOperand(
        MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
        MachineMemOperand::MOStore,
        /*Size=*/16, /*Align=*/16);
    BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
        .addFrameIndex(RegSaveFrameIndex)
        .addImm(/*Scale=*/1)
        .addReg(/*IndexReg=*/0)
        .addImm(/*Disp=*/Offset)
        .addReg(/*Segment=*/0)
        .addReg(MI.getOperand(i).getReg())
        .addMemOperand(MMO);
  }

  MI.eraseFromParent(); // The pseudo instruction is gone now.
  return EndMBB;
}
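// ABI context for the %al test above: in the SysV x86-64 calling convention
// the caller of a varargs function passes the number of vector registers
// actually used in AL (e.g. "movb $2, %al" when two XMM registers carry
// arguments), so AL == 0 lets us branch around every store.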
// The EFLAGS operand of SelectItr might be missing a kill marker
// because there were multiple uses of EFLAGS, and ISel didn't know
// which to mark. Figure out whether SelectItr should have had a
// kill marker, and set it if it should. Returns the correct kill
// marker value.
static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
                                     MachineBasicBlock* BB,
                                     const TargetRegisterInfo* TRI) {
  // Scan forward through BB for a use/def of EFLAGS.
  MachineBasicBlock::iterator miI(std::next(SelectItr));
  for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
    const MachineInstr& mi = *miI;
    if (mi.readsRegister(X86::EFLAGS))
      return false;
    if (mi.definesRegister(X86::EFLAGS))
      break; // Should have kill-flag - update below.
  }

  // If we hit the end of the block, check whether EFLAGS is live into a
  // successor.
  if (miI == BB->end()) {
    for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
                                          sEnd = BB->succ_end();
         sItr != sEnd; ++sItr) {
      MachineBasicBlock* succ = *sItr;
      if (succ->isLiveIn(X86::EFLAGS))
        return false;
    }
  }

  // We found a def, or hit the end of the basic block and EFLAGS wasn't live
  // out. SelectMI should have a kill flag on EFLAGS.
  SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
  return true;
}
// Return true if it is OK for this CMOV pseudo-opcode to be cascaded
// together with other CMOV pseudo-opcodes into a single basic-block with
// a conditional jump around it.
static bool isCMOVPseudo(MachineInstr &MI) {
  switch (MI.getOpcode()) {
  case X86::CMOV_FR32:
  case X86::CMOV_FR64:
  case X86::CMOV_GR8:
  case X86::CMOV_GR16:
  case X86::CMOV_GR32:
  case X86::CMOV_RFP32:
  case X86::CMOV_RFP64:
  case X86::CMOV_RFP80:
  case X86::CMOV_V2F64:
  case X86::CMOV_V2I64:
  case X86::CMOV_V4F32:
  case X86::CMOV_V4F64:
  case X86::CMOV_V4I64:
  case X86::CMOV_V16F32:
  case X86::CMOV_V8F32:
  case X86::CMOV_V8F64:
  case X86::CMOV_V8I64:
  case X86::CMOV_V8I1:
  case X86::CMOV_V16I1:
  case X86::CMOV_V32I1:
  case X86::CMOV_V64I1:
    return true;

  default:
    return false;
  }
}
25598 MachineBasicBlock *
25599 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25600 MachineBasicBlock *BB) const {
25601 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25602 DebugLoc DL = MI.getDebugLoc();
25604 // To "insert" a SELECT_CC instruction, we actually have to insert the
25605 // diamond control-flow pattern. The incoming instruction knows the
25606 // destination vreg to set, the condition code register to branch on, the
25607 // true/false values to select between, and a branch opcode to use.
25608 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25609 MachineFunction::iterator It = ++BB->getIterator();
25614 // cmpTY ccX, r1, r2
25616 // fallthrough --> copy0MBB
25617 MachineBasicBlock *thisMBB = BB;
25618 MachineFunction *F = BB->getParent();
25620 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25621 // as described above, by inserting a BB, and then making a PHI at the join
25622 // point to select the true and false operands of the CMOV in the PHI.
25624 // The code also handles two different cases of multiple CMOV opcodes
25628 // In this case, there are multiple CMOVs in a row, all which are based on
25629 // the same condition setting (or the exact opposite condition setting).
25630 // In this case we can lower all the CMOVs using a single inserted BB, and
25631 // then make a number of PHIs at the join point to model the CMOVs. The only
25632 // trickiness here, is that in a case like:
25634 // t2 = CMOV cond1 t1, f1
25635 // t3 = CMOV cond1 t2, f2
25637 // when rewriting this into PHIs, we have to perform some renaming on the
25638 // temps since you cannot have a PHI operand refer to a PHI result earlier
25639 // in the same block. The "simple" but wrong lowering would be:
25641 // t2 = PHI t1(BB1), f1(BB2)
25642 // t3 = PHI t2(BB1), f2(BB2)
25644 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25645 // renaming is to note that on the path through BB1, t2 is really just a
25646 // copy of t1, and do that renaming, properly generating:
25648 // t2 = PHI t1(BB1), f1(BB2)
25649 // t3 = PHI t1(BB1), f2(BB2)
25651 // Case 2, we lower cascaded CMOVs such as
25653 // (CMOV (CMOV F, T, cc1), T, cc2)
25655 // to two successive branches. For that, we look for another CMOV as the
25656 // following instruction.
25658 // Without this, we would add a PHI between the two jumps, which ends up
25659 // creating a few copies all around. For instance, for
25661 // (sitofp (zext (fcmp une)))
25663 // we would generate:
25665 // ucomiss %xmm1, %xmm0
25666 // movss <1.0f>, %xmm0
25667 // movaps %xmm0, %xmm1
25669 // xorps %xmm1, %xmm1
25672 // movaps %xmm1, %xmm0
25676 // because this custom-inserter would have generated:
25688 // A: X = ...; Y = ...
25690 // C: Z = PHI [X, A], [Y, B]
25692 // E: PHI [X, C], [Z, D]
25694 // If we lower both CMOVs in a single step, we can instead generate:
25706 // A: X = ...; Y = ...
25708 // E: PHI [X, A], [X, C], [Y, D]
25710 // Which, in our sitofp/fcmp example, gives us something like:
25712 // ucomiss %xmm1, %xmm0
25713 // movss <1.0f>, %xmm0
25716 // xorps %xmm0, %xmm0
25720 MachineInstr *CascadedCMOV = nullptr;
25721 MachineInstr *LastCMOV = &MI;
25722 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25723 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25724 MachineBasicBlock::iterator NextMIIt =
25725 std::next(MachineBasicBlock::iterator(MI));
25727 // Check for case 1, where there are multiple CMOVs with the same condition
25728 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25729 // number of jumps the most.
25731 if (isCMOVPseudo(MI)) {
25732 // See if we have a string of CMOVS with the same condition.
25733 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25734 (NextMIIt->getOperand(3).getImm() == CC ||
25735 NextMIIt->getOperand(3).getImm() == OppCC)) {
25736 LastCMOV = &*NextMIIt;
25741 // This checks for case 2, but only do this if we didn't already find
25742 // case 1, as indicated by LastCMOV == MI.
25743 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25744 NextMIIt->getOpcode() == MI.getOpcode() &&
25745 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25746 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25747 NextMIIt->getOperand(1).isKill()) {
25748 CascadedCMOV = &*NextMIIt;
25751 MachineBasicBlock *jcc1MBB = nullptr;
25753 // If we have a cascaded CMOV, we lower it to two successive branches to
25754 // the same block. EFLAGS is used by both, so mark it as live in the second.
25755 if (CascadedCMOV) {
25756 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25757 F->insert(It, jcc1MBB);
25758 jcc1MBB->addLiveIn(X86::EFLAGS);
25761 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25762 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25763 F->insert(It, copy0MBB);
25764 F->insert(It, sinkMBB);
25766 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25767 // live into the sink and copy blocks.
25768 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25770 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25771 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25772 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25773 copy0MBB->addLiveIn(X86::EFLAGS);
25774 sinkMBB->addLiveIn(X86::EFLAGS);
25777 // Transfer the remainder of BB and its successor edges to sinkMBB.
25778 sinkMBB->splice(sinkMBB->begin(), BB,
25779 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25780 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25782 // Add the true and fallthrough blocks as its successors.
25783 if (CascadedCMOV) {
25784 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25785 BB->addSuccessor(jcc1MBB);
25787 // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
25788 // jump to the sinkMBB.
25789 jcc1MBB->addSuccessor(copy0MBB);
25790 jcc1MBB->addSuccessor(sinkMBB);
25792 BB->addSuccessor(copy0MBB);
25795 // The true block target of the first (or only) branch is always sinkMBB.
25796 BB->addSuccessor(sinkMBB);
25798 // Create the conditional branch instruction.
25799 unsigned Opc = X86::GetCondBranchFromCond(CC);
25800 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25802 if (CascadedCMOV) {
25803 unsigned Opc2 = X86::GetCondBranchFromCond(
25804 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25805 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25809 // %FalseValue = ...
25810 // # fallthrough to sinkMBB
25811 copy0MBB->addSuccessor(sinkMBB);
25814 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25816 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25817 MachineBasicBlock::iterator MIItEnd =
25818 std::next(MachineBasicBlock::iterator(LastCMOV));
25819 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25820 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25821 MachineInstrBuilder MIB;
25823 // As we are creating the PHIs, we have to be careful if there is more than
25824 // one. Later CMOVs may reference the results of earlier CMOVs, but later
25825 // PHIs have to reference the individual true/false inputs from earlier PHIs.
25826 // That also means that PHI construction must work forward from earlier to
25827 // later, and that the code must maintain a mapping from each earlier PHI's
25828 // destination register to the true/false registers that went into that PHI.
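// As an illustrative sketch (register names hypothetical), two cascading
// CMOVs such as
//   %t0 = CMOV %f0,  %t0src, CC
//   %t1 = CMOV %t0,  %t1src, CC
// become PHIs in sinkMBB where the second PHI re-uses the first PHI's
// *inputs* rather than its result:
//   %t0 = PHI [ %f0, copy0MBB ], [ %t0src, thisMBB ]
//   %t1 = PHI [ %f0, copy0MBB ], [ %t1src, thisMBB ]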
25830 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25831 unsigned DestReg = MIIt->getOperand(0).getReg();
25832 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25833 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25835 // If this CMOV we are generating is the opposite condition from
25836 // the jump we generated, then we have to swap the operands for the
25837 // PHI that is going to be generated.
25838 if (MIIt->getOperand(3).getImm() == OppCC)
25839 std::swap(Op1Reg, Op2Reg);
25841 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25842 Op1Reg = RegRewriteTable[Op1Reg].first;
25844 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25845 Op2Reg = RegRewriteTable[Op2Reg].second;
25847 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25848 TII->get(X86::PHI), DestReg)
25849 .addReg(Op1Reg).addMBB(copy0MBB)
25850 .addReg(Op2Reg).addMBB(thisMBB);
25852 // Add this PHI to the rewrite table.
25853 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25856 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25857 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25858 if (CascadedCMOV) {
25859 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25860 // Copy the PHI result to the register defined by the second CMOV.
25861 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25862 DL, TII->get(TargetOpcode::COPY),
25863 CascadedCMOV->getOperand(0).getReg())
25864 .addReg(MI.getOperand(0).getReg());
25865 CascadedCMOV->eraseFromParent();
25868 // Now remove the CMOV(s).
25869 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25870 (MIIt++)->eraseFromParent();
25872 return sinkMBB;
25873 }
25875 MachineBasicBlock *
25876 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25877 MachineBasicBlock *BB) const {
25878 // Combine the following atomic floating-point modification pattern:
25879 // a.store(reg OP a.load(acquire), release)
25880 // Transform it into:
25881 // OPss (%gpr), %xmm
25882 // movss %xmm, (%gpr)
25883 // Or sd equivalent for 64-bit operations.
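// Illustrative example (names hypothetical): for an IR pattern like
//   %old = load atomic float, float* %p acquire
//   %new = fadd float %v, %old
//   store atomic float %new, float* %p release
// we emit roughly
//   addss (%gpr), %xmm    // %xmm holds %v, %gpr holds %p
//   movss %xmm, (%gpr)
// relying on x86's strong memory ordering for plain loads and stores.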
25884 unsigned MOp, FOp;
25885 switch (MI.getOpcode()) {
25886 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25887 case X86::RELEASE_FADD32mr:
25888 FOp = X86::ADDSSrm;
25889 MOp = X86::MOVSSmr;
25890 break;
25891 case X86::RELEASE_FADD64mr:
25892 FOp = X86::ADDSDrm;
25893 MOp = X86::MOVSDmr;
25894 break;
25895 }
25896 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25897 DebugLoc DL = MI.getDebugLoc();
25898 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25899 unsigned ValOpIdx = X86::AddrNumOperands;
25900 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25901 MachineInstrBuilder MIB =
25902 BuildMI(*BB, MI, DL, TII->get(FOp),
25903 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25904 .addReg(VSrc);
25905 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25906 MachineOperand &Operand = MI.getOperand(i);
25907 // Clear any kill flags on register operands as we'll create a second
25908 // instruction using the same address operands.
25909 if (Operand.isReg())
25910 Operand.setIsKill(false);
25911 MIB.add(Operand);
25912 }
25913 MachineInstr *FOpMI = MIB;
25914 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25915 for (int i = 0; i < X86::AddrNumOperands; ++i)
25916 MIB.add(MI.getOperand(i));
25917 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25918 MI.eraseFromParent(); // The pseudo instruction is gone now.
25920 return BB;
25921 }
25922 MachineBasicBlock *
25923 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25924 MachineBasicBlock *BB) const {
25925 MachineFunction *MF = BB->getParent();
25926 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25927 DebugLoc DL = MI.getDebugLoc();
25928 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25930 assert(MF->shouldSplitStack());
25932 const bool Is64Bit = Subtarget.is64Bit();
25933 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25935 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25936 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
25938 // BB:
25939 //  ... [Till the alloca]
25940 // If stacklet is not large enough, jump to mallocMBB
25942 // bumpMBB:
25943 //  Allocate by subtracting from RSP
25944 //  Jump to continueMBB
25946 // mallocMBB:
25947 //  Allocate by call to runtime
25949 // continueMBB:
25950 //  ...
25951 //  [rest of original BB]
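// A minimal sketch of the emitted limit check (x86-64 LP64, AT&T syntax,
// illustrative only -- the stack limit lives in the TLS slot computed above):
//   movq %rsp, %tmp
//   subq %size, %tmp          // tmp = SP - alloca size
//   cmpq %tmp, %fs:0x70       // compare against the stacklet limit
//   jg   mallocMBB            // limit above new SP -> allocate via runtime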
25954 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25955 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25956 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25958 MachineRegisterInfo &MRI = MF->getRegInfo();
25959 const TargetRegisterClass *AddrRegClass =
25960 getRegClassFor(getPointerTy(MF->getDataLayout()));
25962 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25963 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25964 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25965 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25966 sizeVReg = MI.getOperand(1).getReg(),
25967 physSPReg =
25968 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
25970 MachineFunction::iterator MBBIter = ++BB->getIterator();
25972 MF->insert(MBBIter, bumpMBB);
25973 MF->insert(MBBIter, mallocMBB);
25974 MF->insert(MBBIter, continueMBB);
25976 continueMBB->splice(continueMBB->begin(), BB,
25977 std::next(MachineBasicBlock::iterator(MI)), BB->end());
25978 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
25980 // Add code to the main basic block to check if the stack limit has been hit
25981 // and, if so, jump to mallocMBB, otherwise to bumpMBB.
25982 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
25983 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
25984 .addReg(tmpSPVReg).addReg(sizeVReg);
25985 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
25986 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
25987 .addReg(SPLimitVReg);
25988 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
25990 // bumpMBB simply decreases the stack pointer, since we know the current
25991 // stacklet has enough space.
25992 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
25993 .addReg(SPLimitVReg);
25994 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
25995 .addReg(SPLimitVReg);
25996 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
25998 // Calls into a routine in libgcc to allocate more space from the heap.
25999 const uint32_t *RegMask =
26000 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26001 if (IsLP64) {
26002 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26003 .addReg(sizeVReg);
26004 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26005 .addExternalSymbol("__morestack_allocate_stack_space")
26006 .addRegMask(RegMask)
26007 .addReg(X86::RDI, RegState::Implicit)
26008 .addReg(X86::RAX, RegState::ImplicitDefine);
26009 } else if (Is64Bit) {
26010 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26011 .addReg(sizeVReg);
26012 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26013 .addExternalSymbol("__morestack_allocate_stack_space")
26014 .addRegMask(RegMask)
26015 .addReg(X86::EDI, RegState::Implicit)
26016 .addReg(X86::EAX, RegState::ImplicitDefine);
26017 } else {
26018 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26019 .addImm(16);
26020 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26021 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26022 .addExternalSymbol("__morestack_allocate_stack_space")
26023 .addRegMask(RegMask)
26024 .addReg(X86::EAX, RegState::ImplicitDefine);
26028 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26029 .addImm(16);
26030 }
26031 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26032 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26033 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26035 // Set up the CFG correctly.
26036 BB->addSuccessor(bumpMBB);
26037 BB->addSuccessor(mallocMBB);
26038 mallocMBB->addSuccessor(continueMBB);
26039 bumpMBB->addSuccessor(continueMBB);
26041 // Take care of the PHI nodes.
26042 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26043 MI.getOperand(0).getReg())
26044 .addReg(mallocPtrVReg)
26045 .addMBB(mallocMBB)
26046 .addReg(bumpSPPtrVReg)
26047 .addMBB(bumpMBB);
26049 // Delete the original pseudo instruction.
26050 MI.eraseFromParent();
26053 return continueMBB;
26056 MachineBasicBlock *
26057 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26058 MachineBasicBlock *BB) const {
26059 MachineFunction *MF = BB->getParent();
26060 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26061 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26062 DebugLoc DL = MI.getDebugLoc();
26064 assert(!isAsynchronousEHPersonality(
26065 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26066 "SEH does not use catchret!");
26068 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26069 if (!Subtarget.is32Bit())
26070 return BB;
26072 // C++ EH creates a new target block to hold the restore code, and wires up
26073 // the new block to the return destination with a normal JMP_4.
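// Sketch of the resulting CFG (illustrative):
//   BB:         ... CATCHRET -> RestoreMBB
//   RestoreMBB: EH_RESTORE        // re-establishes ESP/EBP for 32-bit EH
//               JMP_4 TargetMBB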
26074 MachineBasicBlock *RestoreMBB =
26075 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26076 assert(BB->succ_size() == 1);
26077 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26078 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26079 BB->addSuccessor(RestoreMBB);
26080 MI.getOperand(0).setMBB(RestoreMBB);
26082 auto RestoreMBBI = RestoreMBB->begin();
26083 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26084 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26085 return BB;
26086 }
26088 MachineBasicBlock *
26089 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26090 MachineBasicBlock *BB) const {
26091 MachineFunction *MF = BB->getParent();
26092 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26093 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26094 // Only 32-bit SEH requires special handling for catchpad.
26095 if (IsSEH && Subtarget.is32Bit()) {
26096 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26097 DebugLoc DL = MI.getDebugLoc();
26098 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26099 }
26100 MI.eraseFromParent();
26101 return BB;
26102 }
26104 MachineBasicBlock *
26105 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26106 MachineBasicBlock *BB) const {
26107 // So, here we replace TLSADDR with the sequence:
26108 // adjust_stackdown -> TLSADDR -> adjust_stackup.
26109 // We need this because TLSADDR is lowered into a call
26110 // inside MC; therefore, without the two markers, shrink-wrapping
26111 // may push the prologue/epilogue past them.
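// The resulting MI sequence is roughly (illustrative):
//   ADJCALLSTACKDOWN 0, 0, 0   // CALLSEQ_START marker
//   TLSADDR ...                // becomes a call to __tls_get_addr in MC
//   ADJCALLSTACKUP 0, 0        // CALLSEQ_END marker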
26112 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26113 DebugLoc DL = MI.getDebugLoc();
26114 MachineFunction &MF = *BB->getParent();
26116 // Emit CALLSEQ_START right before the instruction.
26117 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26118 MachineInstrBuilder CallseqStart =
26119 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26120 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26122 // Emit CALLSEQ_END right after the instruction.
26123 // We don't call erase from parent because we want to keep the
26124 // original instruction around.
26125 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26126 MachineInstrBuilder CallseqEnd =
26127 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26128 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26130 return BB;
26131 }
26133 MachineBasicBlock *
26134 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26135 MachineBasicBlock *BB) const {
26136 // This is pretty easy. We're taking the value that we received from
26137 // our load from the relocation, sticking it in either RDI (x86-64)
26138 // or EAX and doing an indirect call. The return value will then
26139 // be in the normal return register.
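// On x86-64 Darwin this amounts to (illustrative):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)              // result lands in %rax
// with the 32-bit variants using %eax in place of %rdi/%rax.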
26140 MachineFunction *F = BB->getParent();
26141 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26142 DebugLoc DL = MI.getDebugLoc();
26144 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26145 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26147 // Get a register mask for the lowered call.
26148 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26149 // proper register mask.
26150 const uint32_t *RegMask =
26151 Subtarget.is64Bit() ?
26152 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26153 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26154 if (Subtarget.is64Bit()) {
26155 MachineInstrBuilder MIB =
26156 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26160 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26161 MI.getOperand(3).getTargetFlags())
26163 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26164 addDirectMem(MIB, X86::RDI);
26165 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26166 } else if (!isPositionIndependent()) {
26167 MachineInstrBuilder MIB =
26168 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26172 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26173 MI.getOperand(3).getTargetFlags())
26175 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26176 addDirectMem(MIB, X86::EAX);
26177 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26179 MachineInstrBuilder MIB =
26180 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26181 .addReg(TII->getGlobalBaseReg(F))
26184 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26185 MI.getOperand(3).getTargetFlags())
26187 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26188 addDirectMem(MIB, X86::EAX);
26189 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26192 MI.eraseFromParent(); // The pseudo instruction is gone now.
26193 return BB;
26194 }
26196 MachineBasicBlock *
26197 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26198 MachineBasicBlock *MBB) const {
26199 DebugLoc DL = MI.getDebugLoc();
26200 MachineFunction *MF = MBB->getParent();
26201 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26202 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26203 MachineRegisterInfo &MRI = MF->getRegInfo();
26205 const BasicBlock *BB = MBB->getBasicBlock();
26206 MachineFunction::iterator I = ++MBB->getIterator();
26208 // Memory Reference
26209 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26210 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26212 unsigned DstReg;
26213 unsigned MemOpndSlot = 0;
26215 unsigned CurOp = 0;
26217 DstReg = MI.getOperand(CurOp++).getReg();
26218 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26219 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26221 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26222 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26224 MemOpndSlot = CurOp;
26226 MVT PVT = getPointerTy(MF->getDataLayout());
26227 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26228 "Invalid Pointer Size!");
26230 // For v = setjmp(buf), we generate
26232 // thisMBB:
26233 // buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
26234 // SjLjSetup restoreMBB
26236 // mainMBB:
26237 // v_main = 0
26239 // sinkMBB:
26240 // v = phi(main, restore)
26242 // restoreMBB:
26243 // if base pointer being used, load it from frame
26244 // v_restore = 1
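// Buffer slots used by this lowering (a sketch inferred from the offsets
// here and in emitEHSjLjLongJmp; the exact layout is an assumption):
//   buf[0]             frame pointer
//   buf[LabelOffset]   resume address (restoreMBB), at 1 * PtrSize
//   buf[2 * PtrSize]   stack pointer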
26246 MachineBasicBlock *thisMBB = MBB;
26247 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26248 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26249 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26250 MF->insert(I, mainMBB);
26251 MF->insert(I, sinkMBB);
26252 MF->push_back(restoreMBB);
26253 restoreMBB->setHasAddressTaken();
26255 MachineInstrBuilder MIB;
26257 // Transfer the remainder of BB and its successor edges to sinkMBB.
26258 sinkMBB->splice(sinkMBB->begin(), MBB,
26259 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26260 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26263 unsigned PtrStoreOpc = 0;
26264 unsigned LabelReg = 0;
26265 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26266 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26267 !isPositionIndependent();
26269 // Prepare IP either in reg or imm.
26270 if (!UseImmLabel) {
26271 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26272 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26273 LabelReg = MRI.createVirtualRegister(PtrRC);
26274 if (Subtarget.is64Bit()) {
26275 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26276 .addReg(X86::RIP)
26277 .addImm(0)
26278 .addReg(0)
26279 .addMBB(restoreMBB)
26280 .addReg(0);
26281 } else {
26282 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26283 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26284 .addReg(XII->getGlobalBaseReg(MF))
26285 .addImm(0)
26286 .addReg(0)
26287 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26288 .addReg(0);
26289 }
26290 } else
26291 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26293 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26294 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26295 if (i == X86::AddrDisp)
26296 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26297 else
26298 MIB.add(MI.getOperand(MemOpndSlot + i));
26299 }
26300 if (!UseImmLabel)
26301 MIB.addReg(LabelReg);
26302 else
26303 MIB.addMBB(restoreMBB);
26304 MIB.setMemRefs(MMOBegin, MMOEnd);
26306 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26307 .addMBB(restoreMBB);
26309 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26310 MIB.addRegMask(RegInfo->getNoPreservedMask());
26311 thisMBB->addSuccessor(mainMBB);
26312 thisMBB->addSuccessor(restoreMBB);
26316 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26317 mainMBB->addSuccessor(sinkMBB);
26320 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26321 TII->get(X86::PHI), DstReg)
26322 .addReg(mainDstReg).addMBB(mainMBB)
26323 .addReg(restoreDstReg).addMBB(restoreMBB);
26326 if (RegInfo->hasBasePointer(*MF)) {
26327 const bool Uses64BitFramePtr =
26328 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26329 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26330 X86FI->setRestoreBasePointer(MF);
26331 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26332 unsigned BasePtr = RegInfo->getBaseRegister();
26333 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26334 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26335 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26336 .setMIFlag(MachineInstr::FrameSetup);
26338 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26339 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26340 restoreMBB->addSuccessor(sinkMBB);
26342 MI.eraseFromParent();
26343 return sinkMBB;
26344 }
26346 MachineBasicBlock *
26347 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26348 MachineBasicBlock *MBB) const {
26349 DebugLoc DL = MI.getDebugLoc();
26350 MachineFunction *MF = MBB->getParent();
26351 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26352 MachineRegisterInfo &MRI = MF->getRegInfo();
26354 // Memory Reference
26355 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26356 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26358 MVT PVT = getPointerTy(MF->getDataLayout());
26359 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26360 "Invalid Pointer Size!");
26362 const TargetRegisterClass *RC =
26363 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26364 unsigned Tmp = MRI.createVirtualRegister(RC);
26365 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26366 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26367 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26368 unsigned SP = RegInfo->getStackRegister();
26370 MachineInstrBuilder MIB;
26372 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26373 const int64_t SPOffset = 2 * PVT.getStoreSize();
26375 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26376 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
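// The emitted longjmp sequence is roughly (x86-64, illustrative; "buf" is
// the address formed by the instruction's memory operands):
//   movq (buf), %rbp       // reload frame pointer
//   movq 8(buf), %tmp      // reload resume address (LabelOffset)
//   movq 16(buf), %rsp     // reload stack pointer (SPOffset)
//   jmpq *%tmp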
26379 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26380 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26381 MIB.add(MI.getOperand(i));
26382 MIB.setMemRefs(MMOBegin, MMOEnd);
26384 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26385 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26386 if (i == X86::AddrDisp)
26387 MIB.addDisp(MI.getOperand(i), LabelOffset);
26388 else
26389 MIB.add(MI.getOperand(i));
26390 }
26391 MIB.setMemRefs(MMOBegin, MMOEnd);
26393 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26394 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26395 if (i == X86::AddrDisp)
26396 MIB.addDisp(MI.getOperand(i), SPOffset);
26397 else
26398 MIB.add(MI.getOperand(i));
26399 }
26400 MIB.setMemRefs(MMOBegin, MMOEnd);
26402 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26404 MI.eraseFromParent();
26405 return MBB;
26406 }
26408 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26409 MachineBasicBlock *MBB,
26410 MachineBasicBlock *DispatchBB,
26412 DebugLoc DL = MI.getDebugLoc();
26413 MachineFunction *MF = MBB->getParent();
26414 MachineRegisterInfo *MRI = &MF->getRegInfo();
26415 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26417 MVT PVT = getPointerTy(MF->getDataLayout());
26418 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26419 unsigned Op = 0;
26420 unsigned VR = 0;
26423 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26424 !isPositionIndependent();
26426 if (UseImmLabel) {
26427 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26428 } else {
26429 const TargetRegisterClass *TRC =
26430 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26431 VR = MRI->createVirtualRegister(TRC);
26432 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26434 if (Subtarget.is64Bit())
26435 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26439 .addMBB(DispatchBB)
26442 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26443 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26446 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26450 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26451 addFrameReference(MIB, FI, 36);
26452 if (UseImmLabel)
26453 MIB.addMBB(DispatchBB);
26454 else
26455 MIB.addReg(VR);
26456 }
26458 MachineBasicBlock *
26459 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26460 MachineBasicBlock *BB) const {
26461 DebugLoc DL = MI.getDebugLoc();
26462 MachineFunction *MF = BB->getParent();
26463 MachineFrameInfo &MFI = MF->getFrameInfo();
26464 MachineRegisterInfo *MRI = &MF->getRegInfo();
26465 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26466 int FI = MFI.getFunctionContextIndex();
26468 // Get a mapping of the call site numbers to all of the landing pads they're
26469 // associated with.
26470 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26471 unsigned MaxCSNum = 0;
26472 for (auto &MBB : *MF) {
26473 if (!MBB.isEHPad())
26474 continue;
26476 MCSymbol *Sym = nullptr;
26477 for (const auto &MI : MBB) {
26478 if (MI.isDebugValue())
26479 continue;
26481 assert(MI.isEHLabel() && "expected EH_LABEL");
26482 Sym = MI.getOperand(0).getMCSymbol();
26483 break;
26484 }
26486 if (!MF->hasCallSiteLandingPad(Sym))
26487 continue;
26489 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26490 CallSiteNumToLPad[CSI].push_back(&MBB);
26491 MaxCSNum = std::max(MaxCSNum, CSI);
26495 // Get an ordered list of the machine basic blocks for the jump table.
26496 std::vector<MachineBasicBlock *> LPadList;
26497 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26498 LPadList.reserve(CallSiteNumToLPad.size());
26500 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26501 for (auto &LP : CallSiteNumToLPad[CSI]) {
26502 LPadList.push_back(LP);
26503 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26507 assert(!LPadList.empty() &&
26508 "No landing pad destinations for the dispatch jump table!");
26510 // Create the MBBs for the dispatch code.
26512 // Shove the dispatch's address into the return slot in the function context.
26513 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26514 DispatchBB->setIsEHPad(true);
26516 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26517 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26518 DispatchBB->addSuccessor(TrapBB);
26520 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26521 DispatchBB->addSuccessor(DispContBB);
26524 MF->push_back(DispatchBB);
26525 MF->push_back(DispContBB);
26526 MF->push_back(TrapBB);
26528 // Insert code into the entry block that creates and registers the function
26529 // context.
26530 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26532 // Create the jump table and associated information
26533 MachineJumpTableInfo *JTI =
26534 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26535 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26537 const X86RegisterInfo &RI = TII->getRegisterInfo();
26538 // Add a register mask with no preserved registers. This results in all
26539 // registers being marked as clobbered.
26540 if (RI.hasBasePointer(*MF)) {
26541 const bool FPIs64Bit =
26542 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26543 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26544 MFI->setRestoreBasePointer(MF);
26546 unsigned FP = RI.getFrameRegister(*MF);
26547 unsigned BP = RI.getBaseRegister();
26548 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26549 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26550 MFI->getRestoreBasePointerOffset())
26551 .addRegMask(RI.getNoPreservedMask());
26553 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26554 .addRegMask(RI.getNoPreservedMask());
26557 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26558 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26559 Subtarget.is64Bit() ? 8 : 4);
26560 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26561 .addReg(IReg)
26562 .addImm(LPadList.size());
26563 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
26565 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26566 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26567 .addReg(IReg)
26568 .addImm(1);
26569 BuildMI(DispContBB, DL,
26570 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26571 .addReg(0)
26572 .addImm(Subtarget.is64Bit() ? 8 : 4)
26573 .addReg(JReg)
26574 .addJumpTableIndex(MJTI)
26575 .addReg(0);
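// Putting it together, the dispatcher is roughly (x86-64, illustrative;
// <csoffset> and the register choices are hypothetical):
//   movl  <csoffset>(%fctx), %ecx   // call-site index from function context
//   cmpl  $NumLPads, %ecx
//   ja    TrapBB                    // out-of-range index -> trap
//   subl  $1, %ecx                  // call-site indices are 1-based
//   jmpq  *LJTI(, %rcx, 8)          // indirect jump through the jump table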
26577 // Add the jump table entries as successors to the MBB.
26578 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26579 for (auto &LP : LPadList)
26580 if (SeenMBBs.insert(LP).second)
26581 DispContBB->addSuccessor(LP);
26583 // N.B. the order the invoke BBs are processed in doesn't matter here.
26584 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26585 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26586 for (MachineBasicBlock *MBB : InvokeBBs) {
26587 // Remove the landing pad successor from the invoke block and replace it
26588 // with the new dispatch block.
26589 // Keep a copy of Successors since it's modified inside the loop.
26590 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26592 // FIXME: Avoid quadratic complexity.
26593 for (auto MBBS : Successors) {
26594 if (MBBS->isEHPad()) {
26595 MBB->removeSuccessor(MBBS);
26596 MBBLPads.push_back(MBBS);
26600 MBB->addSuccessor(DispatchBB);
26602 // Find the invoke call and mark all of the callee-saved registers as
26603 // 'implicit defined' so that they're spilled. This prevents code from
26604 // moving instructions to before the EH block, where they will never be
26605 // executed.
26606 for (auto &II : reverse(*MBB)) {
26607 if (!II.isCall())
26608 continue;
26610 DenseMap<unsigned, bool> DefRegs;
26611 for (auto &MOp : II.operands())
26612 if (MOp.isReg())
26613 DefRegs[MOp.getReg()] = true;
26615 MachineInstrBuilder MIB(*MF, &II);
26616 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26617 unsigned Reg = SavedRegs[RI];
26618 if (!DefRegs[Reg])
26619 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26626 // Mark all former landing pads as non-landing pads. The dispatch is the only
26627 // landing pad now.
26628 for (auto &LP : MBBLPads)
26629 LP->setIsEHPad(false);
26631 // The instruction is gone now.
26632 MI.eraseFromParent();
26634 return BB;
26635 }
26636 MachineBasicBlock *
26637 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26638 MachineBasicBlock *BB) const {
26639 MachineFunction *MF = BB->getParent();
26640 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26641 DebugLoc DL = MI.getDebugLoc();
26643 switch (MI.getOpcode()) {
26644 default: llvm_unreachable("Unexpected instr type to insert");
26645 case X86::TAILJMPd64:
26646 case X86::TAILJMPr64:
26647 case X86::TAILJMPm64:
26648 case X86::TAILJMPr64_REX:
26649 case X86::TAILJMPm64_REX:
26650 llvm_unreachable("TAILJMP64 would not be touched here.");
26651 case X86::TCRETURNdi64:
26652 case X86::TCRETURNri64:
26653 case X86::TCRETURNmi64:
26654 return BB;
26655 case X86::TLS_addr32:
26656 case X86::TLS_addr64:
26657 case X86::TLS_base_addr32:
26658 case X86::TLS_base_addr64:
26659 return EmitLoweredTLSAddr(MI, BB);
26660 case X86::CATCHRET:
26661 return EmitLoweredCatchRet(MI, BB);
26662 case X86::CATCHPAD:
26663 return EmitLoweredCatchPad(MI, BB);
26664 case X86::SEG_ALLOCA_32:
26665 case X86::SEG_ALLOCA_64:
26666 return EmitLoweredSegAlloca(MI, BB);
26667 case X86::TLSCall_32:
26668 case X86::TLSCall_64:
26669 return EmitLoweredTLSCall(MI, BB);
26670 case X86::CMOV_FR32:
26671 case X86::CMOV_FR64:
26672 case X86::CMOV_FR128:
26673 case X86::CMOV_GR8:
26674 case X86::CMOV_GR16:
26675 case X86::CMOV_GR32:
26676 case X86::CMOV_RFP32:
26677 case X86::CMOV_RFP64:
26678 case X86::CMOV_RFP80:
26679 case X86::CMOV_V2F64:
26680 case X86::CMOV_V2I64:
26681 case X86::CMOV_V4F32:
26682 case X86::CMOV_V4F64:
26683 case X86::CMOV_V4I64:
26684 case X86::CMOV_V16F32:
26685 case X86::CMOV_V8F32:
26686 case X86::CMOV_V8F64:
26687 case X86::CMOV_V8I64:
26688 case X86::CMOV_V8I1:
26689 case X86::CMOV_V16I1:
26690 case X86::CMOV_V32I1:
26691 case X86::CMOV_V64I1:
26692 return EmitLoweredSelect(MI, BB);
26694 case X86::RDFLAGS32:
26695 case X86::RDFLAGS64: {
26696 unsigned PushF =
26697 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26698 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26699 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26700 // Permit reads of the FLAGS register without it being defined.
26701 // This intrinsic exists to read external processor state in flags, such as
26702 // the trap flag, interrupt flag, and direction flag, none of which are
26703 // modeled by the backend.
26704 Push->getOperand(2).setIsUndef();
26705 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
26707 MI.eraseFromParent(); // The pseudo is gone now.
26711 case X86::WRFLAGS32:
26712 case X86::WRFLAGS64: {
26713 unsigned Push =
26714 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26715 unsigned PopF =
26716 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26717 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26718 BuildMI(*BB, MI, DL, TII->get(PopF));
26720 MI.eraseFromParent(); // The pseudo is gone now.
26724 case X86::RELEASE_FADD32mr:
26725 case X86::RELEASE_FADD64mr:
26726 return EmitLoweredAtomicFP(MI, BB);
26728 case X86::FP32_TO_INT16_IN_MEM:
26729 case X86::FP32_TO_INT32_IN_MEM:
26730 case X86::FP32_TO_INT64_IN_MEM:
26731 case X86::FP64_TO_INT16_IN_MEM:
26732 case X86::FP64_TO_INT32_IN_MEM:
26733 case X86::FP64_TO_INT64_IN_MEM:
26734 case X86::FP80_TO_INT16_IN_MEM:
26735 case X86::FP80_TO_INT32_IN_MEM:
26736 case X86::FP80_TO_INT64_IN_MEM: {
26737 // Change the floating point control register to use "round towards zero"
26738 // mode when truncating to an integer value.
26739 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26740 addFrameReference(BuildMI(*BB, MI, DL,
26741 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26743 // Load the old value of the control word...
26744 unsigned OldCW =
26745 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26746 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26749 // Set the high part to be round to zero...
26750 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26751 .addImm(0xC7F);
26753 // Reload the modified control word now...
26754 addFrameReference(BuildMI(*BB, MI, DL,
26755 TII->get(X86::FLDCW16m)), CWFrameIdx);
26757 // Restore the memory image of the control word to its original value.
26758 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26759 .addReg(OldCW);
26761 // Get the X86 opcode to use.
26762 unsigned Opc;
26763 switch (MI.getOpcode()) {
26764 default: llvm_unreachable("illegal opcode!");
26765 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26766 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26767 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26768 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26769 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26770 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26771 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26772 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26773 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26776 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26777 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26778 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26780 // Reload the original control word now.
26781 addFrameReference(BuildMI(*BB, MI, DL,
26782 TII->get(X86::FLDCW16m)), CWFrameIdx);
26784 MI.eraseFromParent(); // The pseudo instruction is gone now.
26787 // String/text processing lowering.
26788 case X86::PCMPISTRM128REG:
26789 case X86::VPCMPISTRM128REG:
26790 case X86::PCMPISTRM128MEM:
26791 case X86::VPCMPISTRM128MEM:
26792 case X86::PCMPESTRM128REG:
26793 case X86::VPCMPESTRM128REG:
26794 case X86::PCMPESTRM128MEM:
26795 case X86::VPCMPESTRM128MEM:
26796 assert(Subtarget.hasSSE42() &&
26797 "Target must have SSE4.2 or AVX features enabled");
26798 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26800 // String/text processing lowering.
26801 case X86::PCMPISTRIREG:
26802 case X86::VPCMPISTRIREG:
26803 case X86::PCMPISTRIMEM:
26804 case X86::VPCMPISTRIMEM:
26805 case X86::PCMPESTRIREG:
26806 case X86::VPCMPESTRIREG:
26807 case X86::PCMPESTRIMEM:
26808 case X86::VPCMPESTRIMEM:
26809 assert(Subtarget.hasSSE42() &&
26810 "Target must have SSE4.2 or AVX features enabled");
26811 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26813 // Thread synchronization.
26814 case X86::MONITOR:
26815 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26816 case X86::MONITORX:
26817 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26819 // Cache line zero
26820 case X86::CLZERO:
26821 return emitClzero(&MI, BB, Subtarget);
26823 // PKU feature
26824 case X86::WRPKRU:
26825 return emitWRPKRU(MI, BB, Subtarget);
26826 case X86::RDPKRU:
26827 return emitRDPKRU(MI, BB, Subtarget);
26829 case X86::XBEGIN:
26830 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26832 case X86::VASTART_SAVE_XMM_REGS:
26833 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26835 case X86::VAARG_64:
26836 return EmitVAARG64WithCustomInserter(MI, BB);
26838 case X86::EH_SjLj_SetJmp32:
26839 case X86::EH_SjLj_SetJmp64:
26840 return emitEHSjLjSetJmp(MI, BB);
26842 case X86::EH_SjLj_LongJmp32:
26843 case X86::EH_SjLj_LongJmp64:
26844 return emitEHSjLjLongJmp(MI, BB);
26846 case X86::Int_eh_sjlj_setup_dispatch:
26847 return EmitSjLjDispatchBlock(MI, BB);
26849 case TargetOpcode::STATEPOINT:
26850 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26851 // this point in the process. We diverge later.
26852 return emitPatchPoint(MI, BB);
26854 case TargetOpcode::STACKMAP:
26855 case TargetOpcode::PATCHPOINT:
26856 return emitPatchPoint(MI, BB);
26858 case TargetOpcode::PATCHABLE_EVENT_CALL:
26859 // Do nothing here; this is handled in the xray instrumentation pass.
26860 return BB;
26862 case X86::LCMPXCHG8B: {
26863 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
26864 // In addition to the four E[ABCD] registers implied by encoding, CMPXCHG8B
26865 // requires a memory operand. If the current architecture happens to be
26866 // i686 and the current function needs a base pointer
26867 // - which is ESI for i686 - the register allocator would not be able to
26868 // allocate registers for an address in the form X(%reg, %reg, Y)
26869 // - there would never be enough unreserved registers during regalloc
26870 // (without the need for a base ptr the only option would be X(%edi, %esi, Y)).
26871 // We give the register allocator a hand by precomputing the address in
26872 // a new vreg using LEA.
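// Illustrative sketch (registers hypothetical): a memory operand such as
//   16(%ebx,%esi,4)
// is precomputed as
//   leal 16(%ebx,%esi,4), %vreg
//   lock cmpxchg8b (%vreg)
// so regalloc no longer needs two free registers to form the address.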
26874 // If it is not i686 or there is no base pointer - nothing to do here.
26875 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
26876 return BB;
26878 // Even though this code does not necessarily need the base pointer to
26879 // be ESI, we check for that. The reason: if this assert fails, then
26880 // something has changed in the compiler's base pointer handling, which most
26881 // probably has to be addressed somehow here as well.
26882 assert(TRI->getBaseRegister() == X86::ESI &&
26883 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26884 "base pointer in mind");
26886 MachineRegisterInfo &MRI = MF->getRegInfo();
26887 MVT SPTy = getPointerTy(MF->getDataLayout());
26888 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26889 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26891 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26892 // Regalloc does not need any help when the memory operand of CMPXCHG8B
26893 // does not use an index register.
26894 if (AM.IndexReg == X86::NoRegister)
26895 return BB;
26897 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26898 // four operand definitions that are E[ABCD] registers. We skip them and
26899 // then insert the LEA.
26900 MachineBasicBlock::iterator MBBI(MI);
26901 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26902 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26903 ++MBBI;
26904 addFullAddress(
26905 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26907 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26909 return BB;
26910 }
26911 case X86::LCMPXCHG16B:
26912 return BB;
26913 case X86::LCMPXCHG8B_SAVE_EBX:
26914 case X86::LCMPXCHG16B_SAVE_RBX: {
26915 unsigned BasePtr =
26916 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26917 if (!BB->isLiveIn(BasePtr))
26918 BB->addLiveIn(BasePtr);
26924 //===----------------------------------------------------------------------===//
26925 // X86 Optimization Hooks
26926 //===----------------------------------------------------------------------===//
26928 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26930 const APInt &DemandedElts,
26931 const SelectionDAG &DAG,
26932 unsigned Depth) const {
26933 unsigned BitWidth = Known.getBitWidth();
26934 unsigned Opc = Op.getOpcode();
26935 EVT VT = Op.getValueType();
26936 assert((Opc >= ISD::BUILTIN_OP_END ||
26937 Opc == ISD::INTRINSIC_WO_CHAIN ||
26938 Opc == ISD::INTRINSIC_W_CHAIN ||
26939 Opc == ISD::INTRINSIC_VOID) &&
26940 "Should use MaskedValueIsZero if you don't know whether Op"
26941 " is a target node!");
26957 // These nodes' second result is a boolean.
26958 if (Op.getResNo() == 0)
26959 break;
26960 LLVM_FALLTHROUGH;
26961 case X86ISD::SETCC:
26962 Known.Zero.setBitsFrom(1);
26964 case X86ISD::MOVMSK: {
26965 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26966 Known.Zero.setBitsFrom(NumLoBits);
26969 case X86ISD::VSHLI:
26970 case X86ISD::VSRLI: {
26971 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
26972 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
26973 Known.setAllZero();
26977 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
26978 unsigned ShAmt = ShiftImm->getZExtValue();
26979 if (Opc == X86ISD::VSHLI) {
26980 Known.Zero <<= ShAmt;
26981 Known.One <<= ShAmt;
26982 // Low bits are known zero.
26983 Known.Zero.setLowBits(ShAmt);
26985 Known.Zero.lshrInPlace(ShAmt);
26986 Known.One.lshrInPlace(ShAmt);
26987 // High bits are known zero.
26988 Known.Zero.setHighBits(ShAmt);
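// Worked example (illustrative): for VSRLI v8i16, x >> 4, the top 4 bits
// of every element become known zero; for VSHLI, x << 4, the bottom 4 do,
// and bits already known in the source shift along with the value.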
26993 case X86ISD::VZEXT: {
26994 SDValue N0 = Op.getOperand(0);
26995 unsigned NumElts = VT.getVectorNumElements();
26997 EVT SrcVT = N0.getValueType();
26998 unsigned InNumElts = SrcVT.getVectorNumElements();
26999 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27000 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27002 Known = KnownBits(InBitWidth);
27003 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27004 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27005 Known = Known.zext(BitWidth);
27006 Known.Zero.setBitsFrom(InBitWidth);
27012 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27013 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27014 unsigned Depth) const {
27015 unsigned VTBits = Op.getScalarValueSizeInBits();
27016 unsigned Opcode = Op.getOpcode();
27017 switch (Opcode) {
27018 case X86ISD::SETCC_CARRY:
27019 // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
27020 return VTBits;
27022 case X86ISD::VSEXT: {
27023 SDValue Src = Op.getOperand(0);
27024 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27025 Tmp += VTBits - Src.getScalarValueSizeInBits();
27026 return Tmp;
27027 }
27029 case X86ISD::VSHLI: {
27030 SDValue Src = Op.getOperand(0);
27031 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27032 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27033 if (ShiftVal.uge(VTBits))
27034 return VTBits; // Shifted all bits out --> zero.
27035 if (ShiftVal.uge(Tmp))
27036 return 1; // Shifted all sign bits out --> unknown.
27037 return Tmp - ShiftVal.getZExtValue();
27040 case X86ISD::VSRAI: {
27041 SDValue Src = Op.getOperand(0);
27042 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27043 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27044 ShiftVal += Tmp;
27045 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27048 case X86ISD::PCMPGT:
27049 case X86ISD::PCMPEQ:
27050 case X86ISD::CMPP:
27051 case X86ISD::VPCOM:
27052 case X86ISD::VPCOMU:
27053 // Vector compares return zero/all-bits result values.
27054 return VTBits;
27055 }
27057 // Fallback, just return the default expected value.
27058 return TargetLowering::ComputeNumSignBitsForTargetNode(Op, DemandedElts,
27059 DAG, Depth);
27060 }
27061 /// Returns true (and the GlobalValue and the offset) if the node is a
27062 /// GlobalAddress + offset.
27063 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27064 const GlobalValue* &GA,
27065 int64_t &Offset) const {
27066 if (N->getOpcode() == X86ISD::Wrapper) {
27067 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27068 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27069 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27073 return TargetLowering::isGAPlusOffset(N, GA, Offset);
27076 // Attempt to match a combined shuffle mask against supported unary shuffle
27077 // instructions.
27078 // TODO: Investigate sharing more of this with shuffle lowering.
27079 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27080 bool AllowFloatDomain, bool AllowIntDomain,
27081 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27082 const X86Subtarget &Subtarget,
27083 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27084 unsigned NumMaskElts = Mask.size();
27085 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27087 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27088 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
27089 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27090 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27091 unsigned MaxScale = 64 / MaskEltSize;
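// For example (illustrative): on a v4i32 mask {0, Z, 1, Z}, where Z denotes
// a zeroable element, Scale == 2 matches below and yields a v2i64
// zero-extension of the low two i32 lanes.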
27092 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27093 bool Match = true;
27094 unsigned NumDstElts = NumMaskElts / Scale;
27095 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27096 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27097 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27098 }
27099 if (Match) {
27100 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27101 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
27102 if (SrcVT != MaskVT)
27103 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27104 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27105 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27106 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
27107 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
27108 return true;
27109 }
27110 }
27111 }
27113 // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
27114 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27115 isUndefOrEqual(Mask[0], 0) &&
27116 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27117 Shuffle = X86ISD::VZEXT_MOVL;
27118 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
27122 // Check if we have SSE3 which will let us use MOVDDUP etc. The
27123 // instructions are no slower than UNPCKLPD but have the option to
27124 // fold the input operand into even an unaligned memory load.
27125 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27126 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27127 Shuffle = X86ISD::MOVDDUP;
27128 SrcVT = DstVT = MVT::v2f64;
27131 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27132 Shuffle = X86ISD::MOVSLDUP;
27133 SrcVT = DstVT = MVT::v4f32;
27136 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27137 Shuffle = X86ISD::MOVSHDUP;
27138 SrcVT = DstVT = MVT::v4f32;
27143 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27144 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27145 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27146 Shuffle = X86ISD::MOVDDUP;
27147 SrcVT = DstVT = MVT::v4f64;
27150 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27151 Shuffle = X86ISD::MOVSLDUP;
27152 SrcVT = DstVT = MVT::v8f32;
27155 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27156 Shuffle = X86ISD::MOVSHDUP;
27157 SrcVT = DstVT = MVT::v8f32;
27162 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27163 assert(Subtarget.hasAVX512() &&
27164 "AVX512 required for 512-bit vector shuffles");
27165 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27166 Shuffle = X86ISD::MOVDDUP;
27167 SrcVT = DstVT = MVT::v8f64;
27170 if (isTargetShuffleEquivalent(
27171 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27172 Shuffle = X86ISD::MOVSLDUP;
27173 SrcVT = DstVT = MVT::v16f32;
27176 if (isTargetShuffleEquivalent(
27177 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27178 Shuffle = X86ISD::MOVSHDUP;
27179 SrcVT = DstVT = MVT::v16f32;
27184 // Attempt to match against broadcast-from-vector.
27185 if (Subtarget.hasAVX2()) {
27186 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27187 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27188 SrcVT = DstVT = MaskVT;
27189 Shuffle = X86ISD::VBROADCAST;
27197 // Attempt to match a combined shuffle mask against supported unary immediate
27198 // permute instructions.
27199 // TODO: Investigate sharing more of this with shuffle lowering.
27200 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27201 const APInt &Zeroable,
27202 bool AllowFloatDomain,
27203 bool AllowIntDomain,
27204 const X86Subtarget &Subtarget,
27205 unsigned &Shuffle, MVT &ShuffleVT,
27206 unsigned &PermuteImm) {
27207 unsigned NumMaskElts = Mask.size();
27208 unsigned InputSizeInBits = MaskVT.getSizeInBits();
27209 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27210 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27212 bool ContainsZeros =
27213 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27215 // Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
27216 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27217 // Check for lane crossing permutes.
27218 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27219 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27220 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27221 Shuffle = X86ISD::VPERMI;
27222 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27223 PermuteImm = getV4X86ShuffleImm(Mask);
27226 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27227 SmallVector<int, 4> RepeatedMask;
27228 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27229 Shuffle = X86ISD::VPERMI;
27230 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27231 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27235 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27236 // VPERMILPD can permute with a non-repeating shuffle.
27237 Shuffle = X86ISD::VPERMILPI;
27238 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27239 PermuteImm = 0;
27240 for (int i = 0, e = Mask.size(); i != e; ++i) {
27241 int M = Mask[i];
27242 if (M == SM_SentinelUndef)
27243 continue;
27244 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27245 PermuteImm |= (M & 1) << i;
27251 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
27252 // AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
27253 // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
27254 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27255 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27256 SmallVector<int, 4> RepeatedMask;
27257 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27258 // Narrow the repeated mask to create 32-bit element permutes.
27259 SmallVector<int, 4> WordMask = RepeatedMask;
27260 if (MaskScalarSizeInBits == 64)
27261 scaleShuffleMask(2, RepeatedMask, WordMask);
27263 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27264 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27265 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27266 PermuteImm = getV4X86ShuffleImm(WordMask);
27271 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27272 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27273 SmallVector<int, 4> RepeatedMask;
27274 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27275 ArrayRef<int> LoMask(Mask.data() + 0, 4);
27276 ArrayRef<int> HiMask(Mask.data() + 4, 4);
27278 // PSHUFLW: permute lower 4 elements only.
27279 if (isUndefOrInRange(LoMask, 0, 4) &&
27280 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27281 Shuffle = X86ISD::PSHUFLW;
27282 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27283 PermuteImm = getV4X86ShuffleImm(LoMask);
27287 // PSHUFHW: permute upper 4 elements only.
27288 if (isUndefOrInRange(HiMask, 4, 8) &&
27289 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27290 // Offset the HiMask so that we can create the shuffle immediate.
27291 int OffsetHiMask[4];
27292 for (int i = 0; i != 4; ++i)
27293 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27295 Shuffle = X86ISD::PSHUFHW;
27296 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27297 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
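// Worked example (illustrative): a v8i16 mask {0,1,2,3,7,6,5,4} keeps the
// low half in place; HiMask = {7,6,5,4} offsets to {3,2,1,0}, which encodes
// as the PSHUFHW immediate 0x1B.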
27303 // Attempt to match against byte/bit shifts.
27304 // FIXME: Add 512-bit support.
27305 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27306 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27307 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27308 MaskScalarSizeInBits, Mask,
27309 0, Zeroable, Subtarget);
27310 if (0 < ShiftAmt) {
27311 PermuteImm = (unsigned)ShiftAmt;
27319 // Attempt to match a combined unary shuffle mask against supported binary
27320 // shuffle instructions.
27321 // TODO: Investigate sharing more of this with shuffle lowering.
27322 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27323 bool AllowFloatDomain, bool AllowIntDomain,
27324 SDValue &V1, SDValue &V2, SDLoc &DL,
27325 SelectionDAG &DAG,
27326 const X86Subtarget &Subtarget,
27327 unsigned &Shuffle, MVT &ShuffleVT,
27328 bool &IsUnary) {
27329 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27331 if (MaskVT.is128BitVector()) {
27332 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27333 V2 = V1;
27334 Shuffle = X86ISD::MOVLHPS;
27335 ShuffleVT = MVT::v4f32;
27338 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27339 V2 = V1;
27340 Shuffle = X86ISD::MOVHLPS;
27341 ShuffleVT = MVT::v4f32;
27344 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27345 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27346 std::swap(V1, V2);
27347 Shuffle = X86ISD::MOVSD;
27348 ShuffleVT = MaskVT;
27351 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27352 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27353 Shuffle = X86ISD::MOVSS;
27354 ShuffleVT = MaskVT;
27359 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27360 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27361 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27362 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27363 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27364 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27365 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27366 DAG, Subtarget)) {
27367 ShuffleVT = MaskVT;
27368 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27369 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27377 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27378 const APInt &Zeroable,
27379 bool AllowFloatDomain,
27380 bool AllowIntDomain,
27381 SDValue &V1, SDValue &V2, SDLoc &DL,
27382 SelectionDAG &DAG,
27383 const X86Subtarget &Subtarget,
27384 unsigned &Shuffle, MVT &ShuffleVT,
27385 unsigned &PermuteImm) {
27386 unsigned NumMaskElts = Mask.size();
27387 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27389 // Attempt to match against PALIGNR byte rotate.
27390 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27391 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27392 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27393 if (0 < ByteRotation) {
27394 Shuffle = X86ISD::PALIGNR;
27395 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27396 PermuteImm = ByteRotation;
27401 // Attempt to combine to X86ISD::BLENDI.
27402 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27403 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27404 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27405 uint64_t BlendMask = 0;
27406 bool ForceV1Zero = false, ForceV2Zero = false;
27407 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27408 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27409 BlendMask)) {
27410 if (MaskVT == MVT::v16i16) {
27411 // We can only use v16i16 PBLENDW if the lanes are repeated.
27412 SmallVector<int, 8> RepeatedMask;
27413 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27414 RepeatedMask)) {
27415 assert(RepeatedMask.size() == 8 &&
27416 "Repeated mask size doesn't match!");
27417 PermuteImm = 0;
27418 for (int i = 0; i < 8; ++i)
27419 if (RepeatedMask[i] >= 8)
27420 PermuteImm |= 1 << i;
27421 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27422 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27423 Shuffle = X86ISD::BLENDI;
27424 ShuffleVT = MaskVT;
27428 // Determine a type compatible with X86ISD::BLENDI.
27429 ShuffleVT = MaskVT;
27430 if (Subtarget.hasAVX2()) {
27431 if (ShuffleVT == MVT::v4i64)
27432 ShuffleVT = MVT::v8i32;
27433 else if (ShuffleVT == MVT::v2i64)
27434 ShuffleVT = MVT::v4i32;
27435 } else {
27436 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27437 ShuffleVT = MVT::v8i16;
27438 else if (ShuffleVT == MVT::v4i64)
27439 ShuffleVT = MVT::v4f64;
27440 else if (ShuffleVT == MVT::v8i32)
27441 ShuffleVT = MVT::v8f32;
27444 if (!ShuffleVT.isFloatingPoint()) {
27445 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27446 BlendMask =
27447 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27448 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27449 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27452 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27453 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27454 PermuteImm = (unsigned)BlendMask;
27455 Shuffle = X86ISD::BLENDI;
27461 // Attempt to combine to INSERTPS.
27462 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27463 MaskVT.is128BitVector()) {
27464 if (Zeroable.getBoolValue() &&
27465 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27466 Shuffle = X86ISD::INSERTPS;
27467 ShuffleVT = MVT::v4f32;
27472 // Attempt to combine to SHUFPD.
27473 if (AllowFloatDomain && EltSizeInBits == 64 &&
27474 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27475 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27476 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27477 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27478 Shuffle = X86ISD::SHUFP;
27479 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27484 // Attempt to combine to SHUFPS.
27485 if (AllowFloatDomain && EltSizeInBits == 32 &&
27486 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27487 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27488 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27489 SmallVector<int, 4> RepeatedMask;
27490 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27491 // Match each half of the repeated mask, to determine whether it's just
27492 // referencing one of the vectors, is zeroable, or is entirely undef.
27493 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27494 int M0 = RepeatedMask[Offset];
27495 int M1 = RepeatedMask[Offset + 1];
27497 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27498 return DAG.getUNDEF(MaskVT);
27499 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27500 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27501 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27502 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27503 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27504 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27505 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27506 return V1;
27507 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27508 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27509 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27510 return V2;
27511 }
27513 return SDValue();
27514 };
27516 int ShufMask[4] = {-1, -1, -1, -1};
27517 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27518 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27520 if (Lo && Hi) {
27521 V1 = Lo;
27522 V2 = Hi;
27523 Shuffle = X86ISD::SHUFP;
27524 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27525 PermuteImm = getV4X86ShuffleImm(ShufMask);
27526 return true;
27527 }
27528 }
27529 }
27531 return false;
27532 }
27534 /// \brief Combine an arbitrary chain of shuffles into a single instruction if
27535 /// possible.
27537 /// This is the leaf of the recursive combine below. When we have found some
27538 /// chain of single-use x86 shuffle instructions and accumulated the combined
27539 /// shuffle mask represented by them, this will try to pattern match that mask
27540 /// into either a single instruction if there is a special purpose instruction
27541 /// for this operation, or into a PSHUFB instruction which is a fully general
27542 /// instruction but should only be used to replace chains over a certain depth.
27543 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27544 ArrayRef<int> BaseMask, int Depth,
27545 bool HasVariableMask, SelectionDAG &DAG,
27546 TargetLowering::DAGCombinerInfo &DCI,
27547 const X86Subtarget &Subtarget) {
27548 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27549 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27550 "Unexpected number of shuffle inputs!");
27552 // Find the inputs that enter the chain. Note that multiple uses are OK
27553 // here, we're not going to remove the operands we find.
27554 bool UnaryShuffle = (Inputs.size() == 1);
27555 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27556 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27557 : peekThroughBitcasts(Inputs[1]));
27559 MVT VT1 = V1.getSimpleValueType();
27560 MVT VT2 = V2.getSimpleValueType();
27561 MVT RootVT = Root.getSimpleValueType();
27562 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27563 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27564 "Vector size mismatch");
27566 SDLoc DL(Root);
27567 SDValue Res;
27569 unsigned NumBaseMaskElts = BaseMask.size();
27570 if (NumBaseMaskElts == 1) {
27571 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27572 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27573 /*AddTo*/ true);
27574 return true;
27575 }
27577 unsigned RootSizeInBits = RootVT.getSizeInBits();
27578 unsigned NumRootElts = RootVT.getVectorNumElements();
27579 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27580 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27581 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27583 // Don't combine if we are an AVX512/EVEX target and the mask element size
27584 // is different from the root element size - this would prevent writemasks
27585 // from being reused.
27586 // TODO - this currently prevents all lane shuffles from occurring.
27587 // TODO - check for writemasks usage instead of always preventing combining.
27588 // TODO - attempt to narrow Mask back to writemask size.
27589 bool IsEVEXShuffle =
27590 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27591 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27592 return false;
27594 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27596 // Handle 128-bit lane shuffles of 256-bit vectors.
27597 // TODO - this should support binary shuffles.
27598 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27599 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27600 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27601 return false; // Nothing to do!
27602 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27603 unsigned PermMask = 0;
27604 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27605 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
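// For example, BaseMask = {1,0} (swap the two 128-bit halves) encodes as
// PermMask = 0x01: bits[1:0] select the source half for the low lane,
// bits[5:4] select it for the high lane, and 0x8 in a nibble zeroes that lane.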
27607 Res = DAG.getBitcast(ShuffleVT, V1);
27608 DCI.AddToWorklist(Res.getNode());
27609 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27610 DAG.getUNDEF(ShuffleVT),
27611 DAG.getConstant(PermMask, DL, MVT::i8));
27612 DCI.AddToWorklist(Res.getNode());
27613 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27614 /*AddTo*/ true);
27615 return true;
27616 }
27618 // For masks that have been widened to 128-bit elements or more,
27619 // narrow back down to 64-bit elements.
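// For example, a widened mask of {1,0} over 128-bit elements becomes
// {2,3,0,1} over 64-bit elements after scaling by MaskScale = 2.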
27620 SmallVector<int, 64> Mask;
27621 if (BaseMaskEltSizeInBits > 64) {
27622 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27623 int MaskScale = BaseMaskEltSizeInBits / 64;
27624 scaleShuffleMask(MaskScale, BaseMask, Mask);
27625 } else {
27626 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27627 }
27629 unsigned NumMaskElts = Mask.size();
27630 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27632 // Determine the effective mask value type.
27633 FloatDomain &= (32 <= MaskEltSizeInBits);
27634 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27635 : MVT::getIntegerVT(MaskEltSizeInBits);
27636 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27638 // Only allow legal mask types.
27639 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27640 return false;
27642 // Attempt to match the mask against known shuffle patterns.
27643 MVT ShuffleSrcVT, ShuffleVT;
27644 unsigned Shuffle, PermuteImm;
27646 // Which shuffle domains are permitted?
27647 // Permit domain crossing at higher combine depths.
27648 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27649 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27650 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27652 // Determine zeroable mask elements.
27653 APInt Zeroable(NumMaskElts, 0);
27654 for (unsigned i = 0; i != NumMaskElts; ++i)
27655 if (isUndefOrZero(Mask[i]))
27656 Zeroable.setBit(i);
27658 if (UnaryShuffle) {
27659 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27660 // directly if we don't shuffle the lower element and we shuffle the upper
27661 // (zero) elements within themselves.
27662 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27663 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27664 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27665 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27666 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27667 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27668 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27669 /*AddTo*/ true);
27670 return true;
27671 }
27672 }
27674 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27675 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27676 ShuffleVT)) {
27677 if (Depth == 1 && Root.getOpcode() == Shuffle)
27678 return false; // Nothing to do!
27679 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27680 return false; // AVX512 Writemask clash.
27681 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27682 DCI.AddToWorklist(Res.getNode());
27683 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27684 DCI.AddToWorklist(Res.getNode());
27685 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27686 /*AddTo*/ true);
27687 return true;
27688 }
27690 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27691 AllowIntDomain, Subtarget, Shuffle,
27692 ShuffleVT, PermuteImm)) {
27693 if (Depth == 1 && Root.getOpcode() == Shuffle)
27694 return false; // Nothing to do!
27695 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27696 return false; // AVX512 Writemask clash.
27697 Res = DAG.getBitcast(ShuffleVT, V1);
27698 DCI.AddToWorklist(Res.getNode());
27699 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27700 DAG.getConstant(PermuteImm, DL, MVT::i8));
27701 DCI.AddToWorklist(Res.getNode());
27702 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27703 /*AddTo*/ true);
27704 return true;
27705 }
27706 }
27708 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27709 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27710 UnaryShuffle)) {
27711 if (Depth == 1 && Root.getOpcode() == Shuffle)
27712 return false; // Nothing to do!
27713 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27714 return false; // AVX512 Writemask clash.
27715 V1 = DAG.getBitcast(ShuffleVT, V1);
27716 DCI.AddToWorklist(V1.getNode());
27717 V2 = DAG.getBitcast(ShuffleVT, V2);
27718 DCI.AddToWorklist(V2.getNode());
27719 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27720 DCI.AddToWorklist(Res.getNode());
27721 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27722 /*AddTo*/ true);
27723 return true;
27724 }
27726 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27727 AllowIntDomain, V1, V2, DL, DAG,
27728 Subtarget, Shuffle, ShuffleVT,
27729 PermuteImm)) {
27730 if (Depth == 1 && Root.getOpcode() == Shuffle)
27731 return false; // Nothing to do!
27732 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27733 return false; // AVX512 Writemask clash.
27734 V1 = DAG.getBitcast(ShuffleVT, V1);
27735 DCI.AddToWorklist(V1.getNode());
27736 V2 = DAG.getBitcast(ShuffleVT, V2);
27737 DCI.AddToWorklist(V2.getNode());
27738 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27739 DAG.getConstant(PermuteImm, DL, MVT::i8));
27740 DCI.AddToWorklist(Res.getNode());
27741 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27742 /*AddTo*/ true);
27743 return true;
27744 }
27746 // Typically from here on, we need an integer version of MaskVT.
27747 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27748 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27750 // Annoyingly, SSE4A instructions don't map into the above match helpers.
27751 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27752 uint64_t BitLen, BitIdx;
27753 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27754 Zeroable)) {
27755 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27756 return false; // Nothing to do!
27757 V1 = DAG.getBitcast(IntMaskVT, V1);
27758 DCI.AddToWorklist(V1.getNode());
27759 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27760 DAG.getConstant(BitLen, DL, MVT::i8),
27761 DAG.getConstant(BitIdx, DL, MVT::i8));
27762 DCI.AddToWorklist(Res.getNode());
27763 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27764 /*AddTo*/ true);
27765 return true;
27766 }
27768 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27769 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27770 return false; // Nothing to do!
27771 V1 = DAG.getBitcast(IntMaskVT, V1);
27772 DCI.AddToWorklist(V1.getNode());
27773 V2 = DAG.getBitcast(IntMaskVT, V2);
27774 DCI.AddToWorklist(V2.getNode());
27775 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27776 DAG.getConstant(BitLen, DL, MVT::i8),
27777 DAG.getConstant(BitIdx, DL, MVT::i8));
27778 DCI.AddToWorklist(Res.getNode());
27779 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27780 /*AddTo*/ true);
27781 return true;
27782 }
27783 }
27785 // Don't try to re-form single instruction chains under any circumstances now
27786 // that we've done encoding canonicalization for them.
27787 if (Depth < 2)
27788 return false;
27790 bool MaskContainsZeros =
27791 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27793 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27794 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27795 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27796 ((Subtarget.hasAVX2() &&
27797 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27798 (Subtarget.hasAVX512() &&
27799 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27800 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27801 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27802 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27803 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27804 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27805 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27806 DCI.AddToWorklist(VPermMask.getNode());
27807 Res = DAG.getBitcast(MaskVT, V1);
27808 DCI.AddToWorklist(Res.getNode());
27809 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27810 DCI.AddToWorklist(Res.getNode());
27811 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27812 /*AddTo*/ true);
27813 return true;
27814 }
27816 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27817 // vector as the second source.
27818 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27819 ((Subtarget.hasAVX512() &&
27820 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27821 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27822 (Subtarget.hasVLX() &&
27823 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27824 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27825 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27826 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27827 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27828 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27829 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27830 for (unsigned i = 0; i != NumMaskElts; ++i)
27831 if (Mask[i] == SM_SentinelZero)
27832 Mask[i] = NumMaskElts + i;
27834 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27835 DCI.AddToWorklist(VPermMask.getNode());
27836 Res = DAG.getBitcast(MaskVT, V1);
27837 DCI.AddToWorklist(Res.getNode());
27838 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27839 DCI.AddToWorklist(Zero.getNode());
27840 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27841 DCI.AddToWorklist(Res.getNode());
27842 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27843 /*AddTo*/ true);
27844 return true;
27845 }
27847 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27848 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27849 ((Subtarget.hasAVX512() &&
27850 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27851 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27852 (Subtarget.hasVLX() &&
27853 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27854 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27855 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27856 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27857 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27858 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27859 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27860 DCI.AddToWorklist(VPermMask.getNode());
27861 V1 = DAG.getBitcast(MaskVT, V1);
27862 DCI.AddToWorklist(V1.getNode());
27863 V2 = DAG.getBitcast(MaskVT, V2);
27864 DCI.AddToWorklist(V2.getNode());
27865 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27866 DCI.AddToWorklist(Res.getNode());
27867 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27868 /*AddTo*/ true);
27869 return true;
27870 }
27871 return false;
27872 }
27874 // See if we can combine a single input shuffle with zeros to a bit-mask,
27875 // which is much simpler than any shuffle.
27876 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27877 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27878 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27879 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27880 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27881 APInt UndefElts(NumMaskElts, 0);
27882 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27883 for (unsigned i = 0; i != NumMaskElts; ++i) {
27884 int M = Mask[i];
27885 if (M == SM_SentinelUndef) {
27886 UndefElts.setBit(i);
27887 continue;
27888 }
27889 if (M == SM_SentinelZero)
27890 continue;
27891 EltBits[i] = AllOnes;
27892 }
27893 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27894 DCI.AddToWorklist(BitMask.getNode());
27895 Res = DAG.getBitcast(MaskVT, V1);
27896 DCI.AddToWorklist(Res.getNode());
27897 unsigned AndOpcode =
27898 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27899 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27900 DCI.AddToWorklist(Res.getNode());
27901 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27902 /*AddTo*/ true);
27903 return true;
27904 }
27906 // If we have a single input shuffle with different shuffle patterns in the
27907 // 128-bit lanes, use the variable mask to VPERMILPS.
27908 // TODO Combine other mask types at higher depths.
27909 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27910 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27911 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27912 SmallVector<SDValue, 16> VPermIdx;
27913 for (int M : Mask) {
27914 SDValue Idx =
27915 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27916 VPermIdx.push_back(Idx);
27917 }
27918 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27919 DCI.AddToWorklist(VPermMask.getNode());
27920 Res = DAG.getBitcast(MaskVT, V1);
27921 DCI.AddToWorklist(Res.getNode());
27922 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27923 DCI.AddToWorklist(Res.getNode());
27924 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27925 /*AddTo*/ true);
27926 return true;
27927 }
27929 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27930 // to VPERMIL2PD/VPERMIL2PS.
27931 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27932 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27933 MaskVT == MVT::v8f32)) {
27934 // VPERMIL2 Operation.
27935 // Bits[3] - Match Bit.
27936 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27937 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
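// For example, for v4f32 a mask of {0,5,1,4} produces VPerm2Idx = {0,5,1,4}:
// selectors 0-3 read from V1's lane, 4-7 read from V2's lane, and 8 (together
// with the match bit tracked in M2ZImm) zeroes the element. For 64-bit
// elements each selector is shifted left by one.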
27938 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27939 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27940 SmallVector<int, 8> VPerm2Idx;
27941 unsigned M2ZImm = 0;
27942 for (int M : Mask) {
27943 if (M == SM_SentinelUndef) {
27944 VPerm2Idx.push_back(-1);
27945 continue;
27946 }
27947 if (M == SM_SentinelZero) {
27948 M2ZImm = 2;
27949 VPerm2Idx.push_back(8);
27950 continue;
27951 }
27952 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27953 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27954 VPerm2Idx.push_back(Index);
27955 }
27956 V1 = DAG.getBitcast(MaskVT, V1);
27957 DCI.AddToWorklist(V1.getNode());
27958 V2 = DAG.getBitcast(MaskVT, V2);
27959 DCI.AddToWorklist(V2.getNode());
27960 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27961 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27962 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27963 DAG.getConstant(M2ZImm, DL, MVT::i8));
27964 DCI.AddToWorklist(Res.getNode());
27965 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27966 /*AddTo*/ true);
27967 return true;
27968 }
27970 // If we have 3 or more shuffle instructions or a chain involving a variable
27971 // mask, we can replace them with a single PSHUFB instruction profitably.
27972 // Intel's manuals suggest only using PSHUFB if doing so replaces 5
27973 // instructions, but in practice PSHUFB tends to be *very* fast so we're
27974 // more aggressive.
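// For example, an accumulated v4i32 mask of {3,1,2,0} expands (Ratio = 4)
// to the byte mask {12,13,14,15,4,5,6,7,8,9,10,11,0,1,2,3}, with 255
// marking bytes that PSHUFB should zero.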
27975 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27976 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27977 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
27978 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
27979 SmallVector<SDValue, 16> PSHUFBMask;
27980 int NumBytes = RootVT.getSizeInBits() / 8;
27981 int Ratio = NumBytes / NumMaskElts;
27982 for (int i = 0; i < NumBytes; ++i) {
27983 int M = Mask[i / Ratio];
27984 if (M == SM_SentinelUndef) {
27985 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
27986 continue;
27987 }
27988 if (M == SM_SentinelZero) {
27989 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
27990 continue;
27991 }
27992 M = Ratio * M + i % Ratio;
27993 assert((M / 16) == (i / 16) && "Lane crossing detected");
27994 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
27995 }
27996 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
27997 Res = DAG.getBitcast(ByteVT, V1);
27998 DCI.AddToWorklist(Res.getNode());
27999 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28000 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28001 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28002 DCI.AddToWorklist(Res.getNode());
28003 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28004 /*AddTo*/ true);
28005 return true;
28006 }
28008 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28009 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28010 // slower than PSHUFB on targets that support both.
28011 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
28012 Subtarget.hasXOP()) {
28013 // VPPERM Mask Operation
28014 // Bits[4:0] - Byte Index (0 - 31)
28015 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
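// For example, a selector byte of 20 (0b00010100) reads byte 4 of the second
// source, while a selector byte of 128 sets the permute operation to ZERO
// and clears the destination byte.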
28016 SmallVector<SDValue, 16> VPPERMMask;
28017 int NumBytes = 16;
28018 int Ratio = NumBytes / NumMaskElts;
28019 for (int i = 0; i < NumBytes; ++i) {
28020 int M = Mask[i / Ratio];
28021 if (M == SM_SentinelUndef) {
28022 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28023 continue;
28024 }
28025 if (M == SM_SentinelZero) {
28026 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28027 continue;
28028 }
28029 M = Ratio * M + i % Ratio;
28030 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28031 }
28032 MVT ByteVT = MVT::v16i8;
28033 V1 = DAG.getBitcast(ByteVT, V1);
28034 DCI.AddToWorklist(V1.getNode());
28035 V2 = DAG.getBitcast(ByteVT, V2);
28036 DCI.AddToWorklist(V2.getNode());
28037 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28038 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28039 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28040 DCI.AddToWorklist(Res.getNode());
28041 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28042 /*AddTo*/ true);
28043 return true;
28044 }
28046 // Failed to find any combines.
28047 return false;
28048 }
28050 // Attempt to constant fold all of the constant source ops.
28051 // Returns true if the entire shuffle is folded to a constant.
28052 // TODO: Extend this to merge multiple constant Ops and update the mask.
28053 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28054 ArrayRef<int> Mask, SDValue Root,
28055 bool HasVariableMask, SelectionDAG &DAG,
28056 TargetLowering::DAGCombinerInfo &DCI,
28057 const X86Subtarget &Subtarget) {
28058 MVT VT = Root.getSimpleValueType();
28060 unsigned SizeInBits = VT.getSizeInBits();
28061 unsigned NumMaskElts = Mask.size();
28062 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28063 unsigned NumOps = Ops.size();
28065 // Extract constant bits from each source op.
28066 bool OneUseConstantOp = false;
28067 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28068 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28069 for (unsigned i = 0; i != NumOps; ++i) {
28070 SDValue SrcOp = Ops[i];
28071 OneUseConstantOp |= SrcOp.hasOneUse();
28072 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28073 RawBitsOps[i]))
28074 return false;
28075 }
28077 // Only fold if at least one of the constants is only used once or
28078 // the combined shuffle has included a variable mask shuffle, this
28079 // is to avoid constant pool bloat.
28080 if (!OneUseConstantOp && !HasVariableMask)
28081 return false;
28083 // Shuffle the constant bits according to the mask.
28084 APInt UndefElts(NumMaskElts, 0);
28085 APInt ZeroElts(NumMaskElts, 0);
28086 APInt ConstantElts(NumMaskElts, 0);
28087 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28088 APInt::getNullValue(MaskSizeInBits));
28089 for (unsigned i = 0; i != NumMaskElts; ++i) {
28090 int M = Mask[i];
28091 if (M == SM_SentinelUndef) {
28092 UndefElts.setBit(i);
28093 continue;
28094 } else if (M == SM_SentinelZero) {
28095 ZeroElts.setBit(i);
28096 continue;
28097 }
28098 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28100 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28101 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28103 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28104 if (SrcUndefElts[SrcMaskIdx]) {
28105 UndefElts.setBit(i);
28106 continue;
28107 }
28109 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28110 APInt &Bits = SrcEltBits[SrcMaskIdx];
28111 if (!Bits) {
28112 ZeroElts.setBit(i);
28113 continue;
28114 }
28116 ConstantElts.setBit(i);
28117 ConstantBitData[i] = Bits;
28118 }
28119 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28121 // Create the constant data.
28122 MVT MaskSVT;
28123 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28124 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28125 else
28126 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28128 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28130 SDLoc DL(Root);
28131 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28132 DCI.AddToWorklist(CstOp.getNode());
28133 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
28134 return true;
28135 }
28137 /// \brief Fully generic combining of x86 shuffle instructions.
28139 /// This should be the last combine run over the x86 shuffle instructions. Once
28140 /// they have been fully optimized, this will recursively consider all chains
28141 /// of single-use shuffle instructions, build a generic model of the cumulative
28142 /// shuffle operation, and check for simpler instructions which implement this
28143 /// operation. We use this primarily for two purposes:
28145 /// 1) Collapse generic shuffles to specialized single instructions when
28146 /// equivalent. In most cases, this is just an encoding size win, but
28147 /// sometimes we will collapse multiple generic shuffles into a single
28148 /// special-purpose shuffle.
28149 /// 2) Look for sequences of shuffle instructions with 3 or more total
28150 /// instructions, and replace them with the slightly more expensive SSSE3
28151 /// PSHUFB instruction if available. We do this as the last combining step
28152 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
28153 /// a suitable short sequence of other instructions. The PSHUFB will either
28154 /// use a register or have to read from memory and so is slightly (but only
28155 /// slightly) more expensive than the other shuffle instructions.
28157 /// Because this is inherently a quadratic operation (for each shuffle in
28158 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28159 /// This should never be an issue in practice as the shuffle lowering doesn't
28160 /// produce sequences of more than 8 instructions.
28162 /// FIXME: We will currently miss some cases where the redundant shuffling
28163 /// would simplify under the threshold for PSHUFB formation because of
28164 /// combine-ordering. To fix this, we should do the redundant instruction
28165 /// combining in this recursive walk.
28166 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28167 int SrcOpIndex, SDValue Root,
28168 ArrayRef<int> RootMask,
28169 ArrayRef<const SDNode*> SrcNodes,
28170 int Depth, bool HasVariableMask,
28171 SelectionDAG &DAG,
28172 TargetLowering::DAGCombinerInfo &DCI,
28173 const X86Subtarget &Subtarget) {
28174 // Bound the depth of our recursive combine because this is ultimately
28175 // quadratic in nature.
28176 if (Depth > 8)
28177 return false;
28179 // Directly rip through bitcasts to find the underlying operand.
28180 SDValue Op = SrcOps[SrcOpIndex];
28181 Op = peekThroughOneUseBitcasts(Op);
28183 MVT VT = Op.getSimpleValueType();
28184 if (!VT.isVector())
28185 return false; // Bail if we hit a non-vector.
28187 assert(Root.getSimpleValueType().isVector() &&
28188 "Shuffles operate on vector types!");
28189 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28190 "Can only combine shuffles of the same vector register size.");
28192 // Extract target shuffle mask and resolve sentinels and inputs.
28193 SmallVector<int, 64> OpMask;
28194 SmallVector<SDValue, 2> OpInputs;
28195 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28196 return false;
28198 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28199 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28200 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28202 // Add the inputs to the Ops list, avoiding duplicates.
28203 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28205 int InputIdx0 = -1, InputIdx1 = -1;
28206 for (int i = 0, e = Ops.size(); i < e; ++i) {
28207 SDValue BC = peekThroughBitcasts(Ops[i]);
28208 if (Input0 && BC == peekThroughBitcasts(Input0))
28209 InputIdx0 = i;
28210 if (Input1 && BC == peekThroughBitcasts(Input1))
28211 InputIdx1 = i;
28212 }
28214 if (Input0 && InputIdx0 < 0) {
28215 InputIdx0 = SrcOpIndex;
28216 Ops[SrcOpIndex] = Input0;
28217 }
28218 if (Input1 && InputIdx1 < 0) {
28219 InputIdx1 = Ops.size();
28220 Ops.push_back(Input1);
28221 }
28223 assert(((RootMask.size() > OpMask.size() &&
28224 RootMask.size() % OpMask.size() == 0) ||
28225 (OpMask.size() > RootMask.size() &&
28226 OpMask.size() % RootMask.size() == 0) ||
28227 OpMask.size() == RootMask.size()) &&
28228 "The smaller number of elements must divide the larger.");
28230 // This function can be performance-critical, so we rely on the power-of-2
28231 // knowledge that we have about the mask sizes to replace div/rem ops with
28232 // bit-masks and shifts.
28233 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28234 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28235 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28236 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28238 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28239 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28240 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28241 assert((RootRatio == 1 || OpRatio == 1) &&
28242 "Must not have a ratio for both incoming and op masks!");
28244 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28245 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28246 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28247 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28248 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28250 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28252 // Merge this shuffle operation's mask into our accumulated mask. Note that
28253 // this shuffle's mask will be the first applied to the input, followed by the
28254 // root mask to get us all the way to the root value arrangement. The reason
28255 // for this order is that we are recursing up the operation chain.
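// Worked example: RootMask = {1,0} (as v2i64) on top of OpMask = {2,3,0,1}
// (as v4i32) gives MaskWidth = 4 and RootRatio = 2. The scaled root indices
// are {2,3,0,1}; mapping them through OpMask yields the identity mask
// {0,1,2,3}, so the two lane swaps cancel out.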
28256 for (unsigned i = 0; i < MaskWidth; ++i) {
28257 unsigned RootIdx = i >> RootRatioLog2;
28258 if (RootMask[RootIdx] < 0) {
28259 // This is a zero or undef lane, we're done.
28260 Mask[i] = RootMask[RootIdx];
28261 continue;
28262 }
28264 unsigned RootMaskedIdx =
28265 RootRatio == 1
28266 ? RootMask[RootIdx]
28267 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28269 // Just insert the scaled root mask value if it references an input other
28270 // than the SrcOp we're currently inserting.
28271 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28272 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28273 Mask[i] = RootMaskedIdx;
28274 continue;
28275 }
28277 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28278 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28279 if (OpMask[OpIdx] < 0) {
28280 // The incoming lanes are zero or undef, it doesn't matter which ones we
28281 // are actually using.
28282 Mask[i] = OpMask[OpIdx];
28283 continue;
28284 }
28286 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28287 unsigned OpMaskedIdx =
28288 OpRatio == 1
28289 ? OpMask[OpIdx]
28290 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
28292 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28293 if (OpMask[OpIdx] < (int)OpMask.size()) {
28294 assert(0 <= InputIdx0 && "Unknown target shuffle input");
28295 OpMaskedIdx += InputIdx0 * MaskWidth;
28296 } else {
28297 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28298 OpMaskedIdx += InputIdx1 * MaskWidth;
28299 }
28301 Mask[i] = OpMaskedIdx;
28302 }
28304 // Handle the all undef/zero cases early.
28305 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28306 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28307 return true;
28308 }
28309 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28310 // TODO - should we handle the mixed zero/undef case as well? Just returning
28311 // a zero mask will lose information on undef elements possibly reducing
28312 // future combine possibilities.
28313 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28314 Subtarget, DAG, SDLoc(Root)));
28315 return true;
28316 }
28318 // Remove unused shuffle source ops.
28319 resolveTargetShuffleInputsAndMask(Ops, Mask);
28320 assert(!Ops.empty() && "Shuffle with no inputs detected");
28322 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28324 // Update the list of shuffle nodes that have been combined so far.
28325 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28326 SrcNodes.end());
28327 CombinedNodes.push_back(Op.getNode());
28329 // See if we can recurse into each shuffle source op (if it's a target
28330 // shuffle). The source op should only be combined if it either has a
28331 // single use (i.e. current Op) or all its users have already been combined.
28332 for (int i = 0, e = Ops.size(); i < e; ++i)
28333 if (Ops[i].getNode()->hasOneUse() ||
28334 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28335 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28336 Depth + 1, HasVariableMask, DAG, DCI,
28337 Subtarget))
28338 return true;
28340 // Attempt to constant fold all of the constant source ops.
28341 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28343 Subtarget))
28344 return true;
28345 // We can only combine unary and binary shuffle mask cases.
28346 if (Ops.size() > 2)
28347 return false;
28349 // Minor canonicalization of the accumulated shuffle mask to make it easier
28350 // to match below. All this does is detect masks with sequential pairs of
28351 // elements, and shrink them to the half-width mask. It does this in a loop
28352 // so it will reduce the size of the mask to the minimal width mask which
28353 // performs an equivalent shuffle.
28354 SmallVector<int, 64> WidenedMask;
28355 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28356 Mask = std::move(WidenedMask);
28357 }
28359 // Canonicalization of binary shuffle masks to improve pattern matching by
28360 // commuting the inputs.
28361 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28362 ShuffleVectorSDNode::commuteMask(Mask);
28363 std::swap(Ops[0], Ops[1]);
28364 }
28366 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28367 DCI, Subtarget);
28368 }
28370 /// \brief Get the PSHUF-style mask from PSHUF node.
28372 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28373 /// PSHUF-style masks that can be reused with such instructions.
28374 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28375 MVT VT = N.getSimpleValueType();
28376 SmallVector<int, 4> Mask;
28377 SmallVector<SDValue, 2> Ops;
28378 bool IsUnary;
28379 bool HaveMask =
28380 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28381 (void)HaveMask;
28382 assert(HaveMask);
28384 // If we have more than 128-bits, only the low 128-bits of shuffle mask
28385 // matter. Check that the upper masks are repeats and remove them.
28386 if (VT.getSizeInBits() > 128) {
28387 int LaneElts = 128 / VT.getScalarSizeInBits();
28388 #ifndef NDEBUG
28389 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28390 for (int j = 0; j < LaneElts; ++j)
28391 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28392 "Mask doesn't repeat in high 128-bit lanes!");
28393 #endif
28394 Mask.resize(LaneElts);
28395 }
28397 switch (N.getOpcode()) {
28398 case X86ISD::PSHUFD:
28399 return Mask;
28400 case X86ISD::PSHUFLW:
28401 Mask.resize(4);
28402 return Mask;
28403 case X86ISD::PSHUFHW:
28404 Mask.erase(Mask.begin(), Mask.begin() + 4);
28405 for (int &M : Mask)
28406 M -= 4;
28407 return Mask;
28408 default:
28409 llvm_unreachable("No valid shuffle instruction found!");
28410 }
28411 }
28413 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28415 /// We walk up the chain and look for a combinable shuffle, skipping over
28416 /// shuffles that we could hoist this shuffle's transformation past without
28417 /// altering anything.
28418 static SDValue
28419 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28420 SelectionDAG &DAG) {
28421 assert(N.getOpcode() == X86ISD::PSHUFD &&
28422 "Called with something other than an x86 128-bit half shuffle!");
28423 SDLoc DL(N);
28425 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28426 // of the shuffles in the chain so that we can form a fresh chain to replace
28427 // this chain.
28428 SmallVector<SDValue, 8> Chain;
28429 SDValue V = N.getOperand(0);
28430 for (; V.hasOneUse(); V = V.getOperand(0)) {
28431 switch (V.getOpcode()) {
28432 default:
28433 return SDValue(); // Nothing combined!
28435 case ISD::BITCAST:
28436 // Skip bitcasts as we always know the type for the target specific
28437 // instructions.
28438 continue;
28440 case X86ISD::PSHUFD:
28441 // Found another dword shuffle.
28442 break;
28444 case X86ISD::PSHUFLW:
28445 // Check that the low words (being shuffled) are the identity in the
28446 // dword shuffle, and the high words are self-contained.
28447 if (Mask[0] != 0 || Mask[1] != 1 ||
28448 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28449 return SDValue();
28451 Chain.push_back(V);
28452 continue;
28454 case X86ISD::PSHUFHW:
28455 // Check that the high words (being shuffled) are the identity in the
28456 // dword shuffle, and the low words are self-contained.
28457 if (Mask[2] != 2 || Mask[3] != 3 ||
28458 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28459 return SDValue();
28461 Chain.push_back(V);
28462 continue;
28464 case X86ISD::UNPCKL:
28465 case X86ISD::UNPCKH:
28466 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28467 // shuffle into a preceding word shuffle.
28468 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28469 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28470 return SDValue();
28472 // Search for a half-shuffle which we can combine with.
28473 unsigned CombineOp =
28474 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28475 if (V.getOperand(0) != V.getOperand(1) ||
28476 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28477 return SDValue();
28478 Chain.push_back(V);
28479 V = V.getOperand(0);
28480 do {
28481 switch (V.getOpcode()) {
28482 default:
28483 return SDValue(); // Nothing to combine.
28485 case X86ISD::PSHUFLW:
28486 case X86ISD::PSHUFHW:
28487 if (V.getOpcode() == CombineOp)
28488 break;
28490 Chain.push_back(V);
28492 LLVM_FALLTHROUGH;
28493 case ISD::BITCAST:
28494 V = V.getOperand(0);
28495 continue;
28496 }
28497 break;
28498 } while (V.hasOneUse());
28499 break;
28500 }
28501 // Break out of the loop if we break out of the switch.
28502 break;
28503 }
28505 if (!V.hasOneUse())
28506 // We fell out of the loop without finding a viable combining instruction.
28507 return SDValue();
28509 // Merge this node's mask and our incoming mask.
28510 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28511 for (int &M : Mask)
28512 M = VMask[M];
28513 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28514 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28516 // Rebuild the chain around this new shuffle.
28517 while (!Chain.empty()) {
28518 SDValue W = Chain.pop_back_val();
28520 if (V.getValueType() != W.getOperand(0).getValueType())
28521 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28523 switch (W.getOpcode()) {
28524 default:
28525 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28527 case X86ISD::UNPCKL:
28528 case X86ISD::UNPCKH:
28529 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28530 break;
28532 case X86ISD::PSHUFD:
28533 case X86ISD::PSHUFLW:
28534 case X86ISD::PSHUFHW:
28535 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28536 break;
28537 }
28538 }
28539 if (V.getValueType() != N.getValueType())
28540 V = DAG.getBitcast(N.getValueType(), V);
28542 // Return the new chain to replace N.
28543 return V;
28544 }
28546 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or
28547 /// pshufhw.
28549 /// We walk up the chain, skipping shuffles of the other half and looking
28550 /// through shuffles which switch halves trying to find a shuffle of the same
28551 /// pair of dwords.
28552 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28553 SelectionDAG &DAG,
28554 TargetLowering::DAGCombinerInfo &DCI) {
28555 assert(
28556 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28557 "Called with something other than an x86 128-bit half shuffle!");
28558 SDLoc DL(N);
28559 unsigned CombineOpcode = N.getOpcode();
28561 // Walk up a single-use chain looking for a combinable shuffle.
28562 SDValue V = N.getOperand(0);
28563 for (; V.hasOneUse(); V = V.getOperand(0)) {
28564 switch (V.getOpcode()) {
28565 default:
28566 return false; // Nothing combined!
28568 case ISD::BITCAST:
28569 // Skip bitcasts as we always know the type for the target specific
28570 // instructions.
28571 continue;
28573 case X86ISD::PSHUFLW:
28574 case X86ISD::PSHUFHW:
28575 if (V.getOpcode() == CombineOpcode)
28576 break;
28578 // Other-half shuffles are no-ops.
28579 continue;
28580 }
28581 // Break out of the loop if we break out of the switch.
28582 break;
28583 }
28585 if (!V.hasOneUse())
28586 // We fell out of the loop without finding a viable combining instruction.
28587 return false;
28589 // Combine away the bottom node as its shuffle will be accumulated into
28590 // a preceding shuffle.
28591 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28593 // Record the old value.
28594 SDValue Old = V;
28596 // Merge this node's mask and our incoming mask (adjusted to account for all
28597 // the pshufd instructions encountered).
28598 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28599 for (int &M : Mask)
28600 M = VMask[M];
28601 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28602 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28604 // Check that the shuffles didn't cancel each other out. If not, we need to
28605 // combine to the new one.
28607 // Replace the combinable shuffle with the combined one, updating all users
28608 // so that we re-evaluate the chain here.
28609 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28611 return true;
28612 }
28614 /// \brief Try to combine x86 target specific shuffles.
28615 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28616 TargetLowering::DAGCombinerInfo &DCI,
28617 const X86Subtarget &Subtarget) {
28618 SDLoc DL(N);
28619 MVT VT = N.getSimpleValueType();
28620 SmallVector<int, 4> Mask;
28622 unsigned Opcode = N.getOpcode();
28623 switch (Opcode) {
28624 case X86ISD::PSHUFD:
28625 case X86ISD::PSHUFLW:
28626 case X86ISD::PSHUFHW:
28627 Mask = getPSHUFShuffleMask(N);
28628 assert(Mask.size() == 4);
28629 break;
28630 case X86ISD::UNPCKL: {
28631 auto Op0 = N.getOperand(0);
28632 auto Op1 = N.getOperand(1);
28633 unsigned Opcode0 = Op0.getOpcode();
28634 unsigned Opcode1 = Op1.getOpcode();
28636 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28637 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28638 // TODO: Add other horizontal operations as required.
28639 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28640 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28642 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28643 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28644 // moves upper half elements into the lower half part. For example:
28646 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28647 // undef:v16i8
28648 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28650 // will be combined to:
28652 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28654 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28655 // happen due to advanced instructions.
28656 if (!VT.is128BitVector())
28657 return SDValue();
28659 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28660 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28662 unsigned NumElts = VT.getVectorNumElements();
28663 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28664 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28665 NumElts / 2);
28667 auto ShufOp = Op1.getOperand(0);
28668 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28669 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28670 }
28671 return SDValue();
28672 }
28673 case X86ISD::BLENDI: {
28674 SDValue V0 = N->getOperand(0);
28675 SDValue V1 = N->getOperand(1);
28676 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28677 "Unexpected input vector types");
28679 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28680 // operands and changing the mask to 1. This saves us a bunch of
28681 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28682 // x86InstrInfo knows how to commute this back after instruction selection
28683 // if it would help register allocation.
28685 // TODO: If optimizing for size or a processor that doesn't suffer from
28686 // partial register update stalls, this should be transformed into a MOVSD
28687 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
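// For example, blendpd(V0, V1, 0b10) = {V0[0], V1[1]} becomes
// blendpd(V1, V0, 0b01), which selects the same elements because bit i of
// the immediate picks element i from the second operand.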
28689 if (VT == MVT::v2f64)
28690 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28691 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28692 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28693 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28694 }
28696 return SDValue();
28697 }
28698 case X86ISD::MOVSD:
28699 case X86ISD::MOVSS: {
28700 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28701 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28702 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28703 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28704 if (isZero0 && isZero1)
28705 return SDValue();
28707 // We often lower to MOVSD/MOVSS from integer as well as native float
28708 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28709 // easier to combine shuffles later on. We've already accounted for the
28710 // domain switching cost when we decided to lower with it.
28711 bool isFloat = VT.isFloatingPoint();
28712 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28713 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28714 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28715 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28716 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28717 V0 = DAG.getBitcast(NewVT, V0);
28718 V1 = DAG.getBitcast(NewVT, V1);
28719 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28720 }
28722 return SDValue();
28723 }
28724 case X86ISD::INSERTPS: {
28725 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28726 SDValue Op0 = N.getOperand(0);
28727 SDValue Op1 = N.getOperand(1);
28728 SDValue Op2 = N.getOperand(2);
28729 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28730 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28731 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28732 unsigned ZeroMask = InsertPSMask & 0xF;
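// For example, InsertPSMask = 0x4A (0b01001010) decodes to SrcIdx = 1,
// DstIdx = 0 and ZeroMask = 0b1010: insert element 1 of Op1 into element 0
// of the result and zero result elements 1 and 3.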
28734 // If we zero out all elements from Op0 then we don't need to reference it.
28735 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28736 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28737 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28739 // If we zero out the element from Op1 then we don't need to reference it.
28740 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28741 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28742 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28744 // Attempt to merge insertps Op1 with an inner target shuffle node.
28745 SmallVector<int, 8> TargetMask1;
28746 SmallVector<SDValue, 2> Ops1;
28747 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28748 int M = TargetMask1[SrcIdx];
28749 if (isUndefOrZero(M)) {
28750 // Zero/UNDEF insertion - zero out element and remove dependency.
28751 InsertPSMask |= (1u << DstIdx);
28752 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28753 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28754 }
28755 // Update insertps mask srcidx and reference the source input directly.
28756 assert(0 <= M && M < 8 && "Shuffle index out of range");
28757 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28758 Op1 = Ops1[M < 4 ? 0 : 1];
28759 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28760 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28761 }
28763 // Attempt to merge insertps Op0 with an inner target shuffle node.
28764 SmallVector<int, 8> TargetMask0;
28765 SmallVector<SDValue, 2> Ops0;
28766 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28767 return SDValue();
28769 bool Updated = false;
28770 bool UseInput00 = false;
28771 bool UseInput01 = false;
28772 for (int i = 0; i != 4; ++i) {
28773 int M = TargetMask0[i];
28774 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28775 // No change if element is already zero or the inserted element.
28776 continue;
28777 } else if (isUndefOrZero(M)) {
28778 // If the target mask is undef/zero then we must zero the element.
28779 InsertPSMask |= (1u << i);
28780 Updated = true;
28781 continue;
28782 }
28784 // The input vector element must be inline.
28785 if (M != i && M != (i + 4))
28786 return SDValue();
28788 // Determine which inputs of the target shuffle we're using.
28789 UseInput00 |= (0 <= M && M < 4);
28790 UseInput01 |= (4 <= M);
28791 }
28793 // If we're not using both inputs of the target shuffle then use the
28794 // referenced input directly.
28795 if (UseInput00 && !UseInput01) {
28796 Updated = true;
28797 Op0 = Ops0[0];
28798 } else if (!UseInput00 && UseInput01) {
28799 Updated = true;
28800 Op0 = Ops0[1];
28801 }
28803 if (Updated)
28804 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28805 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28807 return SDValue();
28808 }
28809 default:
28810 return SDValue();
28811 }
28813 // Nuke no-op shuffles that show up after combining.
28814 if (isNoopShuffleMask(Mask))
28815 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28817 // Look for simplifications involving one or two shuffle instructions.
28818 SDValue V = N.getOperand(0);
28819 switch (N.getOpcode()) {
28820 default:
28821 break;
28822 case X86ISD::PSHUFLW:
28823 case X86ISD::PSHUFHW:
28824 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28826 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28827 return SDValue(); // We combined away this shuffle, so we're done.
28829 // See if this reduces to a PSHUFD which is no more expensive and can
28830 // combine with more operations. Note that it has to at least flip the
28831 // dwords as otherwise it would have been removed as a no-op.
28832 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28833 int DMask[] = {0, 1, 2, 3};
28834 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28835 DMask[DOffset + 0] = DOffset + 1;
28836 DMask[DOffset + 1] = DOffset + 0;
28837 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28838 V = DAG.getBitcast(DVT, V);
28839 DCI.AddToWorklist(V.getNode());
28840 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28841 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28842 DCI.AddToWorklist(V.getNode());
28843 return DAG.getBitcast(VT, V);
28846 // Look for shuffle patterns which can be implemented as a single unpack.
28847 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28848 // only works when we have a PSHUFD followed by two half-shuffles.
28849 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28850 (V.getOpcode() == X86ISD::PSHUFLW ||
28851 V.getOpcode() == X86ISD::PSHUFHW) &&
28852 V.getOpcode() != N.getOpcode() &&
28853 V.hasOneUse()) {
28854 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28855 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28856 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28857 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28858 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28859 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28860 int WordMask[8] = {0, 1, 2, 3, 4, 5, 6, 7};
28861 for (int i = 0; i < 4; ++i) {
28862 WordMask[i + NOffset] = Mask[i] + NOffset;
28863 WordMask[i + VOffset] = VMask[i] + VOffset;
28864 }
28865 // Map the word mask through the DWord mask.
28866 int MappedMask[8];
28867 for (int i = 0; i < 8; ++i)
28868 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28869 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28870 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28871 // We can replace all three shuffles with an unpack.
28872 V = DAG.getBitcast(VT, D.getOperand(0));
28873 DCI.AddToWorklist(V.getNode());
28874 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28875 : X86ISD::UNPCKH,
28876 DL, VT, V, V);
28877 }
28878 }
28879 }
28881 break;
28883 case X86ISD::PSHUFD:
28884 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28885 return NewN;
28887 break;
28888 }
28890 return SDValue();
28891 }
28893 /// Returns true iff the shuffle node \p N can be replaced with ADDSUB
28894 /// operation. If true is returned then the operands of ADDSUB operation
28895 /// are written to the parameters \p Opnd0 and \p Opnd1.
28897 /// We combine shuffle to ADDSUB directly on the abstract vector shuffle nodes
28898 /// so it is easier to generically match. We also insert dummy vector shuffle
28899 /// nodes for the operands which explicitly discard the lanes which are unused
28900 /// by this operation, so that the fact that they're unused can flow
28901 /// through the rest of the combiner.
28902 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28903 SDValue &Opnd0, SDValue &Opnd1) {
28905 EVT VT = N->getValueType(0);
28906 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28907 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28908 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28909 return false;
28911 // We only handle target-independent shuffles.
28912 // FIXME: It would be easy and harmless to use the target shuffle mask
28913 // extraction tool to support more.
28914 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28915 return false;
28917 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28918 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28920 SDValue V1 = N->getOperand(0);
28921 SDValue V2 = N->getOperand(1);
28923 // We require the first shuffle operand to be the FSUB node, and the second to
28924 // be the FADD node.
28925 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28926 ShuffleVectorSDNode::commuteMask(Mask);
28927 std::swap(V1, V2);
28928 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28929 return false;
28931 // If there are other uses of these operations we can't fold them.
28932 if (!V1->hasOneUse() || !V2->hasOneUse())
28933 return false;
28935 // Ensure that both operations have the same operands. Note that we can
28936 // commute the FADD operands.
28937 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28938 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28939 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28940 return false;
28942 // We're looking for blends between FADD and FSUB nodes. We insist on these
28943 // nodes being lined up in a specific expected pattern.
28944 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28945 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28946 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28947 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28948 8, 25, 10, 27, 12, 29, 14, 31})))
28949 return false;
28951 Opnd0 = LHS;
28952 Opnd1 = RHS;
28953 return true;
28954 }
28956 /// \brief Try to combine a shuffle into a target-specific add-sub or
28957 /// mul-add-sub node.
28958 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28959 const X86Subtarget &Subtarget,
28960 SelectionDAG &DAG) {
28961 SDValue Opnd0, Opnd1;
28962 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28963 return SDValue();
28965 EVT VT = N->getValueType(0);
28966 SDLoc DL(N);
28968 // Try to generate X86ISD::FMADDSUB node here.
28969 SDValue Opnd2;
28970 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
28971 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
28973 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
28974 // the ADDSUB idiom has been successfully recognized. There are no known
28975 // X86 targets with 512-bit ADDSUB instructions!
28976 if (VT.is512BitVector())
28977 return SDValue();
28979 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
28980 }
28982 // We are looking for a shuffle where both sources are concatenated with undef
28983 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
28984 // if we can express this as a single-source shuffle, that's preferable.
28985 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
28986 const X86Subtarget &Subtarget) {
28987 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
28988 return SDValue();
28990 EVT VT = N->getValueType(0);
28992 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
28993 if (!VT.is128BitVector() && !VT.is256BitVector())
28994 return SDValue();
28996 if (VT.getVectorElementType() != MVT::i32 &&
28997 VT.getVectorElementType() != MVT::i64 &&
28998 VT.getVectorElementType() != MVT::f32 &&
28999 VT.getVectorElementType() != MVT::f64)
29000 return SDValue();
29002 SDValue N0 = N->getOperand(0);
29003 SDValue N1 = N->getOperand(1);
29005 // Check that both sources are concats with undef.
29006 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29007 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29008 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29009 !N1.getOperand(1).isUndef())
29010 return SDValue();
29012 // Construct the new shuffle mask. Elements from the first source retain their
29013 // index, but elements from the second source no longer need to skip an undef.
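// For example, with v4i32 sources a mask of {0,1,4,5} over concat(t1,undef)
// and concat(t2,undef) becomes {0,1,2,3} over concat(t1,t2): old element 4
// (the first element of t2) now lives at index 2.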
29014 SmallVector<int, 8> Mask;
29015 int NumElts = VT.getVectorNumElements();
29017 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29018 for (int Elt : SVOp->getMask())
29019 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29021 SDLoc DL(N);
29022 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29023 N1.getOperand(0));
29024 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29025 }
29027 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29028 TargetLowering::DAGCombinerInfo &DCI,
29029 const X86Subtarget &Subtarget) {
29030 SDLoc dl(N);
29031 EVT VT = N->getValueType(0);
29032 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29033 // If we have legalized the vector types, look for blends of FADD and FSUB
29034 // nodes that we can fuse into an ADDSUB node.
29035 if (TLI.isTypeLegal(VT))
29036 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29037 return AddSub;
29039 // During Type Legalization, when promoting illegal vector types,
29040 // the backend might introduce new shuffle dag nodes and bitcasts.
29042 // This code performs the following transformation:
29043 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29044 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29046 // We do this only if both the bitcast and the BINOP dag nodes have
29047 // one use. Also, perform this transformation only if the new binary
29048 // operation is legal. This is to avoid introducing dag nodes that
29049 // potentially need to be further expanded (or custom lowered) into a
29050 // less optimal sequence of dag nodes.
29051 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29052 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29053 N->getOperand(0).getOpcode() == ISD::BITCAST &&
29054 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29055 SDValue N0 = N->getOperand(0);
29056 SDValue N1 = N->getOperand(1);
29058 SDValue BC0 = N0.getOperand(0);
29059 EVT SVT = BC0.getValueType();
29060 unsigned Opcode = BC0.getOpcode();
29061 unsigned NumElts = VT.getVectorNumElements();
29063 if (BC0.hasOneUse() && SVT.isVector() &&
29064 SVT.getVectorNumElements() * 2 == NumElts &&
29065 TLI.isOperationLegal(Opcode, VT)) {
29066 bool CanFold = false;
29067 switch (Opcode) {
29068 default: break;
29069 case ISD::ADD:
29070 case ISD::SUB:
29071 case ISD::MUL:
29072 // isOperationLegal lies for integer ops on floating point types.
29073 CanFold = VT.isInteger();
29074 break;
29075 case ISD::FADD:
29076 case ISD::FSUB:
29077 case ISD::FMUL:
29078 // isOperationLegal lies for floating point ops on integer types.
29079 CanFold = VT.isFloatingPoint();
29080 break;
29081 }
29083 unsigned SVTNumElts = SVT.getVectorNumElements();
29084 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29085 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29086 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29087 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29088 CanFold = SVOp->getMaskElt(i) < 0;
29090 if (CanFold) {
29091 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29092 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29093 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29094 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29095 }
29096 }
29097 }
29099 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29100 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29101 // consecutive, non-overlapping, and in the right order.
29102 SmallVector<SDValue, 16> Elts;
29103 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29104 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29105 Elts.push_back(Elt);
29106 continue;
29107 }
29108 Elts.clear();
29109 break;
29110 }
29112 if (Elts.size() == VT.getVectorNumElements())
29113 if (SDValue LD =
29114 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29115 return LD;
29117 // For AVX2, we sometimes want to combine
29118 // (vector_shuffle <mask> (concat_vectors t1, undef)
29119 // (concat_vectors t2, undef))
29120 // into:
29121 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29122 // since the latter can be efficiently lowered with VPERMD/VPERMQ.
29123 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29124 return ShufConcat;
29126 if (isTargetShuffle(N->getOpcode())) {
29127 SDValue Op(N, 0);
29128 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29129 return Shuffle;
29131 // Try recursively combining arbitrary sequences of x86 shuffle
29132 // instructions into higher-order shuffles. We do this after combining
29133 // specific PSHUF instruction sequences into their minimal form so that we
29134 // can evaluate how many specialized shuffle instructions are involved in
29135 // a particular chain.
29136 SmallVector<int, 1> NonceMask; // Just a placeholder.
29137 NonceMask.push_back(0);
29138 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
29139 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
29140 DCI, Subtarget))
29141 return SDValue(); // This routine will use CombineTo to replace N.
29142 }
29144 return SDValue();
29145 }
29147 /// Check if a vector extract from a target-specific shuffle of a load can be
29148 /// folded into a single element load.
29149 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29150 /// shuffles have been custom lowered so we need to handle those here.
29151 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29152 TargetLowering::DAGCombinerInfo &DCI) {
29153 if (DCI.isBeforeLegalizeOps())
29154 return SDValue();
29156 SDValue InVec = N->getOperand(0);
29157 SDValue EltNo = N->getOperand(1);
29158 EVT EltVT = N->getValueType(0);
29160 if (!isa<ConstantSDNode>(EltNo))
29161 return SDValue();
29163 EVT OriginalVT = InVec.getValueType();
29165 // Peek through bitcasts, don't duplicate a load with other uses.
29166 InVec = peekThroughOneUseBitcasts(InVec);
29168 EVT CurrentVT = InVec.getValueType();
29169 if (!CurrentVT.isVector() ||
29170 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29171 return SDValue();
29173 if (!isTargetShuffle(InVec.getOpcode()))
29174 return SDValue();
29176 // Don't duplicate a load with other uses.
29177 if (!InVec.hasOneUse())
29178 return SDValue();
29180 SmallVector<int, 16> ShuffleMask;
29181 SmallVector<SDValue, 2> ShuffleOps;
29182 bool UnaryShuffle;
29183 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29184 ShuffleOps, ShuffleMask, UnaryShuffle))
29185 return SDValue();
29187 // Select the input vector, guarding against an out-of-range extract index.
29188 unsigned NumElems = CurrentVT.getVectorNumElements();
29189 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29190 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29192 if (Idx == SM_SentinelZero)
29193 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29194 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29195 if (Idx == SM_SentinelUndef)
29196 return DAG.getUNDEF(EltVT);
29198 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29199 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29200 : ShuffleOps[1];
29202 // If inputs to shuffle are the same for both ops, then allow 2 uses
29203 unsigned AllowedUses =
29204 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29206 if (LdNode.getOpcode() == ISD::BITCAST) {
29207 // Don't duplicate a load with other uses.
29208 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29209 return SDValue();
29211 AllowedUses = 1; // only allow 1 load use if we have a bitcast
29212 LdNode = LdNode.getOperand(0);
29213 }
29215 if (!ISD::isNormalLoad(LdNode.getNode()))
29216 return SDValue();
29218 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29220 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29221 return SDValue();
29223 // If there's a bitcast before the shuffle, check if the load type and
29224 // alignment are valid.
29225 unsigned Align = LN0->getAlignment();
29226 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29227 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29228 EltVT.getTypeForEVT(*DAG.getContext()));
29230 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
29231 return SDValue();
29233 // All checks match so transform back to vector_shuffle so that DAG combiner
29234 // can finish the job.
29235 SDLoc dl(N);
29237 // Create shuffle node taking into account the case that it's a unary shuffle.
29238 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29239 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29240 ShuffleMask);
29241 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29242 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29243 EltNo);
29244 }
29246 // Try to match patterns such as
29247 // (i16 bitcast (v16i1 x))
29248 // ->
29249 // (i16 movmsk (v16i8 sext (v16i1 x)))
29250 // before the illegal vector is scalarized on subtargets that don't have legal
29251 // vxi1 types.
29252 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29253 const X86Subtarget &Subtarget) {
29254 EVT VT = BitCast.getValueType();
29255 SDValue N0 = BitCast.getOperand(0);
29256 EVT VecVT = N0->getValueType(0);
29258 if (!VT.isScalarInteger() || !VecVT.isSimple())
29259 return SDValue();
29261 // With AVX512 vxi1 types are legal and we prefer using k-regs.
29262 // MOVMSK is supported in SSE2 or later.
29263 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29264 return SDValue();
29266 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29267 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29268 // v8i16 and v16i16.
29269 // For these two cases, we can shuffle the upper element bytes to a
29270 // consecutive sequence at the start of the vector and treat the results as
29271 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
29272 // for v16i16 this is not the case, because the shuffle is expensive, so we
29273 // avoid sign-extending to this type entirely.
29274 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
29275 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
29276 MVT SExtVT;
29277 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
29278 switch (VecVT.getSimpleVT().SimpleTy) {
29279 default:
29280 return SDValue();
29281 case MVT::v2i1:
29282 SExtVT = MVT::v2i64;
29283 FPCastVT = MVT::v2f64;
29284 break;
29285 case MVT::v4i1:
29286 SExtVT = MVT::v4i32;
29287 FPCastVT = MVT::v4f32;
29288 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
29289 // sign-extend to a 256-bit operation to avoid truncation.
29290 if (N0->getOpcode() == ISD::SETCC &&
29291 N0->getOperand(0)->getValueType(0).is256BitVector() &&
29292 Subtarget.hasInt256()) {
29293 SExtVT = MVT::v4i64;
29294 FPCastVT = MVT::v4f64;
29295 }
29296 break;
29297 case MVT::v8i1:
29298 SExtVT = MVT::v8i16;
29299 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
29300 // sign-extend to a 256-bit operation to match the compare.
29301 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
29302 // 256-bit because the shuffle is cheaper than sign extending the result of
29303 // the setcc.
29304 if (N0->getOpcode() == ISD::SETCC &&
29305 N0->getOperand(0)->getValueType(0).is256BitVector() &&
29306 Subtarget.hasInt256()) {
29307 SExtVT = MVT::v8i32;
29308 FPCastVT = MVT::v8f32;
29309 }
29310 break;
29311 case MVT::v16i1:
29312 SExtVT = MVT::v16i8;
29313 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
29314 // it is not profitable to sign-extend to 256-bit because this will
29315 // require an extra cross-lane shuffle which is more expensive than
29316 // truncating the result of the compare to 128-bits.
29317 break;
29318 case MVT::v32i1:
29319 // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
29320 if (!Subtarget.hasInt256())
29321 return SDValue();
29322 SExtVT = MVT::v32i8;
29323 break;
29324 };
29326 SDLoc DL(BitCast);
29327 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
29328 if (SExtVT == MVT::v8i16) {
29329 V = DAG.getBitcast(MVT::v16i8, V);
29330 V = DAG.getVectorShuffle(
29331 MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
29332 {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
29333 } else
29334 assert(SExtVT.getScalarType() != MVT::i16 &&
29335 "Vectors of i16 must be shuffled");
29336 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29337 V = DAG.getBitcast(FPCastVT, V);
29338 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29339 return DAG.getZExtOrTrunc(V, DL, VT);
29340 }
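// Illustrative walk-through (assumed types, not from the original source):
// for (i8 bitcast (v8i1 setcc (v8i32 a, v8i32 b))) with AVX2, the switch
// above picks SExtVT = v8i32, so the emitted sequence is roughly
//   t0: v8i32 = sign_extend v8i1 cond
//   t1: v8f32 = bitcast t0
//   t2: i32   = X86ISD::MOVMSK t1     // VMOVMSKPS
//   i8 = truncate t2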
29342 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29343 TargetLowering::DAGCombinerInfo &DCI,
29344 const X86Subtarget &Subtarget) {
29345 SDValue N0 = N->getOperand(0);
29346 EVT VT = N->getValueType(0);
29347 EVT SrcVT = N0.getValueType();
29349 // Try to match patterns such as
29350 // (i16 bitcast (v16i1 x))
29351 // ->
29352 // (i16 movmsk (v16i8 sext (v16i1 x)))
29353 // before the setcc result is scalarized on subtargets that don't have legal
29354 // vxi1 types.
29355 if (DCI.isBeforeLegalize())
29356 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29357 return V;
29358 // Since MMX types are special and don't usually play with other vector types,
29359 // it's better to handle them early to be sure we emit efficient code by
29360 // avoiding store-load conversions.
29362 // Detect bitcasts between i32 to x86mmx low word.
29363 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29364 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29365 SDValue N00 = N0->getOperand(0);
29366 if (N00.getValueType() == MVT::i32)
29367 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29370 // Detect bitcasts from element or subvector extraction to x86mmx.
29371 if (VT == MVT::x86mmx &&
29372 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29373 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29374 isNullConstant(N0.getOperand(1))) {
29375 SDValue N00 = N0->getOperand(0);
29376 if (N00.getValueType().is128BitVector())
29377 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29378 DAG.getBitcast(MVT::v2i64, N00));
29381 // Detect bitcasts from FP_TO_SINT to x86mmx.
29382 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29383 N0.getOpcode() == ISD::FP_TO_SINT) {
29385 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29386 DAG.getUNDEF(MVT::v2i32));
29387 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29388 DAG.getBitcast(MVT::v2i64, Res));
29391 // Convert a bitcasted integer logic operation that has one bitcasted
29392 // floating-point operand into a floating-point logic operation. This may
29393 // create a load of a constant, but that is cheaper than materializing the
29394 // constant in an integer register and transferring it to an SSE register or
29395 // transferring the SSE operand to integer register and back.
29396 unsigned FPOpcode;
29397 switch (N0.getOpcode()) {
29398 case ISD::AND: FPOpcode = X86ISD::FAND; break;
29399 case ISD::OR: FPOpcode = X86ISD::FOR; break;
29400 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29401 default: return SDValue();
29402 }
29404 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29405 (Subtarget.hasSSE2() && VT == MVT::f64)))
29406 return SDValue();
29408 SDValue LogicOp0 = N0.getOperand(0);
29409 SDValue LogicOp1 = N0.getOperand(1);
29410 SDLoc DL0(N);
29412 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29413 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29414 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29415 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29416 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29417 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29418 }
29419 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29420 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29421 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29422 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29423 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29424 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29425 }
29427 return SDValue();
29428 }
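// Illustrative example (not from the original source): with SSE2 and
// VT = f64,
//   (f64 bitcast (and (i64 bitcast (f64 X)), Y))
// becomes (X86ISD::FAND X, (f64 bitcast Y)), keeping X in an SSE register
// instead of round-tripping it through a GPR.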
29430 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29431 // the elements of a vector.
29432 // Returns the vector that is being reduced on, or SDValue() if a reduction
29433 // was not matched.
29434 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29435 // The pattern must end in an extract from index 0.
29436 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29437 !isNullConstant(Extract->getOperand(1)))
29438 return SDValue();
29440 unsigned Stages =
29441 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29443 SDValue Op = Extract->getOperand(0);
29444 // At each stage, we're looking for something that looks like:
29445 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29446 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29447 // i32 undef, i32 undef, i32 undef, i32 undef>
29448 // %a = binop <8 x i32> %op, %s
29449 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29450 // we expect something like:
29451 // <4,5,6,7,u,u,u,u>
29452 // <2,3,u,u,u,u,u,u>
29453 // <1,u,u,u,u,u,u,u>
29454 for (unsigned i = 0; i < Stages; ++i) {
29455 if (Op.getOpcode() != BinOp)
29456 return SDValue();
29458 ShuffleVectorSDNode *Shuffle =
29459 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29460 if (Shuffle) {
29461 Op = Op.getOperand(1);
29462 } else {
29463 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29464 Op = Op.getOperand(0);
29465 }
29467 // The first operand of the shuffle should be the same as the other operand
29468 // of the binop.
29469 if (!Shuffle || (Shuffle->getOperand(0) != Op))
29470 return SDValue();
29472 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29473 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29474 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29475 return SDValue();
29476 }
29478 return Op;
29479 }
29481 // Given a select, detect the following pattern:
29482 // 1: %2 = zext <N x i8> %0 to <N x i32>
29483 // 2: %3 = zext <N x i8> %1 to <N x i32>
29484 // 3: %4 = sub nsw <N x i32> %2, %3
29485 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29486 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29487 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29488 // This is useful as it is the input into a SAD pattern.
29489 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29490 SDValue &Op1) {
29491 // Check the condition of the select instruction is greater-than.
29492 SDValue SetCC = Select->getOperand(0);
29493 if (SetCC.getOpcode() != ISD::SETCC)
29494 return false;
29495 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29496 if (CC != ISD::SETGT && CC != ISD::SETLT)
29497 return false;
29499 SDValue SelectOp1 = Select->getOperand(1);
29500 SDValue SelectOp2 = Select->getOperand(2);
29502 // The following instructions assume SelectOp1 is the subtraction operand
29503 // and SelectOp2 is the negation operand.
29504 // In the case of SETLT this is the other way around.
29505 if (CC == ISD::SETLT)
29506 std::swap(SelectOp1, SelectOp2);
29508 // The second operand of the select should be the negation of the first
29509 // operand, which is implemented as 0 - SelectOp1.
29510 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29511 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29512 SelectOp2.getOperand(1) == SelectOp1))
29513 return false;
29515 // The first operand of SetCC is the first operand of the select, which is the
29516 // difference between the two input vectors.
29517 if (SetCC.getOperand(0) != SelectOp1)
29518 return false;
29520 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29521 APInt SplatVal;
29522 if ((CC == ISD::SETLT) &&
29523 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
29524 SplatVal == 1) ||
29525 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29526 return false;
29528 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29529 if ((CC == ISD::SETGT) &&
29530 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29531 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29532 return false;
29534 // The first operand of the select is the difference between the two input
29535 // vectors.
29536 if (SelectOp1.getOpcode() != ISD::SUB)
29537 return false;
29539 Op0 = SelectOp1.getOperand(0);
29540 Op1 = SelectOp1.getOperand(1);
29542 // Check if the operands of the sub are zero-extended from vectors of i8.
29543 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29544 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29545 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29546 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29547 return false;
29549 return true;
29550 }
29552 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs
29553 // to these zexts.
29554 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29555 const SDValue &Zext1, const SDLoc &DL) {
29557 // Find the appropriate width for the PSADBW.
29558 EVT InVT = Zext0.getOperand(0).getValueType();
29559 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29561 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29562 // fill in the missing vector elements with 0.
29563 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29564 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29565 Ops[0] = Zext0.getOperand(0);
29566 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29567 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29568 Ops[0] = Zext1.getOperand(0);
29569 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29571 // Actually build the SAD
29572 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29573 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29574 }
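// Illustrative example (assumed shapes, not from the original source): for
// Zext0/Zext1 zero-extending v4i8 inputs, RegSize stays 128, NumConcat is 4,
// and each input is concatenated with three v4i8 zero vectors into a v16i8:
//   v2i64 = X86ISD::PSADBW (v16i8 SadOp0), (v16i8 SadOp1)
// Only the low 64-bit element carries a nonzero sum here, since the upper
// eight bytes of both operands are zero.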
29576 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29577 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29578 SelectionDAG &DAG,
29579 const X86Subtarget &Subtarget) {
29580 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29581 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29582 return SDValue();
29584 EVT ExtractVT = Extract->getValueType(0);
29585 unsigned BitWidth = ExtractVT.getSizeInBits();
29586 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29587 ExtractVT != MVT::i8)
29588 return SDValue();
29590 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29591 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29592 SDValue Match = matchBinOpReduction(Extract, Op);
29593 if (!Match)
29594 continue;
29596 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29597 // which we can't support here for now.
29598 if (Match.getScalarValueSizeInBits() != BitWidth)
29599 continue;
29601 // We require AVX2 for PMOVMSKB for v16i16/v32i8.
29602 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29603 if (!(MatchSizeInBits == 128 ||
29604 (MatchSizeInBits == 256 &&
29605 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29606 return SDValue();
29608 // Don't bother performing this for 2-element vectors.
29609 if (Match.getValueType().getVectorNumElements() <= 2)
29610 return SDValue();
29612 // Check that we are extracting a reduction of all sign bits.
29613 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29614 return SDValue();
29616 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
29617 MVT MaskVT;
29618 if (64 == BitWidth || 32 == BitWidth)
29619 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29620 MatchSizeInBits / BitWidth);
29622 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29624 APInt CompareBits;
29625 ISD::CondCode CondCode;
29626 if (Op == ISD::OR) {
29627 // any_of -> MOVMSK != 0
29628 CompareBits = APInt::getNullValue(32);
29629 CondCode = ISD::CondCode::SETNE;
29630 } else {
29631 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29632 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29633 CondCode = ISD::CondCode::SETEQ;
29634 }
29636 // Perform the select as i32/i64 and then truncate to avoid partial register
29637 // stalls.
29638 unsigned ResWidth = std::max(BitWidth, 32u);
29639 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29641 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29642 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29643 SDValue Res = DAG.getBitcast(MaskVT, Match);
29644 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29645 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29646 Ones, Zero, CondCode);
29647 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29648 }
29650 return SDValue();
29651 }
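// Illustrative example (not from the original source): an any_of reduction
//   i32 = extract_vector_elt (or (or x, shuf<2,3,u,u> x), shuf<1,u,u,u> ...), 0
// over v4i32 all-sign-bit elements becomes roughly
//   t0: i32 = X86ISD::MOVMSK (v4f32 bitcast x)   // MOVMSKPS
//   i32 = select (setcc t0, 0, ne), -1, 0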
29653 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29654 const X86Subtarget &Subtarget) {
29655 // PSADBW is only supported on SSE2 and up.
29656 if (!Subtarget.hasSSE2())
29657 return SDValue();
29659 // Verify the type we're extracting from is any integer type above i16.
29660 EVT VT = Extract->getOperand(0).getValueType();
29661 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29662 return SDValue();
29664 unsigned RegSize = 128;
29665 if (Subtarget.hasBWI())
29666 RegSize = 512;
29667 else if (Subtarget.hasAVX2())
29668 RegSize = 256;
29670 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29671 // TODO: We should be able to handle larger vectors by splitting them before
29672 // feeding them into several SADs, and then reducing over those.
29673 if (RegSize / VT.getVectorNumElements() < 8)
29674 return SDValue();
29676 // Match shuffle + add pyramid.
29677 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29679 // The operand is expected to be zero extended from i8
29680 // (verified in detectZextAbsDiff).
29681 // To convert to i64 and above, an additional any/zero/sign
29682 // extend is expected.
29683 // The zero extend from 32 bits has no mathematical effect on the result.
29684 // The sign extend is also effectively a zero extend
29685 // (it extends the sign bit, which is zero).
29686 // So it is correct to skip the sign/zero extend instruction.
29687 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29688 Root.getOpcode() == ISD::ZERO_EXTEND ||
29689 Root.getOpcode() == ISD::ANY_EXTEND))
29690 Root = Root.getOperand(0);
29692 // If there was a match, we want Root to be a select that is the root of an
29693 // abs-diff pattern.
29694 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29695 return SDValue();
29697 // Check whether we have an abs-diff pattern feeding into the select.
29698 SDValue Zext0, Zext1;
29699 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29700 return SDValue();
29702 // Create the SAD instruction.
29703 SDLoc DL(Extract);
29704 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29706 // If the original vector was wider than 8 elements, sum over the results
29707 // in the SAD vector.
29708 unsigned Stages = Log2_32(VT.getVectorNumElements());
29709 MVT SadVT = SAD.getSimpleValueType();
29711 unsigned SadElems = SadVT.getVectorNumElements();
29713 for (unsigned i = Stages - 3; i > 0; --i) {
29714 SmallVector<int, 16> Mask(SadElems, -1);
29715 for (unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29716 Mask[j] = MaskEnd + j;
29718 SDValue Shuffle =
29719 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29720 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29721 }
29724 MVT Type = Extract->getSimpleValueType(0);
29725 unsigned TypeSizeInBits = Type.getSizeInBits();
29726 // Return the lowest TypeSizeInBits bits.
29727 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29728 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29729 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29730 Extract->getOperand(1));
29731 }
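// Illustrative end-to-end sketch (assumed v16i8 inputs, not from the
// original source): the zext/sub/select abs-diff tree reduces to
//   t0: v2i64 = X86ISD::PSADBW a, b          // two partial sums
//   t1: v2i64 = add t0, (shuf<1,u> t0)       // fold high half into low
//   i32 = extract_vector_elt (v4i32 bitcast t1), 0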
29733 // Attempt to peek through a target shuffle and extract the scalar from the
29734 // source.
29735 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29736 TargetLowering::DAGCombinerInfo &DCI,
29737 const X86Subtarget &Subtarget) {
29738 if (DCI.isBeforeLegalizeOps())
29739 return SDValue();
29741 SDValue Src = N->getOperand(0);
29742 SDValue Idx = N->getOperand(1);
29744 EVT VT = N->getValueType(0);
29745 EVT SrcVT = Src.getValueType();
29746 EVT SrcSVT = SrcVT.getVectorElementType();
29747 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29749 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29750 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29751 return SDValue();
29753 // Resolve the target shuffle inputs and mask.
29754 SmallVector<int, 16> Mask;
29755 SmallVector<SDValue, 2> Ops;
29756 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
29757 return SDValue();
29759 // Attempt to narrow/widen the shuffle mask to the correct size.
29760 if (Mask.size() != NumSrcElts) {
29761 if ((NumSrcElts % Mask.size()) == 0) {
29762 SmallVector<int, 16> ScaledMask;
29763 int Scale = NumSrcElts / Mask.size();
29764 scaleShuffleMask(Scale, Mask, ScaledMask);
29765 Mask = std::move(ScaledMask);
29766 } else if ((Mask.size() % NumSrcElts) == 0) {
29767 SmallVector<int, 16> WidenedMask;
29768 while (Mask.size() > NumSrcElts &&
29769 canWidenShuffleElements(Mask, WidenedMask))
29770 Mask = std::move(WidenedMask);
29771 // TODO - investigate support for wider shuffle masks with known upper
29772 // undef/zero elements for implicit zero-extension.
29773 }
29774 }
29776 // Check if narrowing/widening failed.
29777 if (Mask.size() != NumSrcElts)
29778 return SDValue();
29780 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29781 SDLoc dl(N);
29783 // If the shuffle source element is undef/zero then we can just accept it.
29784 if (SrcIdx == SM_SentinelUndef)
29785 return DAG.getUNDEF(VT);
29787 if (SrcIdx == SM_SentinelZero)
29788 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29789 : DAG.getConstant(0, dl, VT);
29791 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29792 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29793 SrcIdx = SrcIdx % Mask.size();
29795 // We can only extract other elements from 128-bit vectors and in certain
29796 // circumstances, depending on SSE-level.
29797 // TODO: Investigate using extract_subvector for larger vectors.
29798 // TODO: Investigate float/double extraction if it will be just stored.
29799 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29800 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29801 assert(SrcSVT == VT && "Unexpected extraction type");
29802 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29803 DAG.getIntPtrConstant(SrcIdx, dl));
29806 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29807 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29808 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29809 "Unexpected extraction type");
29810 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29811 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29812 DAG.getIntPtrConstant(SrcIdx, dl));
29813 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29814 DAG.getValueType(SrcSVT));
29815 return DAG.getZExtOrTrunc(Assert, dl, VT);
29816 }
29818 return SDValue();
29819 }
29821 /// Detect vector gather/scatter index generation and convert it from being a
29822 /// bunch of shuffles and extracts into a somewhat faster sequence.
29823 /// For i686, the best sequence is apparently storing the value and loading
29824 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
29825 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29826 TargetLowering::DAGCombinerInfo &DCI,
29827 const X86Subtarget &Subtarget) {
29828 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29829 return NewOp;
29831 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29832 return NewOp;
29834 SDValue InputVector = N->getOperand(0);
29835 SDValue EltIdx = N->getOperand(1);
29837 EVT SrcVT = InputVector.getValueType();
29838 EVT VT = N->getValueType(0);
29839 SDLoc dl(InputVector);
29841 // Detect mmx extraction of all bits as a i64. It works better as a bitcast.
29842 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29843 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29844 SDValue MMXSrc = InputVector.getOperand(0);
29846 // The bitcast source is a direct mmx result.
29847 if (MMXSrc.getValueType() == MVT::x86mmx)
29848 return DAG.getBitcast(VT, InputVector);
29851 // Detect mmx to i32 conversion through a v2i32 elt extract.
29852 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29853 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29854 SDValue MMXSrc = InputVector.getOperand(0);
29856 // The bitcast source is a direct mmx result.
29857 if (MMXSrc.getValueType() == MVT::x86mmx)
29858 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29861 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29862 isa<ConstantSDNode>(EltIdx) &&
29863 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29864 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29865 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29866 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29867 return DAG.getConstant(Res, dl, MVT::i1);
29870 // Check whether this extract is the root of a sum of absolute differences
29871 // pattern. This has to be done here because we really want it to happen
29872 // pre-legalization.
29873 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29874 return SAD;
29876 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29877 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29878 return Cmp;
29880 // Only operate on vectors of 4 elements, where the alternative shuffling
29881 // gets to be more expensive.
29882 if (SrcVT != MVT::v4i32)
29883 return SDValue();
29885 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29886 // single use which is a sign-extend or zero-extend, and all elements are
29887 // used.
29888 SmallVector<SDNode *, 4> Uses;
29889 unsigned ExtractedElements = 0;
29890 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29891 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29892 if (UI.getUse().getResNo() != InputVector.getResNo())
29893 continue;
29895 SDNode *Extract = *UI;
29896 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29897 return SDValue();
29899 if (Extract->getValueType(0) != MVT::i32)
29900 return SDValue();
29901 if (!Extract->hasOneUse())
29902 return SDValue();
29903 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29904 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29905 return SDValue();
29906 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29907 return SDValue();
29909 // Record which element was extracted.
29910 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29911 Uses.push_back(Extract);
29912 }
29914 // If not all the elements were used, this may not be worthwhile.
29915 if (ExtractedElements != 15)
29916 return SDValue();
29918 // Ok, we've now decided to do the transformation.
29919 // If 64-bit shifts are legal, use the extract-shift sequence,
29920 // otherwise bounce the vector off the cache.
29921 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29922 SDValue Vals[4];
29924 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29925 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29926 auto &DL = DAG.getDataLayout();
29927 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29928 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29929 DAG.getConstant(0, dl, VecIdxTy));
29930 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29931 DAG.getConstant(1, dl, VecIdxTy));
29933 SDValue ShAmt = DAG.getConstant(
29934 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29935 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29936 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29937 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29938 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29939 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29940 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29941 } else {
29942 // Store the value to a temporary stack slot.
29943 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29944 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29945 MachinePointerInfo());
29947 EVT ElementType = SrcVT.getVectorElementType();
29948 unsigned EltSize = ElementType.getSizeInBits() / 8;
29950 // Replace each use (extract) with a load of the appropriate element.
29951 for (unsigned i = 0; i < 4; ++i) {
29952 uint64_t Offset = EltSize * i;
29953 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29954 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29956 SDValue ScalarAddr =
29957 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29959 // Load the scalar.
29960 Vals[i] =
29961 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29962 }
29963 }
29965 // Replace the extracts
29966 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
29967 UE = Uses.end(); UI != UE; ++UI) {
29968 SDNode *Extract = *UI;
29970 uint64_t IdxVal = Extract->getConstantOperandVal(1);
29971 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
29972 }
29974 // The replacement was made in place; don't return anything.
29975 return SDValue();
29976 }
29978 // TODO - merge with combineExtractVectorElt once it can handle the implicit
29979 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
29980 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
29981 // combineBasicSADPattern.
29982 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
29983 TargetLowering::DAGCombinerInfo &DCI,
29984 const X86Subtarget &Subtarget) {
29985 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
29986 }
29988 /// If a vector select has an operand that is -1 or 0, try to simplify the
29989 /// select to a bitwise logic operation.
29990 static SDValue
29991 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
29992 TargetLowering::DAGCombinerInfo &DCI,
29993 const X86Subtarget &Subtarget) {
29994 SDValue Cond = N->getOperand(0);
29995 SDValue LHS = N->getOperand(1);
29996 SDValue RHS = N->getOperand(2);
29997 EVT VT = LHS.getValueType();
29998 EVT CondVT = Cond.getValueType();
29999 SDLoc DL(N);
30000 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30002 if (N->getOpcode() != ISD::VSELECT)
30003 return SDValue();
30005 assert(CondVT.isVector() && "Vector select expects a vector selector!");
30007 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30008 // Check if the first operand is all zeros and Cond type is vXi1.
30009 // This situation only applies to avx512.
30010 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30011 CondVT.getVectorElementType() == MVT::i1) {
30012 // Invert the cond to not(cond) : xor(op,allones)=not(op)
30013 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30014 DAG.getAllOnesConstant(DL, CondVT));
30015 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30016 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30017 }
30019 // To use the condition operand as a bitwise mask, it must have elements that
30020 // are the same size as the select elements. I.e., the condition operand must
30021 // have already been promoted from the IR select condition type <N x i1>.
30022 // Don't check if the types themselves are equal because that excludes
30023 // vector floating-point selects.
30024 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30025 return SDValue();
30027 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30028 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30030 // Try to invert the condition if true value is not all 1s and false value is
30031 // not all 0s.
30032 if (!TValIsAllOnes && !FValIsAllZeros &&
30033 // Check if the selector will be produced by CMPP*/PCMP*.
30034 Cond.getOpcode() == ISD::SETCC &&
30035 // Check if SETCC has already been promoted.
30036 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30037 CondVT) {
30038 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30039 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30041 if (TValIsAllZeros || FValIsAllOnes) {
30042 SDValue CC = Cond.getOperand(2);
30043 ISD::CondCode NewCC =
30044 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30045 Cond.getOperand(0).getValueType().isInteger());
30046 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30047 NewCC);
30048 std::swap(LHS, RHS);
30049 TValIsAllOnes = FValIsAllOnes;
30050 FValIsAllZeros = TValIsAllZeros;
30051 }
30052 }
30054 // vselect Cond, 111..., 000... -> Cond
30055 if (TValIsAllOnes && FValIsAllZeros)
30056 return DAG.getBitcast(VT, Cond);
30058 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30059 return SDValue();
30061 // vselect Cond, 111..., X -> or Cond, X
30062 if (TValIsAllOnes) {
30063 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30064 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30065 return DAG.getBitcast(VT, Or);
30068 // vselect Cond, X, 000... -> and Cond, X
30069 if (FValIsAllZeros) {
30070 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30071 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30072 return DAG.getBitcast(VT, And);
30073 }
30075 return SDValue();
30076 }
30078 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
30079 SDValue Cond = N->getOperand(0);
30080 SDValue LHS = N->getOperand(1);
30081 SDValue RHS = N->getOperand(2);
30082 SDLoc DL(N);
30084 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30085 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30086 if (!TrueC || !FalseC)
30087 return SDValue();
30089 // Don't do this for crazy integer types.
30090 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
30091 return SDValue();
30093 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
30094 // so that TrueC (the true value) is larger than FalseC.
30095 bool NeedsCondInvert = false;
30096 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
30097 // Efficiently invertible.
30098 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
30099 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
30100 isa<ConstantSDNode>(Cond.getOperand(1))))) {
30101 NeedsCondInvert = true;
30102 std::swap(TrueC, FalseC);
30105 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
30106 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30107 if (NeedsCondInvert) // Invert the condition if needed.
30108 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30109 DAG.getConstant(1, DL, Cond.getValueType()));
30111 // Zero extend the condition if needed.
30112 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
30114 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30115 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
30116 DAG.getConstant(ShAmt, DL, MVT::i8));
30117 }
30119 // Optimize cases that will turn into an LEA instruction. This requires
30120 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30121 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30122 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
30123 if (N->getValueType(0) == MVT::i32)
30124 Diff = (unsigned)Diff;
30126 bool IsFastMultiplier = false;
30127 if (Diff < 10) {
30128 switch ((unsigned char)Diff) {
30129 default: break;
30131 case 1: // result = add base, cond
30132 case 2: // result = lea base( , cond*2)
30133 case 3: // result = lea base(cond, cond*2)
30134 case 4: // result = lea base( , cond*4)
30135 case 5: // result = lea base(cond, cond*4)
30136 case 8: // result = lea base( , cond*8)
30137 case 9: // result = lea base(cond, cond*8)
30138 IsFastMultiplier = true;
30139 break;
30140 }
30141 }
30143 if (IsFastMultiplier) {
30144 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
30145 if (NeedsCondInvert) // Invert the condition if needed.
30146 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30147 DAG.getConstant(1, DL, Cond.getValueType()));
30149 // Zero extend the condition if needed.
30150 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
30151 // Scale the condition by the difference.
30153 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30154 DAG.getConstant(Diff, DL, Cond.getValueType()));
30156 // Add the base if non-zero.
30157 if (FalseC->getAPIntValue() != 0)
30158 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30159 SDValue(FalseC, 0));
30160 return Cond;
30161 }
30162 }
30164 return SDValue();
30165 }
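// Illustrative example (not from the original source): (select C, i32 5,
// i32 2) has Diff = 3, one of the fast multipliers, so it lowers to roughly
//   t0: i32 = zero_extend C
//   t1: i32 = mul t0, 3
//   i32 = add t1, 2        // foldable into a single LEA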
30167 // If this is a bitcasted op that can be represented as another type, push
30168 // the bitcast to the inputs. This allows more opportunities for pattern
30169 // matching masked instructions. This is called when we know that the operation
30170 // is used as one of the inputs of a vselect.
30171 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
30172 TargetLowering::DAGCombinerInfo &DCI) {
30173 // Make sure we have a bitcast.
30174 if (OrigOp.getOpcode() != ISD::BITCAST)
30175 return false;
30177 SDValue Op = OrigOp.getOperand(0);
30179 // If the operation is used by anything other than the bitcast, we shouldn't
30180 // do this combine as that would replicate the operation.
30181 if (!Op.hasOneUse())
30182 return false;
30184 MVT VT = OrigOp.getSimpleValueType();
30185 MVT EltVT = VT.getVectorElementType();
30186 SDLoc DL(Op.getNode());
30188 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
30189 SDValue Op2) {
30190 Op0 = DAG.getBitcast(VT, Op0);
30191 DCI.AddToWorklist(Op0.getNode());
30192 Op1 = DAG.getBitcast(VT, Op1);
30193 DCI.AddToWorklist(Op1.getNode());
30194 DCI.CombineTo(OrigOp.getNode(),
30195 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
30196 return true;
30197 };
30199 unsigned Opcode = Op.getOpcode();
30200 switch (Opcode) {
30201 case X86ISD::PALIGNR:
30202 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
30203 if (!VT.is128BitVector())
30204 return false;
30205 Opcode = X86ISD::VALIGN;
30206 LLVM_FALLTHROUGH;
30207 case X86ISD::VALIGN: {
30208 if (EltVT != MVT::i32 && EltVT != MVT::i64)
30209 return false;
30210 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30211 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30212 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
30213 unsigned EltSize = EltVT.getSizeInBits();
30214 // Make sure we can represent the same shift with the new VT.
30215 if ((ShiftAmt % EltSize) != 0)
30216 return false;
30217 Imm = ShiftAmt / EltSize;
30218 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30219 DAG.getConstant(Imm, DL, MVT::i8));
30220 }
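// Illustrative example (assumed types, not from the original source): a
// v16i8 PALIGNR with Imm = 8 shifts by 64 bits, so after bitcasting the
// operands to v4i32 it can be re-expressed as X86ISD::VALIGN with
// Imm = (8 * 8) / 32 = 2, which AVX-512 can then execute under a mask.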
30221 case X86ISD::SHUF128: {
30222 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
30223 return false;
30224 // Only change element size, not type.
30225 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30226 return false;
30227 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30228 Op.getOperand(2));
30229 }
30230 case ISD::INSERT_SUBVECTOR: {
30231 unsigned EltSize = EltVT.getSizeInBits();
30232 if (EltSize != 32 && EltSize != 64)
30233 return false;
30234 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30235 // Only change element size, not type.
30236 if (EltVT.isInteger() != OpEltVT.isInteger())
30237 return false;
30238 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30239 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30240 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
30241 DCI.AddToWorklist(Op0.getNode());
30242 // Op1 needs to be bitcasted to a smaller vector with the same element type.
30243 SDValue Op1 = Op.getOperand(1);
30244 MVT Op1VT = MVT::getVectorVT(EltVT,
30245 Op1.getSimpleValueType().getSizeInBits() / EltSize);
30246 Op1 = DAG.getBitcast(Op1VT, Op1);
30247 DCI.AddToWorklist(Op1.getNode());
30248 DCI.CombineTo(OrigOp.getNode(),
30249 DAG.getNode(Opcode, DL, VT, Op0, Op1,
30250 DAG.getIntPtrConstant(Imm, DL)));
30251 return true;
30252 }
30253 case ISD::EXTRACT_SUBVECTOR: {
30254 unsigned EltSize = EltVT.getSizeInBits();
30255 if (EltSize != 32 && EltSize != 64)
30256 return false;
30257 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30258 // Only change element size, not type.
30259 if (EltVT.isInteger() != OpEltVT.isInteger())
30260 return false;
30261 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
30262 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30263 // Op0 needs to be bitcasted to a larger vector with the same element type.
30264 SDValue Op0 = Op.getOperand(0);
30265 MVT Op0VT = MVT::getVectorVT(EltVT,
30266 Op0.getSimpleValueType().getSizeInBits() / EltSize);
30267 Op0 = DAG.getBitcast(Op0VT, Op0);
30268 DCI.AddToWorklist(Op0.getNode());
30269 DCI.CombineTo(OrigOp.getNode(),
30270 DAG.getNode(Opcode, DL, VT, Op0,
30271 DAG.getIntPtrConstant(Imm, DL)));
30272 return true;
30273 }
30274 case X86ISD::SUBV_BROADCAST: {
30275 unsigned EltSize = EltVT.getSizeInBits();
30276 if (EltSize != 32 && EltSize != 64)
30277 return false;
30278 // Only change element size, not type.
30279 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30280 return false;
30281 SDValue Op0 = Op.getOperand(0);
30282 MVT Op0VT = MVT::getVectorVT(EltVT,
30283 Op0.getSimpleValueType().getSizeInBits() / EltSize);
30284 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
30285 DCI.AddToWorklist(Op0.getNode());
30286 DCI.CombineTo(OrigOp.getNode(),
30287 DAG.getNode(Opcode, DL, VT, Op0));
30288 return true;
30289 }
30290 }
30292 return false;
30293 }
30295 /// Do target-specific dag combines on SELECT and VSELECT nodes.
30296 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
30297 TargetLowering::DAGCombinerInfo &DCI,
30298 const X86Subtarget &Subtarget) {
30299 SDLoc DL(N);
30300 SDValue Cond = N->getOperand(0);
30301 // Get the LHS/RHS of the select.
30302 SDValue LHS = N->getOperand(1);
30303 SDValue RHS = N->getOperand(2);
30304 EVT VT = LHS.getValueType();
30305 EVT CondVT = Cond.getValueType();
30306 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30308 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
30309 // instructions match the semantics of the common C idiom x<y?x:y but not
30310 // x<=y?x:y, because of how they handle negative zero (which can be
30311 // ignored in unsafe-math mode).
30312 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
30313 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
30314 VT != MVT::f80 && VT != MVT::f128 &&
30315 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
30316 (Subtarget.hasSSE2() ||
30317 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
30318 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30320 unsigned Opcode = 0;
30321 // Check for x CC y ? x : y.
30322 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30323 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30324 switch (CC) {
30325 default: break;
30326 case ISD::SETULT:
30327 // Converting this to a min would handle NaNs incorrectly, and swapping
30328 // the operands would cause it to handle comparisons between positive
30329 // and negative zero incorrectly.
30330 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30331 if (!DAG.getTarget().Options.UnsafeFPMath &&
30332 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30334 std::swap(LHS, RHS);
30335 }
30336 Opcode = X86ISD::FMIN;
30337 break;
30338 case ISD::SETOLE:
30339 // Converting this to a min would handle comparisons between positive
30340 // and negative zero incorrectly.
30341 if (!DAG.getTarget().Options.UnsafeFPMath &&
30342 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30343 break;
30344 Opcode = X86ISD::FMIN;
30345 break;
30346 case ISD::SETULE:
30347 // Converting this to a min would handle both negative zeros and NaNs
30348 // incorrectly, but we can swap the operands to fix both.
30349 std::swap(LHS, RHS);
30350 LLVM_FALLTHROUGH;
30351 case ISD::SETOLT:
30352 case ISD::SETLT:
30353 case ISD::SETLE:
30354 Opcode = X86ISD::FMIN;
30355 break;
30357 case ISD::SETOGE:
30358 // Converting this to a max would handle comparisons between positive
30359 // and negative zero incorrectly.
30360 if (!DAG.getTarget().Options.UnsafeFPMath &&
30361 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30362 break;
30363 Opcode = X86ISD::FMAX;
30364 break;
30365 case ISD::SETUGT:
30366 // Converting this to a max would handle NaNs incorrectly, and swapping
30367 // the operands would cause it to handle comparisons between positive
30368 // and negative zero incorrectly.
30369 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30370 if (!DAG.getTarget().Options.UnsafeFPMath &&
30371 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30373 std::swap(LHS, RHS);
30374 }
30375 Opcode = X86ISD::FMAX;
30376 break;
30377 case ISD::SETUGE:
30378 // Converting this to a max would handle both negative zeros and NaNs
30379 // incorrectly, but we can swap the operands to fix both.
30380 std::swap(LHS, RHS);
30381 LLVM_FALLTHROUGH;
30382 case ISD::SETOGT:
30383 case ISD::SETGT:
30384 case ISD::SETGE:
30385 Opcode = X86ISD::FMAX;
30386 break;
30387 }
30388 // Check for x CC y ? y : x -- a min/max with reversed arms.
30389 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30390 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30391 switch (CC) {
30392 default: break;
30393 case ISD::SETOGE:
30394 // Converting this to a min would handle comparisons between positive
30395 // and negative zero incorrectly, and swapping the operands would
30396 // cause it to handle NaNs incorrectly.
30397 if (!DAG.getTarget().Options.UnsafeFPMath &&
30398 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30399 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30400 break;
30401 std::swap(LHS, RHS);
30402 }
30403 Opcode = X86ISD::FMIN;
30404 break;
30405 case ISD::SETUGT:
30406 // Converting this to a min would handle NaNs incorrectly.
30407 if (!DAG.getTarget().Options.UnsafeFPMath &&
30408 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30409 break;
30410 Opcode = X86ISD::FMIN;
30411 break;
30412 case ISD::SETUGE:
30413 // Converting this to a min would handle both negative zeros and NaNs
30414 // incorrectly, but we can swap the operands to fix both.
30415 std::swap(LHS, RHS);
30416 LLVM_FALLTHROUGH;
30417 case ISD::SETOGT:
30418 case ISD::SETGT:
30419 case ISD::SETGE:
30420 Opcode = X86ISD::FMIN;
30421 break;
30423 case ISD::SETULT:
30424 // Converting this to a max would handle NaNs incorrectly.
30425 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30426 break;
30427 Opcode = X86ISD::FMAX;
30428 break;
30429 case ISD::SETOLE:
30430 // Converting this to a max would handle comparisons between positive
30431 // and negative zero incorrectly, and swapping the operands would
30432 // cause it to handle NaNs incorrectly.
30433 if (!DAG.getTarget().Options.UnsafeFPMath &&
30434 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30435 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30436 break;
30437 std::swap(LHS, RHS);
30438 }
30439 Opcode = X86ISD::FMAX;
30440 break;
30441 case ISD::SETULE:
30442 // Converting this to a max would handle both negative zeros and NaNs
30443 // incorrectly, but we can swap the operands to fix both.
30444 std::swap(LHS, RHS);
30445 LLVM_FALLTHROUGH;
30446 case ISD::SETOLT:
30447 case ISD::SETLT:
30448 case ISD::SETLE:
30449 Opcode = X86ISD::FMAX;
30450 break;
30451 }
30452 }
30454 if (Opcode)
30455 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30456 }
30458 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30459 // lowering on KNL. In this case we convert it to
30460 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
30461 // The same applies to all 128- and 256-bit vectors of i8 and i16.
30462 // As of SKX these selects have a proper lowering.
30463 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30464 CondVT.getVectorElementType() == MVT::i1 &&
30465 (VT.is128BitVector() || VT.is256BitVector()) &&
30466 (VT.getVectorElementType() == MVT::i8 ||
30467 VT.getVectorElementType() == MVT::i16) &&
30468 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30469 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30470 DCI.AddToWorklist(Cond.getNode());
30471 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30472 }
30474 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30475 return V;
30477 // Canonicalize max and min:
30478 // (x > y) ? x : y -> (x >= y) ? x : y
30479 // (x < y) ? x : y -> (x <= y) ? x : y
30480 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30481 // the need for an extra compare
30482 // against zero. e.g.
30483 // (x - y) > 0 : (x - y) ? 0 -> (x - y) >= 0 : (x - y) ? 0
30484 // subl   %esi, %edi
30485 // testl  %edi, %edi
30486 // movl   $0, %eax
30487 // cmovgl %edi, %eax
30488 // =>
30489 // xorl   %eax, %eax
30490 // subl   %esi, %edi
30491 // cmovsl %eax, %edi
30492 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30493 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30494 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30495 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30496 switch (CC) {
30497 default: break;
30498 case ISD::SETLT:
30499 case ISD::SETGT: {
30500 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30501 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30502 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30503 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30504 }
30505 }
30506 }
30508 // Early exit check
30509 if (!TLI.isTypeLegal(VT))
30510 return SDValue();
30512 // Match VSELECTs into subs with unsigned saturation.
30513 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30514 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30515 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30516 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30517 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30519 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30520 // left side invert the predicate to simplify logic below.
30521 SDValue Other;
30522 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30523 Other = RHS;
30524 CC = ISD::getSetCCInverse(CC, true);
30525 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30526 Other = LHS;
30527 }
30529 if (Other.getNode() && Other->getNumOperands() == 2 &&
30530 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30531 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30532 SDValue CondRHS = Cond->getOperand(1);
30534 // Look for a general sub with unsigned saturation first.
30535 // x >= y ? x-y : 0 --> subus x, y
30536 // x > y ? x-y : 0 --> subus x, y
30537 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30538 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30539 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30541 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30542 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30543 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30544 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30545 // If the RHS is a constant we have to reverse the const
30546 // canonicalization.
30547 // x > C-1 ? x+-C : 0 --> subus x, C
30548 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30549 CondRHSConst->getAPIntValue() ==
30550 (-OpRHSConst->getAPIntValue() - 1))
30551 return DAG.getNode(
30552 X86ISD::SUBUS, DL, VT, OpLHS,
30553 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30555 // Another special case: If C was a sign bit, the sub has been
30556 // canonicalized into a xor.
30557 // FIXME: Would it be better to use computeKnownBits to determine
30558 // whether it's safe to decanonicalize the xor?
30559 // x s< 0 ? x^C : 0 --> subus x, C
30560 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30561 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30562 OpRHSConst->getAPIntValue().isSignMask())
30563 // Note that we have to rebuild the RHS constant here to ensure we
30564 // don't rely on particular values of undef lanes.
30565 return DAG.getNode(
30566 X86ISD::SUBUS, DL, VT, OpLHS,
30567 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30568 }
30569 }
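// Illustrative example (not from the original source): for v16i8 and C = 32,
//   (vselect (setcc x, 31, setugt), (add x, -32), zero)
// satisfies CondRHS == -OpRHS - 1, so it is rebuilt as
//   (X86ISD::SUBUS x, 32)        // a single PSUBUSB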
30572 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30573 return V;
30575 // If this is a *dynamic* select (non-constant condition) and we can match
30576 // this node with one of the variable blend instructions, restructure the
30577 // condition so that blends can use the high (sign) bit of each element and
30578 // use SimplifyDemandedBits to simplify the condition operand.
30579 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30580 !DCI.isBeforeLegalize() &&
30581 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30582 unsigned BitWidth = Cond.getScalarValueSizeInBits();
30583 KnownBits Known;
30584 // Don't optimize vector selects that map to mask-registers.
30585 if (BitWidth == 1)
30586 return SDValue();
30588 // We can only handle the cases where VSELECT is directly legal on the
30589 // subtarget. We custom lower VSELECT nodes with constant conditions and
30590 // this makes it hard to see whether a dynamic VSELECT will correctly
30591 // lower, so we both check the operation's status and explicitly handle the
30592 // cases where a *dynamic* blend will fail even though a constant-condition
30593 // blend could be custom lowered.
30594 // FIXME: We should find a better way to handle this class of problems.
30595 // Potentially, we should combine constant-condition vselect nodes
30596 // pre-legalization into shuffles and not mark as many types as custom
30598 if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
30599 return SDValue();
30600 // FIXME: We don't support i16-element blends currently. We could and
30601 // should support them by making *all* the bits in the condition be set
30602 // rather than just the high bit and using an i8-element blend.
30603 if (VT.getVectorElementType() == MVT::i16)
30605 // Dynamic blending was only available from SSE4.1 onward.
30606 if (VT.is128BitVector() && !Subtarget.hasSSE41())
30608 // Byte blends are only available in AVX2
30609 if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
30612 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
    APInt DemandedMask(APInt::getSignMask(BitWidth));
    KnownBits Known;
30615 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30616 !DCI.isBeforeLegalizeOps());
30617 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30618 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30619 // If we changed the computation somewhere in the DAG, this change will
30620 // affect all users of Cond. Make sure it is fine and update all the nodes
30621 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30622 // perform wrong optimizations as we messed with the actual expectation
30623 // for the vector boolean values.
30624 if (Cond != TLO.Old) {
30625 // Check all uses of the condition operand to check whether it will be
30626 // consumed by non-BLEND instructions. Those may require that all bits
30627 // are set properly.
30628 for (SDNode *U : Cond->uses()) {
30629 // TODO: Add other opcodes eventually lowered into BLEND.
          if (U->getOpcode() != ISD::VSELECT)
            return SDValue();
        }
30634 // Update all users of the condition before committing the change, so
30635 // that the VSELECT optimizations that expect the correct vector boolean
30636 // value will not be triggered.
30637 for (SDNode *U : Cond->uses()) {
30638 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
                                   U->getValueType(0), Cond, U->getOperand(1),
                                   U->getOperand(2));
          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
        }
        DCI.CommitTargetLoweringOpt(TLO);
        return SDValue(N, 0);
      }
30646 // Only Cond (rather than other nodes in the computation chain) was
30647 // changed. Change the condition just for N to keep the opportunity to
30648 // optimize all other users their own way.
30649 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30650 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30655 // Look for vselects with LHS/RHS being bitcasted from an operation that
30656 // can be executed on another type. Push the bitcast to the inputs of
30657 // the operation. This exposes opportunities for using masking instructions.
30658 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30659 CondVT.getVectorElementType() == MVT::i1) {
30660 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30661 return SDValue(N, 0);
30662 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30663 return SDValue(N, 0);
/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30673 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30674 /// Note that this is only legal for some op/cc combinations.
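/// For illustration (a sketch, not taken from a test): in C-like terms,
///   if (__atomic_fetch_add(&x, 1, ...) < 0) ...
/// can branch on the flags produced by the LOCK ADD itself, because
/// (old < 0) == (old + 1 <= 0), i.e. COND_S on the old value becomes
/// COND_LE on the LADD result.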
30675 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30676 SelectionDAG &DAG) {
30677 // This combine only operates on CMP-like nodes.
30678 if (!(Cmp.getOpcode() == X86ISD::CMP ||
        (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
    return SDValue();
30682 // Can't replace the cmp if it has more uses than the one we're looking at.
30683 // FIXME: We would like to be able to handle this, but would need to make sure
30684 // all uses were updated.
  if (!Cmp.hasOneUse())
    return SDValue();
30688 // This only applies to variations of the common case:
30689 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30690 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30691 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30692 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
  // Overflow is checked for by using the proper condcodes (see below).
30695 // FIXME: We can generalize both constraints:
30696 // - XOR/OR/AND (if they were made to survive AtomicExpand)
  // - LHS != 1
  // if the result is compared.
30700 SDValue CmpLHS = Cmp.getOperand(0);
30701 SDValue CmpRHS = Cmp.getOperand(1);
  if (!CmpLHS.hasOneUse())
    return SDValue();
30706 auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
  if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
    return SDValue();
30710 const unsigned Opc = CmpLHS.getOpcode();
  if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
    return SDValue();
30715 SDValue OpRHS = CmpLHS.getOperand(2);
  auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
  if (!OpRHSC)
    return SDValue();
30720 APInt Addend = OpRHSC->getAPIntValue();
  if (Opc == ISD::ATOMIC_LOAD_SUB)
    Addend = -Addend;

  if (CC == X86::COND_S && Addend == 1)
    CC = X86::COND_LE;
  else if (CC == X86::COND_NS && Addend == 1)
    CC = X86::COND_G;
  else if (CC == X86::COND_G && Addend == -1)
    CC = X86::COND_GE;
  else if (CC == X86::COND_LE && Addend == -1)
    CC = X86::COND_L;
  else
    return SDValue();
30735 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30736 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30737 DAG.getUNDEF(CmpLHS.getValueType()));
  DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
  return LockOp;
}
30742 // Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and proper condition
// code.
30746 // Simplify the following patterns:
30747 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30748 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30749 // to (Op EFLAGS Cond)
30751 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30752 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30753 // to (Op EFLAGS !Cond)
30755 // where Op could be BRCOND or CMOV.
30757 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30758 // This combine only operates on CMP-like nodes.
30759 if (!(Cmp.getOpcode() == X86ISD::CMP ||
30760 (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
30763 // Quit if not used as a boolean value.
30764 if (CC != X86::COND_E && CC != X86::COND_NE)
30767 // Check CMP operands. One of them should be 0 or 1 and the other should be
  // a SetCC or a value extended from it.
30769 SDValue Op1 = Cmp.getOperand(0);
30770 SDValue Op2 = Cmp.getOperand(1);
  SDValue SetCC;
  const ConstantSDNode *C = nullptr;
30774 bool needOppositeCond = (CC == X86::COND_E);
30775 bool checkAgainstTrue = false; // Is it a comparison against 1?
  if ((C = dyn_cast<ConstantSDNode>(Op1)))
    SetCC = Op2;
  else if ((C = dyn_cast<ConstantSDNode>(Op2)))
    SetCC = Op1;
  else // Quit if all operands are not constants.
    return SDValue();
30784 if (C->getZExtValue() == 1) {
30785 needOppositeCond = !needOppositeCond;
30786 checkAgainstTrue = true;
  } else if (C->getZExtValue() != 0)
    // Quit if the constant is neither 0 nor 1.
    return SDValue();
30791 bool truncatedToBoolWithAnd = false;
30792 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30793 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30794 SetCC.getOpcode() == ISD::TRUNCATE ||
30795 SetCC.getOpcode() == ISD::AND) {
    if (SetCC.getOpcode() == ISD::AND) {
      int OpIdx = -1;
      if (isOneConstant(SetCC.getOperand(0)))
        OpIdx = 1;
      if (isOneConstant(SetCC.getOperand(1)))
        OpIdx = 0;
      if (OpIdx < 0)
        break;
      SetCC = SetCC.getOperand(OpIdx);
      truncatedToBoolWithAnd = true;
    } else
      SetCC = SetCC.getOperand(0);
  }
30810 switch (SetCC.getOpcode()) {
30811 case X86ISD::SETCC_CARRY:
30812 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30813 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30814 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30815 // truncated to i1 using 'and'.
    if (checkAgainstTrue && !truncatedToBoolWithAnd)
      break;
30818 assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
           "Invalid use of SETCC_CARRY!");
    LLVM_FALLTHROUGH;
30821 case X86ISD::SETCC:
30822 // Set the condition code or opposite one if necessary.
30823 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30824 if (needOppositeCond)
30825 CC = X86::GetOppositeBranchCondition(CC);
30826 return SetCC.getOperand(1);
30827 case X86ISD::CMOV: {
30828 // Check whether false/true value has canonical one, i.e. 0 or 1.
30829 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30830 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
    // Quit if true value is not a constant.
    if (!TVal)
      return SDValue();
    // Quit if false value is not a constant.
    if (!FVal) {
      SDValue Op = SetCC.getOperand(0);
30837 // Skip 'zext' or 'trunc' node.
30838 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30839 Op.getOpcode() == ISD::TRUNCATE)
30840 Op = Op.getOperand(0);
      // A special case for rdrand/rdseed, where 0 is set if false cond is
      // found.
30843 if ((Op.getOpcode() != X86ISD::RDRAND &&
           Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
        return SDValue();
    }
30847 // Quit if false value is not the constant 0 or 1.
30848 bool FValIsFalse = true;
30849 if (FVal && FVal->getZExtValue() != 0) {
      if (FVal->getZExtValue() != 1)
        return SDValue();
30852 // If FVal is 1, opposite cond is needed.
30853 needOppositeCond = !needOppositeCond;
30854 FValIsFalse = false;
30856 // Quit if TVal is not the constant opposite of FVal.
    if (FValIsFalse && TVal->getZExtValue() != 1)
      return SDValue();
    if (!FValIsFalse && TVal->getZExtValue() != 0)
      return SDValue();
30861 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30862 if (needOppositeCond)
30863 CC = X86::GetOppositeBranchCondition(CC);
    return SetCC.getOperand(3);
  }
  }

  return SDValue();
}
30871 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
/// Match:
///   (X86or (X86setcc) (X86setcc))
30874 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30875 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
                                           X86::CondCode &CC1, SDValue &Flags,
                                           bool &isAnd) {
30878 if (Cond->getOpcode() == X86ISD::CMP) {
    if (!isNullConstant(Cond->getOperand(1)))
      return false;

    Cond = Cond->getOperand(0);
  }

  isAnd = false;
30887 SDValue SetCC0, SetCC1;
30888 switch (Cond->getOpcode()) {
  default: return false;
  case ISD::AND:
  case X86ISD::AND:
    isAnd = true;
    LLVM_FALLTHROUGH;
  case ISD::OR:
  case X86ISD::OR:
    SetCC0 = Cond->getOperand(0);
    SetCC1 = Cond->getOperand(1);
    break;
  }
30901 // Make sure we have SETCC nodes, using the same flags value.
30902 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30903 SetCC1.getOpcode() != X86ISD::SETCC ||
      SetCC0->getOperand(1) != SetCC1->getOperand(1))
    return false;
30907 CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
30908 CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
  Flags = SetCC0->getOperand(1);
  return true;
}
30913 /// Optimize an EFLAGS definition used according to the condition code \p CC
30914 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30915 /// uses of chain values.
30916 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30917 SelectionDAG &DAG) {
  if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
    return R;

  return combineSetCCAtomicArith(EFLAGS, CC, DAG);
}
30923 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30924 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30925 TargetLowering::DAGCombinerInfo &DCI,
                           const X86Subtarget &Subtarget) {
  SDLoc DL(N);
30929 // If the flag operand isn't dead, don't touch this CMOV.
  if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
    return SDValue();
30933 SDValue FalseOp = N->getOperand(0);
30934 SDValue TrueOp = N->getOperand(1);
30935 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
30936 SDValue Cond = N->getOperand(3);
30938 if (CC == X86::COND_E || CC == X86::COND_NE) {
    switch (Cond.getOpcode()) {
    default: break;
    case X86ISD::BSR:
    case X86ISD::BSF:
      // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
      if (DAG.isKnownNeverZero(Cond.getOperand(0)))
        return (CC == X86::COND_E) ? FalseOp : TrueOp;
    }
  }
30949 // Try to simplify the EFLAGS and condition code operands.
30950 // We can't always do this as FCMOV only supports a subset of X86 cond.
30951 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
30952 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
      SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
                       Flags};
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
    }
  }
30959 // If this is a select between two integer constants, try to do some
  // optimizations. Note that the operands are ordered the opposite of SELECT
  // operands.
30962 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
30963 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
30964 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
30965 // larger than FalseC (the false value).
30966 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
30967 CC = X86::GetOppositeBranchCondition(CC);
30968 std::swap(TrueC, FalseC);
        std::swap(TrueOp, FalseOp);
      }
30972 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
      // This is efficient for any integer data type (including i8/i16) and
      // shift amount.
30975 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30976 Cond = getSETCC(CC, Cond, DL, DAG);
30978 // Zero extend the condition if needed.
30979 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
30981 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30982 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
30983 DAG.getConstant(ShAmt, DL, MVT::i8));
30984 if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }
      // Optimize Cond ? cst+1 : cst -> zext(setcc(C)) + cst. This is efficient
      // for any integer data type, including i8/i16.
30991 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
30992 Cond = getSETCC(CC, Cond, DL, DAG);
30994 // Zero extend the condition if needed.
30995 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
30996 FalseC->getValueType(0), Cond);
30997 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30998 SDValue(FalseC, 0));
31000 if (N->getNumValues() == 2) // Dead flag value?
          return DCI.CombineTo(N, Cond, SDValue());
        return Cond;
      }
31005 // Optimize cases that will turn into an LEA instruction. This requires
31006 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31007 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31008 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31009 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
        bool isFastMultiplier = false;
        if (Diff < 10) {
          switch ((unsigned char)Diff) {
          default: break;
          case 1:  // result = add base, cond
          case 2:  // result = lea base(    , cond*2)
          case 3:  // result = lea base(cond, cond*2)
          case 4:  // result = lea base(    , cond*4)
          case 5:  // result = lea base(cond, cond*4)
          case 8:  // result = lea base(    , cond*8)
          case 9:  // result = lea base(cond, cond*8)
            isFastMultiplier = true;
            break;
          }
        }
31027 if (isFastMultiplier) {
31028 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
          Cond = getSETCC(CC, Cond, DL, DAG);
31030 // Zero extend the condition if needed.
          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
                             Cond);
          // Scale the condition by the difference.
          if (Diff != 1)
            Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
                               DAG.getConstant(Diff, DL, Cond.getValueType()));
31038 // Add the base if non-zero.
31039 if (FalseC->getAPIntValue() != 0)
31040 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31041 SDValue(FalseC, 0));
31042 if (N->getNumValues() == 2) // Dead flag value?
            return DCI.CombineTo(N, Cond, SDValue());
          return Cond;
        }
      }
    }
  }
31050 // Handle these cases:
31051 // (select (x != c), e, c) -> select (x != c), e, x),
31052 // (select (x == c), c, e) -> select (x == c), x, e)
31053 // where the c is an integer constant, and the "select" is the combination
31054 // of CMOV and CMP.
  // The rationale for this change is that the conditional-move from a constant
  // needs two instructions, whereas a conditional-move from a register needs
  // only one instruction.
31060 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31061 // some instruction-combining opportunities. This opt needs to be
31062 // postponed as late as possible.
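  // For example (illustrative only): (select (x == 7), 7, y) can be rewritten
  // as (select (x == 7), x, y), since both arms are equal when the condition
  // holds; the CMOV then reads a register instead of a materialized constant.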
31064 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
    // The DCI.xxxx conditions are provided to postpone the optimization as
    // late as possible.
31068 ConstantSDNode *CmpAgainst = nullptr;
31069 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31070 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31071 !isa<ConstantSDNode>(Cond.getOperand(0))) {
31073 if (CC == X86::COND_NE &&
31074 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31075 CC = X86::GetOppositeBranchCondition(CC);
      std::swap(TrueOp, FalseOp);
    }
31079 if (CC == X86::COND_E &&
31080 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31081 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31082 DAG.getConstant(CC, DL, MVT::i8), Cond };
      return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
    }
  }
31088 // Fold and/or of setcc's to double CMOV:
31089 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31090 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
  // This combine lets us generate:
  //   cmovcc1 (jcc1 if we don't have CMOV)
  //   cmovcc2 (same regs)
  // instead of:
  //   setcc1
  //   setcc2
  //   and/or
  //   cmovne (jne if we don't have CMOV)
  // When we can't use the CMOV instruction, it might increase branch
  // mispredicts.
  // When we can use CMOV, or when there is no mispredict, this improves
  // throughput and reduces register pressure.
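  // e.g. (x < y || x == y) ? a : b, where both setccs read the flags of the
  // same cmp(x, y), can become cmovb + cmove on that single compare (an
  // illustrative sketch; the actual selection may differ).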
31105 if (CC == X86::COND_NE) {
    SDValue Flags;
    X86::CondCode CC0, CC1;
    bool isAndSetCC;
    if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
      if (isAndSetCC) {
        std::swap(FalseOp, TrueOp);
31112 CC0 = X86::GetOppositeBranchCondition(CC0);
        CC1 = X86::GetOppositeBranchCondition(CC1);
      }
31116 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31118 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31119 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31120 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
      return CMOV;
    }
  }

  return SDValue();
}
31129 /// Different mul shrinking modes.
31130 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
31132 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31133 EVT VT = N->getOperand(0).getValueType();
  if (VT.getScalarSizeInBits() != 32)
    return false;
31137 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31138 unsigned SignBits[2] = {1, 1};
31139 bool IsPositive[2] = {false, false};
31140 for (unsigned i = 0; i < 2; i++) {
31141 SDValue Opd = N->getOperand(i);
    // DAG.ComputeNumSignBits returns 1 for ISD::ANY_EXTEND, so we need to
    // compute signbits for it separately.
    if (Opd.getOpcode() == ISD::ANY_EXTEND) {
      // For anyextend, it is safe to assume an appropriate number of leading
      // sign/zero bits.
      if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
        SignBits[i] = 25;
      else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
               MVT::i16)
        SignBits[i] = 17;
      else
        return false;
      IsPositive[i] = true;
31156 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31157 // All the operands of BUILD_VECTOR need to be int constant.
31158 // Find the smallest value range which all the operands belong to.
      SignBits[i] = 32;
      IsPositive[i] = true;
      for (const SDValue &SubOp : Opd.getNode()->op_values()) {
        if (SubOp.isUndef())
          continue;
        auto *CN = dyn_cast<ConstantSDNode>(SubOp);
        if (!CN)
          return false;
        APInt IntVal = CN->getAPIntValue();
        if (IntVal.isNegative())
          IsPositive[i] = false;
        SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
      }
    } else {
      SignBits[i] = DAG.ComputeNumSignBits(Opd);
31174 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
        IsPositive[i] = true;
    }
  }
31179 bool AllPositive = IsPositive[0] && IsPositive[1];
31180 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
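  // A note on the thresholds below (added for clarity): for an i32 element,
  // at least 32 - 8 + 1 = 25 sign bits means the value fits in i8, and at
  // least 32 - 16 + 1 = 17 sign bits means it fits in i16. The unsigned
  // variants additionally require non-negative operands, where 24 (resp. 16)
  // leading zero/sign bits bound the value to [0, 255] (resp. [0, 65535]).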
  // When ranges are from -128 ~ 127, use MULS8 mode.
  if (MinSignBits >= 25)
    Mode = MULS8;
  // When ranges are from 0 ~ 255, use MULU8 mode.
  else if (AllPositive && MinSignBits >= 24)
    Mode = MULU8;
  // When ranges are from -32768 ~ 32767, use MULS16 mode.
  else if (MinSignBits >= 17)
    Mode = MULS16;
  // When ranges are from 0 ~ 65535, use MULU16 mode.
  else if (AllPositive && MinSignBits >= 16)
    Mode = MULU16;
  else
    return false;
  return true;
}
31198 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
31200 /// efficient code. Two typical patterns are handled:
31202 /// %2 = sext/zext <N x i8> %1 to <N x i32>
31203 /// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31205 /// %5 = mul <N x i32> %2, %4
31208 /// %2 = zext/sext <N x i16> %1 to <N x i32>
31209 /// %4 = zext/sext <N x i16> %3 to <N x i32>
31210 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31211 /// %5 = mul <N x i32> %2, %4
31213 /// There are four mul shrinking modes:
31214 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31215 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
31216 /// generate pmullw+sext32 for it (MULS8 mode).
31217 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31218 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31219 /// generate pmullw+zext32 for it (MULU8 mode).
31220 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31221 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31222 /// generate pmullw+pmulhw for it (MULS16 mode).
31223 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31224 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31225 /// generate pmullw+pmulhuw for it (MULU16 mode).
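/// As an illustrative sketch of the MULU8 case: if %2 and %4 are zext'ed from
/// <8 x i8>, every 32-bit product fits in 16 bits (255 * 255 = 65025), so the
/// mul can be done as one <8 x i16> pmullw on the truncated operands followed
/// by a zero extension back to <8 x i32>.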
31226 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31227 const X86Subtarget &Subtarget) {
31228 // Check for legality
  // pmullw/pmulhw require SSE2 (they are not available with SSE1 alone).
  if (!Subtarget.hasSSE2())
    return SDValue();
31233 // Check for profitability
31234 // pmulld is supported since SSE41. It is better to use pmulld
  // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
  // the expanded version.
31237 bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
  if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
    return SDValue();

  ShrinkMode Mode;
  if (!canReduceVMulWidth(N, DAG, Mode))
    return SDValue();

  SDLoc DL(N);
  SDValue N0 = N->getOperand(0);
31247 SDValue N1 = N->getOperand(1);
31248 EVT VT = N->getOperand(0).getValueType();
31249 unsigned RegSize = 128;
31250 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
  EVT ReducedVT =
      EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31253 // Shrink the operands of mul.
31254 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31255 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31257 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31258 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31259 // lower part is needed.
31260 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31261 if (Mode == MULU8 || Mode == MULS8) {
      return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
                         DL, VT, MulLo);
    }
    MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31266 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31267 // the higher part is also needed.
31268 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31269 ReducedVT, NewN0, NewN1);
    // Repack the lower part and higher part result of mul into a wider
    // result.
    // Generate shuffle functioning as punpcklwd.
31274 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31275 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31276 ShuffleMask[2 * i] = i;
31277 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
    }
    SDValue ResLo =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31281 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31282 // Generate shuffle functioning as punpckhwd.
31283 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31284 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31285 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
    }
    SDValue ResHi =
        DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31289 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31290 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31293 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31294 // to legalize the mul explicitly because implicit legalization for type
31295 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31296 // instructions which will not exist when we explicitly legalize it by
31297 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31298 // <4 x i16> undef).
31300 // Legalize the operands of mul.
31301 // FIXME: We may be able to handle non-concatenated vectors by insertion.
31302 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
  if ((RegSize % ReducedSizeInBits) != 0)
    return SDValue();
31306 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31307 DAG.getUNDEF(ReducedVT));
  Ops[0] = NewN0;
  NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
  Ops[0] = NewN1;
  NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31313 if (Mode == MULU8 || Mode == MULS8) {
    // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
    // part is needed.
    SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
    // Convert the type of mul result to VT.
    MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
    SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
                                            : ISD::SIGN_EXTEND_VECTOR_INREG,
                              DL, ResVT, Mul);
    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
                       DAG.getIntPtrConstant(0, DL));
  }
31326 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31327 // MULU16/MULS16, both parts are needed.
31328 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31329 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31330 OpsVT, NewN0, NewN1);
31332 // Repack the lower part and higher part result of mul into a wider
31333 // result. Make sure the type of mul result is VT.
31334 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31335 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31336 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31337 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31338 DAG.getIntPtrConstant(0, DL));
31343 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31344 EVT VT, SDLoc DL) {
31346 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31347 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31348 DAG.getConstant(Mult, DL, VT));
31349 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31350 DAG.getConstant(Shift, DL, MVT::i8));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };
31356 auto combineMulMulAddOrSub = [&](bool isAdd) {
31357 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31358 DAG.getConstant(9, DL, VT));
31359 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
    Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
                         N->getOperand(0));
    return Result;
  };

  switch (MulAmt) {
  default:
    break;
  case 11:
    // mul x, 11 => add ((shl (mul x, 5), 1), x)
    return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
  case 21:
    // mul x, 21 => add ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
  case 22:
    // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
  case 19:
    // mul x, 19 => sub ((shl (mul x, 5), 2), x)
    return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
  case 13:
    // mul x, 13 => add ((shl (mul x, 3), 2), x)
    return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
  case 23:
    // mul x, 23 => sub ((shl (mul x, 3), 3), x)
    return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
  case 14:
    // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
  case 26:
    // mul x, 26 => sub ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ false);
  case 28:
    // mul x, 28 => add ((mul (mul x, 9), 3), x)
    return combineMulMulAddOrSub(/*isAdd*/ true);
  case 29:
    // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
    return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
                       combineMulMulAddOrSub(/*isAdd*/ true));
  case 30:
    // mul x, 30 => sub (sub ((shl x, 5), x), x)
    return DAG.getNode(
        ISD::SUB, DL, VT,
        DAG.getNode(ISD::SUB, DL, VT,
                    DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                                DAG.getConstant(5, DL, MVT::i8)),
                    N->getOperand(0)),
        N->getOperand(0));
  }
  return SDValue();
}
31414 /// Optimize a single multiply with constant into two operations in order to
31415 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
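/// e.g. mul x, 45 = (x * 9) * 5 can be built from two LEAs:
///   leal (%rdi,%rdi,8), %eax ; leal (%rax,%rax,4), %eax
/// (an illustrative sketch; register choices depend on allocation).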
31416 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31417 TargetLowering::DAGCombinerInfo &DCI,
31418 const X86Subtarget &Subtarget) {
31419 EVT VT = N->getValueType(0);
31420 if (DCI.isBeforeLegalize() && VT.isVector())
31421 return reduceVMULWidth(N, DAG, Subtarget);
  if (!MulConstantOptimization)
    return SDValue();

  // An imul is usually smaller than the alternative sequence.
  if (DAG.getMachineFunction().getFunction()->optForMinSize())
    return SDValue();

  if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
    return SDValue();

  if (VT != MVT::i64 && VT != MVT::i32)
    return SDValue();

  ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
  if (!C)
    return SDValue();
  uint64_t MulAmt = C->getZExtValue();
  if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
    return SDValue();

  SDLoc DL(N);
31442 uint64_t MulAmt1 = 0;
31443 uint64_t MulAmt2 = 0;
  if ((MulAmt % 9) == 0) {
    MulAmt1 = 9;
    MulAmt2 = MulAmt / 9;
  } else if ((MulAmt % 5) == 0) {
    MulAmt1 = 5;
    MulAmt2 = MulAmt / 5;
  } else if ((MulAmt % 3) == 0) {
    MulAmt1 = 3;
    MulAmt2 = MulAmt / 3;
  }

  SDValue NewMul;
  if (MulAmt2 &&
      (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)) {
    if (isPowerOf2_64(MulAmt2) &&
        !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
      // If the second multiplier is pow2, issue it first. We want the multiply
      // by 3, 5, or 9 to be folded into the addressing mode unless the lone
      // use is an add.
      std::swap(MulAmt1, MulAmt2);
31467 if (isPowerOf2_64(MulAmt1))
31468 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31469 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31471 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31472 DAG.getConstant(MulAmt1, DL, VT));
31474 if (isPowerOf2_64(MulAmt2))
31475 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31476 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31478 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31479 DAG.getConstant(MulAmt2, DL, VT));
31480 } else if (!Subtarget.slowLEA())
    NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);

  if (!NewMul) {
    assert(MulAmt != 0 &&
31485 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31486 "Both cases that could cause potential overflows should have "
31487 "already been handled.");
31488 int64_t SignMulAmt = C->getSExtValue();
31489 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31490 (SignMulAmt != -INT64_MAX)) {
31491 int NumSign = SignMulAmt > 0 ? 1 : -1;
31492 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31493 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31494 if (IsPowerOf2_64PlusOne) {
31495 // (mul x, 2^N + 1) => (add (shl x, N), x)
31496 NewMul = DAG.getNode(
31497 ISD::ADD, DL, VT, N->getOperand(0),
31498 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                      DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
                                      MVT::i8)));
31501 } else if (IsPowerOf2_64MinusOne) {
31502 // (mul x, 2^N - 1) => (sub (shl x, N), x)
        NewMul = DAG.getNode(
            ISD::SUB, DL, VT,
            DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                        DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
                                        MVT::i8)),
            N->getOperand(0));
      }
31510 // To negate, subtract the number from zero
31511 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
            DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
    }
  }

  if (NewMul)
    // Do not add new nodes to DAG combiner worklist.
    DCI.CombineTo(N, NewMul, false);

  return SDValue();
}
31524 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31525 SDValue N0 = N->getOperand(0);
31526 SDValue N1 = N->getOperand(1);
31527 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31528 EVT VT = N0.getValueType();
31530 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31531 // since the result of setcc_c is all zero's or all ones.
31532 if (VT.isInteger() && !VT.isVector() &&
31533 N1C && N0.getOpcode() == ISD::AND &&
31534 N0.getOperand(1).getOpcode() == ISD::Constant) {
31535 SDValue N00 = N0.getOperand(0);
31536 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31537 Mask <<= N1C->getAPIntValue();
31538 bool MaskOK = false;
31539 // We can handle cases concerning bit-widening nodes containing setcc_c if
    // we carefully interrogate the mask to make sure we are semantics
    // preserving.
31542 // The transform is not safe if the result of C1 << C2 exceeds the bitwidth
31543 // of the underlying setcc_c operation if the setcc_c was zero extended.
31544 // Consider the following example:
31545 // zext(setcc_c) -> i32 0x0000FFFF
31546 // c1 -> i32 0x0000FFFF
31547 // c2 -> i32 0x00000001
31548 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31549 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
    if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
      MaskOK = true;
    } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
                N00.getOpcode() == ISD::ANY_EXTEND) &&
               N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31558 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
    if (MaskOK && Mask != 0) {
      SDLoc DL(N);
31562 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31566 // Hardware support for vector shifts is sparse which makes us scalarize the
  // vector operations in many cases. Also, on sandybridge ADD is faster than
  // SHL.
31569 // (shl V, 1) -> add V,V
31570 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31571 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31572 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31573 // We shift all of the values by one. In many cases we do not have
      // hardware support for this operation. This is better expressed as an ADD
      // of two values.
31576 if (N1SplatC->getAPIntValue() == 1)
        return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
    }

  return SDValue();
}
31583 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31584 SDValue N0 = N->getOperand(0);
31585 SDValue N1 = N->getOperand(1);
31586 EVT VT = N0.getValueType();
31587 unsigned Size = VT.getSizeInBits();
  // fold (ashr (shl a, [56,48,32,24,16]), SarConst)
  // into (shl (sext a), [56,48,32,24,16] - SarConst) or
  // into (lshr (sext a), SarConst - [56,48,32,24,16])
31592 // depending on sign of (SarConst - [56,48,32,24,16])
31594 // sexts in X86 are MOVs. The MOVs have the same code size
31595 // as above SHIFTs (only SHIFT on 1 has lower code size).
31596 // However the MOVs have 2 advantages to a SHIFT:
31597 // 1. MOVs can write to a register that differs from source
31598 // 2. MOVs accept memory operands
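  //
  // e.g. (i32 (sra (shl x, 24), 25)) becomes (i32 (sra (sext_inreg x, i8), 1)),
  // i.e. movsbl + sarl $1 instead of shll $24 + sarl $25 (illustrative only).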
31600 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31601 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
      N0.getOperand(1).getOpcode() != ISD::Constant)
    return SDValue();
31605 SDValue N00 = N0.getOperand(0);
31606 SDValue N01 = N0.getOperand(1);
31607 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31608 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31609 EVT CVT = N1.getValueType();
  if (SarConst.isNegative())
    return SDValue();
31614 for (MVT SVT : MVT::integer_valuetypes()) {
31615 unsigned ShiftSize = SVT.getSizeInBits();
31616 // skipping types without corresponding sext/zext and
31617 // ShlConst that is not one of [56,48,32,24,16]
    if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
      continue;
    SDLoc DL(N);
    SDValue NN =
        DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
    SarConst = SarConst - (Size - ShiftSize);
    if (SarConst == 0)
      return NN;
    else if (SarConst.isNegative())
      return DAG.getNode(ISD::SHL, DL, VT, NN,
                         DAG.getConstant(-SarConst, DL, CVT));
    else
      return DAG.getNode(ISD::SRA, DL, VT, NN,
                         DAG.getConstant(SarConst, DL, CVT));
  }

  return SDValue();
}
31636 /// \brief Returns a vector of 0s if the node in input is a vector logical
31637 /// shift by a constant amount which is known to be bigger than or equal
31638 /// to the vector element size in bits.
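/// e.g. (srl <4 x i32> %x, <i32 32, i32 32, i32 32, i32 32>) folds to a zero
/// vector, mirroring the SSE2/AVX2 behavior of psrld with an out-of-range
/// count (an illustrative sketch).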
31639 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31640 const X86Subtarget &Subtarget) {
31641 EVT VT = N->getValueType(0);
31643 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31644 (!Subtarget.hasInt256() ||
       (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
    return SDValue();

  SDValue Amt = N->getOperand(1);
  SDLoc DL(N);
31650 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31651 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31652 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31653 unsigned MaxAmount =
31654 VT.getSimpleVT().getScalarSizeInBits();
31656 // SSE2/AVX2 logical shifts always return a vector of 0s
31657 // if the shift amount is bigger than or equal to
31658 // the element size. The constant shift amount will be
      // encoded as an 8-bit immediate.
31660 if (ShiftAmt.trunc(8).uge(MaxAmount))
        return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
    }

  return SDValue();
}
31667 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31668 TargetLowering::DAGCombinerInfo &DCI,
31669 const X86Subtarget &Subtarget) {
  if (N->getOpcode() == ISD::SHL)
    if (SDValue V = combineShiftLeft(N, DAG))
      return V;

  if (N->getOpcode() == ISD::SRA)
    if (SDValue V = combineShiftRightAlgebraic(N, DAG))
      return V;

  // Try to fold this logical shift into a zero vector.
  if (N->getOpcode() != ISD::SRA)
    if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
      return V;

  return SDValue();
}
31686 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31687 TargetLowering::DAGCombinerInfo &DCI,
31688 const X86Subtarget &Subtarget) {
31689 unsigned Opcode = N->getOpcode();
31690 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31691 X86ISD::VSRLI == Opcode) &&
31692 "Unexpected shift opcode");
31693 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31694 EVT VT = N->getValueType(0);
31695 SDValue N0 = N->getOperand(0);
31696 SDValue N1 = N->getOperand(1);
31697 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31698 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31699 "Unexpected value type");
31701 // Out of range logical bit shifts are guaranteed to be zero.
31702 // Out of range arithmetic bit shifts splat the sign bit.
31703 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
  if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
    if (LogicalShift)
      return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
    else
      ShiftVal = NumBitsPerElt - 1;
  }
  // Shift N0 by zero -> N0.
  if (!ShiftVal)
    return N0;
31715 // Shift zero -> zero.
31716 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31717 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31719 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31720 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31721 // TODO - support other sra opcodes as needed.
31722 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31723 N0.getOpcode() == X86ISD::VSRAI)
31724 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31726 // We can decode 'whole byte' logical bit shifts as shuffles.
  if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
    SDValue Op(N, 0);
31729 SmallVector<int, 1> NonceMask; // Just a placeholder.
31730 NonceMask.push_back(0);
31731 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
  // Constant Folding.
  APInt UndefElts;
  SmallVector<APInt, 32> EltBits;
31740 if (N->isOnlyUserOf(N0.getNode()) &&
31741 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31742 assert(EltBits.size() == VT.getVectorNumElements() &&
31743 "Unexpected shift value type");
31744 unsigned ShiftImm = ShiftVal.getZExtValue();
31745 for (APInt &Elt : EltBits) {
      if (X86ISD::VSHLI == Opcode)
        Elt <<= ShiftImm;
      else if (X86ISD::VSRAI == Opcode)
        Elt.ashrInPlace(ShiftImm);
      else
        Elt.lshrInPlace(ShiftImm);
    }
    return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
  }

  return SDValue();
}
31759 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31760 TargetLowering::DAGCombinerInfo &DCI,
31761 const X86Subtarget &Subtarget) {
  assert(
      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31764 (N->getOpcode() == X86ISD::PINSRW &&
31765 N->getValueType(0) == MVT::v8i16)) &&
31766 "Unexpected vector insertion");
  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
  SDValue Op(N, 0);
31770 SmallVector<int, 1> NonceMask; // Just a placeholder.
31771 NonceMask.push_back(0);
31772 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                DCI, Subtarget);
  return SDValue();
}
31778 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31779 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31780 /// OR -> CMPNEQSS.
31781 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31782 TargetLowering::DAGCombinerInfo &DCI,
31783 const X86Subtarget &Subtarget) {
  unsigned opcode;

  // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31787 // we're requiring SSE2 for both.
31788 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31789 SDValue N0 = N->getOperand(0);
31790 SDValue N1 = N->getOperand(1);
31791 SDValue CMP0 = N0->getOperand(1);
    SDValue CMP1 = N1->getOperand(1);
    SDLoc DL(N);
31795 // The SETCCs should both refer to the same CMP.
    if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
      return SDValue();
31799 SDValue CMP00 = CMP0->getOperand(0);
31800 SDValue CMP01 = CMP0->getOperand(1);
31801 EVT VT = CMP00.getValueType();
31803 if (VT == MVT::f32 || VT == MVT::f64) {
31804 bool ExpectingFlags = false;
31805 // Check for any users that want flags:
31806 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31807 !ExpectingFlags && UI != UE; ++UI)
        switch (UI->getOpcode()) {
        default:
        case ISD::BR_CC:
        case ISD::BRCOND:
        case ISD::SELECT:
          ExpectingFlags = true;
          break;
        case ISD::CopyToReg:
        case ISD::SIGN_EXTEND:
        case ISD::ZERO_EXTEND:
        case ISD::ANY_EXTEND:
          break;
        }
31822 if (!ExpectingFlags) {
31823 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31824 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31826 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
          X86::CondCode tmp = cc0;
          cc0 = cc1;
          cc1 = tmp;
        }
31832 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31833 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31834 // FIXME: need symbolic constants for these magic numbers.
31835 // See X86ATTInstPrinter.cpp:printSSECC().
31836 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31837 if (Subtarget.hasAVX512()) {
            SDValue FSetCC =
                DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
                            DAG.getConstant(x86cc, DL, MVT::i8));
            return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
                               FSetCC, DAG.getIntPtrConstant(0, DL));
          }
31844 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31845 CMP00.getValueType(), CMP00, CMP01,
                                              DAG.getConstant(x86cc, DL,
                                                              MVT::i8));
31849 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31850 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31852 if (is64BitFP && !Subtarget.is64Bit()) {
31853 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31854 // 64-bit integer, since that's not a legal type. Since
          // OnesOrZeroesF is all ones or all zeroes; we don't need all the
31856 // bits, but can do this little dance to extract the lowest 32 bits
31857 // and work with those going forward.
            SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
                                           OnesOrZeroesF);
31860 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31861 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
                                        Vector32, DAG.getIntPtrConstant(0, DL));
            IntVT = MVT::i32;
          }
31866 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31867 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31868 DAG.getConstant(1, DL, IntVT));
          SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
                                              ANDed);
          return OneBitOfTruth;
        }
      }
    }
  }
  return SDValue();
}
31879 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31880 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31881 assert(N->getOpcode() == ISD::AND);
31883 EVT VT = N->getValueType(0);
31884 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);

  if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
    return SDValue();
31891 if (N0.getOpcode() == ISD::XOR &&
31892 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31893 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31895 if (N1.getOpcode() == ISD::XOR &&
31896 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
    return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);

  return SDValue();
}
31902 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31903 // register. In most cases we actually compare or select YMM-sized registers
31904 // and mixing the two types creates horrible code. This method optimizes
31905 // some of the transition sequences.
31906 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31907 TargetLowering::DAGCombinerInfo &DCI,
31908 const X86Subtarget &Subtarget) {
31909 EVT VT = N->getValueType(0);
  if (!VT.is256BitVector())
    return SDValue();
31913 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31914 N->getOpcode() == ISD::ZERO_EXTEND ||
31915 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31917 SDValue Narrow = N->getOperand(0);
31918 EVT NarrowVT = Narrow->getValueType(0);
  if (!NarrowVT.is128BitVector())
    return SDValue();
31922 if (Narrow->getOpcode() != ISD::XOR &&
31923 Narrow->getOpcode() != ISD::AND &&
      Narrow->getOpcode() != ISD::OR)
    return SDValue();
31927 SDValue N0 = Narrow->getOperand(0);
  SDValue N1 = Narrow->getOperand(1);
  SDLoc DL(N);
31931 // The Left side has to be a trunc.
  if (N0.getOpcode() != ISD::TRUNCATE)
    return SDValue();
31935 // The type of the truncated inputs.
31936 EVT WideVT = N0->getOperand(0)->getValueType(0);
31940 // The right side has to be a 'trunc' or a constant vector.
31941 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
31942 ConstantSDNode *RHSConstSplat = nullptr;
31943 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
31944 RHSConstSplat = RHSBV->getConstantSplatNode();
  if (!RHSTrunc && !RHSConstSplat)
    return SDValue();
31948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
    return SDValue();
31953 // Set N0 and N1 to hold the inputs to the new wide operation.
31954 N0 = N0->getOperand(0);
31955 if (RHSConstSplat) {
31956 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
31957 SDValue(RHSConstSplat, 0));
31958 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
31959 } else if (RHSTrunc) {
    N1 = N1->getOperand(0);
  }
31963 // Generate the wide operation.
31964 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
  unsigned Opcode = N->getOpcode();
  switch (Opcode) {
  case ISD::ANY_EXTEND:
    return Op;
31969 case ISD::ZERO_EXTEND: {
31970 unsigned InBits = NarrowVT.getScalarSizeInBits();
31971 APInt Mask = APInt::getAllOnesValue(InBits);
31972 Mask = Mask.zext(VT.getScalarSizeInBits());
31973 return DAG.getNode(ISD::AND, DL, VT,
                       Op, DAG.getConstant(Mask, DL, VT));
  }
31976 case ISD::SIGN_EXTEND:
31977 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
31978 Op, DAG.getValueType(NarrowVT));
  default:
    llvm_unreachable("Unexpected opcode");
  }
}
31984 /// If both input operands of a logic op are being cast from floating point
31985 /// types, try to convert this into a floating point logic node to avoid
31986 /// unnecessary moves from SSE to integer registers.
31987 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
31988 const X86Subtarget &Subtarget) {
31989 unsigned FPOpcode = ISD::DELETED_NODE;
31990 if (N->getOpcode() == ISD::AND)
31991 FPOpcode = X86ISD::FAND;
31992 else if (N->getOpcode() == ISD::OR)
31993 FPOpcode = X86ISD::FOR;
31994 else if (N->getOpcode() == ISD::XOR)
31995 FPOpcode = X86ISD::FXOR;
31997 assert(FPOpcode != ISD::DELETED_NODE &&
31998 "Unexpected input node for FP logic conversion");
32000 EVT VT = N->getValueType(0);
32001 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
32004 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32005 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32006 (Subtarget.hasSSE2() && VT == MVT::i64))) {
32007 SDValue N00 = N0.getOperand(0);
32008 SDValue N10 = N1.getOperand(0);
32009 EVT N00Type = N00.getValueType();
32010 EVT N10Type = N10.getValueType();
32011 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32012 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
      return DAG.getBitcast(VT, FPLogic);
    }
  }
  return SDValue();
}
/// If this is a zero/all-bits result that is bitwise-anded with a low-bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
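/// e.g. for v4i32, (and X, splat(1)), where each element of X is known to be
/// all-ones or zero, can instead be (vsrli X, 31) -- a sketch of the intent,
/// subject to the legality checks below.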
32022 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32023 const X86Subtarget &Subtarget) {
32024 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32025 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32026 EVT VT0 = Op0.getValueType();
32027 EVT VT1 = Op1.getValueType();
  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
    return SDValue();

  APInt SplatVal;
  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
      !SplatVal.isMask())
    return SDValue();

  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
    return SDValue();
32040 unsigned EltBitWidth = VT0.getScalarSizeInBits();
  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
    return SDValue();

  SDLoc DL(N);
  unsigned ShiftVal = SplatVal.countTrailingOnes();
32046 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32047 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32048 return DAG.getBitcast(N->getValueType(0), Shift);
32051 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32052 TargetLowering::DAGCombinerInfo &DCI,
32053 const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
    return R;

  if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
    return FPLogic;

  if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
    return R;

  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
    return ShiftRight;
32069 EVT VT = N->getValueType(0);
32070 SDValue N0 = N->getOperand(0);
32071 SDValue N1 = N->getOperand(1);
32074 // Attempt to recursively combine a bitmask AND with shuffles.
  if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
    SDValue Op(N, 0);
32077 SmallVector<int, 1> NonceMask; // Just a placeholder.
32078 NonceMask.push_back(0);
32079 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                      /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                      DCI, Subtarget))
      return SDValue(); // This routine will use CombineTo to replace N.
  }
32085 // Create BEXTR instructions
32086 // BEXTR is ((X >> imm) & (2**size-1))
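  // e.g. (and (srl x, 4), 0xFFF) extracts 12 bits starting at bit 4, i.e.
  // BEXTR with control 4 | (12 << 8) (a sketch; see the checks below).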
  if (VT != MVT::i32 && VT != MVT::i64)
    return SDValue();

  if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
    return SDValue();
  if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
    return SDValue();
32095 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
32096 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32097 if (MaskNode && ShiftNode) {
32098 uint64_t Mask = MaskNode->getZExtValue();
32099 uint64_t Shift = ShiftNode->getZExtValue();
32100 if (isMask_64(Mask)) {
32101 uint64_t MaskSize = countPopulation(Mask);
32102 if (Shift + MaskSize <= VT.getSizeInBits())
32103 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                           DAG.getConstant(Shift | (MaskSize << 8), DL,
                                           VT));
    }
  }
  return SDValue();
}

// Try to fold:
32112 // (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
32115 // As a special case, try to fold:
32116 // (or (and (m, (sub 0, x)), (pandn m, x)))
// into:
//   (sub (xor X, M), M)
32119 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32120 const X86Subtarget &Subtarget) {
32121 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32123 SDValue N0 = N->getOperand(0);
32124 SDValue N1 = N->getOperand(1);
32125 EVT VT = N->getValueType(0);
32127 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
        (VT.is256BitVector() && Subtarget.hasInt256())))
    return SDValue();
32131 // Canonicalize AND to LHS.
  if (N1.getOpcode() == ISD::AND)
    std::swap(N0, N1);
32135 // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
32136 // ANDNP combine allows other combines to happen that prevent matching.
  if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
    return SDValue();
32140 SDValue Mask = N1.getOperand(0);
  SDValue X = N1.getOperand(1);
  SDValue Y;
32143 if (N0.getOperand(0) == Mask)
32144 Y = N0.getOperand(1);
32145 if (N0.getOperand(1) == Mask)
32146 Y = N0.getOperand(0);
  // Check to see if the mask appeared in both the AND and ANDNP.
  if (!Y.getNode())
    return SDValue();
32152 // Validate that X, Y, and Mask are bitcasts, and see through them.
32153 Mask = peekThroughBitcasts(Mask);
32154 X = peekThroughBitcasts(X);
32155 Y = peekThroughBitcasts(Y);
32157 EVT MaskVT = Mask.getValueType();
32158 unsigned EltBits = MaskVT.getScalarSizeInBits();
32160 // TODO: Attempt to handle floating point cases as well?
  if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
    return SDValue();

  // Try to match:
32167 // (or (and (M, (sub 0, X)), (pandn M, X)))
32168 // which is a special case of vselect:
32169 // (vselect M, (sub 0, X), X)
32171 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32172 // We know that, if fNegate is 0 or 1:
32173 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32175 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32176 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32177 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
32178 // This lets us transform our vselect to:
32179 // (add (xor X, M), (and M, 1))
32181 // (sub (xor X, M), M)
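  //
  // A quick sanity check of the identity on the two mask values: if M is
  // all-ones (i.e. -1), (X ^ M) - M == ~X + 1 == -X; if M == 0, it is X.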
32182 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32183 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32184 auto IsNegV = [](SDNode *N, SDValue V) {
32185 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
             ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
    };

    SDValue V;
    if (IsNegV(Y.getNode(), X))
      V = X;
    else if (IsNegV(X.getNode(), Y))
      V = Y;
    else
      return SDValue();
32195 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32196 SDValue SubOp2 = Mask;
32198 // If the negate was on the false side of the select, then
32199 // the operands of the SUB need to be swapped. PR 27251.
32200 // This is because the pattern being matched above is
32201 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
32202 // but if the pattern matched was
32203 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32204 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32205 // pattern also needs to be a negation of the replacement pattern above.
32206 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32207 // sub accomplishes the negation of the replacement pattern.
    if (V == Y)
      std::swap(SubOp1, SubOp2);
32211 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32212 return DAG.getBitcast(VT, Res);
32216 // PBLENDVB is only available on SSE 4.1.
  if (!Subtarget.hasSSE41())
    return SDValue();
32220 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32222 X = DAG.getBitcast(BlendVT, X);
32223 Y = DAG.getBitcast(BlendVT, Y);
32224 Mask = DAG.getBitcast(BlendVT, Mask);
32225 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32226 return DAG.getBitcast(VT, Mask);
32229 // Helper function for combineOrCmpEqZeroToCtlzSrl
// Transforms:
//   seteq(cmp x, 0)
// into:
//   srl(ctlz x), log2(bitsize(x))
32234 // Input pattern is checked by caller.
32235 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32236 SelectionDAG &DAG) {
32237 SDValue Cmp = Op.getOperand(1);
32238 EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
32241 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32242 // The result of the shift is true or false, and on X86, the 32-bit
32243 // encoding of shr and lzcnt is more desirable.
32244 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32245 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32246 DAG.getConstant(Log2b, dl, VT));
32247 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32250 // Try to transform:
32251 // zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
// srl(or(ctlz(x), ctlz(y)), log2(bitsize(x))
32254 // Will also attempt to match more generic cases, eg:
32255 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32256 // Only applies if the target supports the FastLZCNT feature.
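// For 32-bit x, lzcnt(x) is 32 (bit 5 set) exactly when x == 0, so
// (x == 0) | (y == 0) can be computed as (lzcnt(x) | lzcnt(y)) >> 5 --
// a sketch of the intent; see lowerX86CmpEqZeroToCtlzSrl above.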
32257 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32258 TargetLowering::DAGCombinerInfo &DCI,
32259 const X86Subtarget &Subtarget) {
32260 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32263 auto isORCandidate = [](SDValue N) {
32264 return (N->getOpcode() == ISD::OR && N->hasOneUse());
32267 // Check that the zero extend is extending to 32 bits or more. The code
32268 // generated by srl(ctlz) for 16-bit or narrower variants of the pattern
32269 // would require extra instructions to clear the upper bits.
32270 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32271 !isORCandidate(N->getOperand(0)))
32274 // Check the node matches: setcc(eq, cmp 0)
32275 auto isSetCCCandidate = [](SDValue N) {
32276 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32277 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32278 N->getOperand(1).getOpcode() == X86ISD::CMP &&
32279 isNullConstant(N->getOperand(1).getOperand(1)) &&
32280 N->getOperand(1).getValueType().bitsGE(MVT::i32);
32283 SDNode *OR = N->getOperand(0).getNode();
32284 SDValue LHS = OR->getOperand(0);
32285 SDValue RHS = OR->getOperand(1);
32287 // Save nodes matching or(or, setcc(eq, cmp 0)).
32288 SmallVector<SDNode *, 2> ORNodes;
32289 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32290 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32291 ORNodes.push_back(OR);
32292 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32293 LHS = OR->getOperand(0);
32294 RHS = OR->getOperand(1);
32297 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32298 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32299 !isORCandidate(SDValue(OR, 0)))
32302 // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
32303 // to
32304 // or(srl(ctlz),srl(ctlz)).
32305 // The dag combiner can then fold it into:
32306 // srl(or(ctlz, ctlz)).
32307 EVT VT = OR->getValueType(0);
32308 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32309 SDValue Ret, NewRHS;
32310 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32311 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32316 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32317 while (ORNodes.size() > 0) {
32318 OR = ORNodes.pop_back_val();
32319 LHS = OR->getOperand(0);
32320 RHS = OR->getOperand(1);
32321 // Swap rhs with lhs to match or(setcc(eq, cmp 0), or).
32322 if (RHS->getOpcode() == ISD::OR)
32323 std::swap(LHS, RHS);
32324 EVT VT = OR->getValueType(0);
32325 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32328 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32332 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32337 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32338 TargetLowering::DAGCombinerInfo &DCI,
32339 const X86Subtarget &Subtarget) {
32340 if (DCI.isBeforeLegalizeOps())
32343 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32346 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32349 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32352 SDValue N0 = N->getOperand(0);
32353 SDValue N1 = N->getOperand(1);
32354 EVT VT = N->getValueType(0);
32356 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32359 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32360 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32362 // SHLD/SHRD instructions have lower register pressure, but on some
32363 // platforms they have higher latency than the equivalent
32364 // series of shifts/or that would otherwise be generated.
32365 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32366 // have higher latencies and we are not optimizing for size.
32367 if (!OptForSize && Subtarget.isSHLDSlow())
32370 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32372 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32374 if (!N0.hasOneUse() || !N1.hasOneUse())
32377 SDValue ShAmt0 = N0.getOperand(1);
32378 if (ShAmt0.getValueType() != MVT::i8)
32380 SDValue ShAmt1 = N1.getOperand(1);
32381 if (ShAmt1.getValueType() != MVT::i8)
32383 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32384 ShAmt0 = ShAmt0.getOperand(0);
32385 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32386 ShAmt1 = ShAmt1.getOperand(0);
32389 unsigned Opc = X86ISD::SHLD;
32390 SDValue Op0 = N0.getOperand(0);
32391 SDValue Op1 = N1.getOperand(0);
32392 if (ShAmt0.getOpcode() == ISD::SUB ||
32393 ShAmt0.getOpcode() == ISD::XOR) {
32394 Opc = X86ISD::SHRD;
32395 std::swap(Op0, Op1);
32396 std::swap(ShAmt0, ShAmt1);
32399 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32400 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32401 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32402 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
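// For example, for i32 operands with C == 8, the first form is
//   (x << 8) | (y >> 24)
// which is exactly SHLD(x, y, 8): x shifted left by 8 with the top 8 bits
// of y shifted into the vacated low bits.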
32403 unsigned Bits = VT.getSizeInBits();
32404 if (ShAmt1.getOpcode() == ISD::SUB) {
32405 SDValue Sum = ShAmt1.getOperand(0);
32406 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32407 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32408 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32409 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32410 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32411 return DAG.getNode(Opc, DL, VT,
32413 DAG.getNode(ISD::TRUNCATE, DL,
32416 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32417 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32418 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32419 return DAG.getNode(Opc, DL, VT,
32420 N0.getOperand(0), N1.getOperand(0),
32421 DAG.getNode(ISD::TRUNCATE, DL,
32423 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32424 SDValue Mask = ShAmt1.getOperand(1);
32425 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32426 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32427 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32428 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32429 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32430 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32431 if (Op1.getOpcode() == InnerShift &&
32432 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32433 Op1.getConstantOperandVal(1) == 1) {
32434 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32435 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32437 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32438 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32439 Op1.getOperand(0) == Op1.getOperand(1)) {
32440 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32441 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32450 /// Generate NEG and CMOV for integer abs.
32451 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32452 EVT VT = N->getValueType(0);
32454 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32455 // 8-bit integer abs to NEG and CMOV.
32456 if (VT.isInteger() && VT.getSizeInBits() == 8)
32459 SDValue N0 = N->getOperand(0);
32460 SDValue N1 = N->getOperand(1);
32463 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32464 // and change it to SUB and CMOV.
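// For example, on i32 with X == -5: Y = sra(X, 31) == -1, so
// XOR(ADD(-5, -1), -1) == XOR(-6, -1) == 5 == abs(-5); for X >= 0,
// Y == 0 and the expression evaluates to X itself.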
32465 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32466 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32467 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32468 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32469 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32470 // Generate SUB & CMOV.
32471 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32472 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32473 SDValue Ops[] = {N0.getOperand(0), Neg,
32474 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32475 SDValue(Neg.getNode(), 1)};
32476 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32482 /// Try to turn tests against the signbit in the form of:
32483 /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
32484 /// into:
32485 /// SETGT(X, -1)
32486 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32487 // This is only worth doing if the output type is i8 or i1.
32488 EVT ResultType = N->getValueType(0);
32489 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32492 SDValue N0 = N->getOperand(0);
32493 SDValue N1 = N->getOperand(1);
32495 // We should be performing an xor against a truncated shift.
32496 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32499 // Make sure we are performing an xor against one.
32500 if (!isOneConstant(N1))
32503 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32504 SDValue Shift = N0.getOperand(0);
32505 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32508 // Make sure we are truncating from one of i16, i32 or i64.
32509 EVT ShiftTy = Shift.getValueType();
32510 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32513 // Make sure the shift amount extracts the sign bit.
32514 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32515 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32518 // Create a greater-than comparison against -1.
32519 // N.B. Using SETGE against 0 works but we want a canonical looking
32520 // comparison; using SETGT matches up with what TranslateX86CC does.
32522 SDValue ShiftOp = Shift.getOperand(0);
32523 EVT ShiftOpTy = ShiftOp.getValueType();
32524 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32525 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32526 *DAG.getContext(), ResultType);
32527 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32528 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32529 if (SetCCResultType != ResultType)
32530 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
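// For example, on i32: xor(trunc(srl(X, 31)), 1) is 1 exactly when the sign
// bit of X is clear, i.e. when X > -1, which is the setgt comparison built
// above.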
32534 /// Turn vector tests of the signbit in the form of:
32535 /// xor (sra X, elt_size(X)-1), -1
32536 /// into:
32537 /// pcmpgt X, -1
32538 ///
32539 /// This should be called before type legalization because the pattern may not
32540 /// persist after that.
32541 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32542 const X86Subtarget &Subtarget) {
32543 EVT VT = N->getValueType(0);
32544 if (!VT.isSimple())
32547 switch (VT.getSimpleVT().SimpleTy) {
32548 default: return SDValue();
32551 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32552 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32556 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32559 // There must be an arithmetic shift right before the xor, and the xor must
32560 // be a 'not' operation.
32561 SDValue Shift = N->getOperand(0);
32562 SDValue Ones = N->getOperand(1);
32563 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32564 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32567 // The shift should be smearing the sign bit across each vector element.
32568 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32572 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32573 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32574 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32577 // Create a greater-than comparison against -1. We don't use the more obvious
32578 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32579 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32582 /// Check if truncation with saturation from type \p SrcVT to \p DstVT
32583 /// is valid for the given \p Subtarget.
32584 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32585 const X86Subtarget &Subtarget) {
32586 if (!Subtarget.hasAVX512())
32589 // FIXME: Scalar type may be supported if we move it to vector register.
32590 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32593 EVT SrcElVT = SrcVT.getScalarType();
32594 EVT DstElVT = DstVT.getScalarType();
32595 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32597 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32599 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32600 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32604 /// Detect a pattern of truncation with saturation:
32605 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32606 /// Return the source value to be truncated or SDValue() if the pattern was not
32607 /// matched.
32608 static SDValue detectUSatPattern(SDValue In, EVT VT) {
32609 if (In.getOpcode() != ISD::UMIN)
32612 // Saturation with truncation. We truncate from InVT to VT.
32613 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32614 "Unexpected types for truncate operation");
32617 if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
32618 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according to
32619 // the element size of the destination type.
32620 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : SDValue();
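// For example, when truncating v16i16 to v16i8, the pattern
//   trunc(umin(x, splat(255)))
// matches because 255 is a mask of 8 bits (the i8 element width), and x is
// returned as the value to truncate with unsigned saturation.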
32626 /// Detect a pattern of truncation with saturation:
32627 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32628 /// The types should allow using the VPMOVUS* instructions on AVX512.
32629 /// Return the source value to be truncated or SDValue() if the pattern was not
32630 /// matched.
32631 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32632 const X86Subtarget &Subtarget) {
32633 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32635 return detectUSatPattern(In, VT);
32639 combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32640 const X86Subtarget &Subtarget) {
32641 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32642 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32644 if (auto USatVal = detectUSatPattern(In, VT))
32645 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32646 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32650 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
32651 /// which is c = (a + b + 1) / 2, and replaces this operation with the
32652 /// efficient X86ISD::AVG instruction.
32653 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32654 const X86Subtarget &Subtarget,
32656 if (!VT.isVector() || !VT.isSimple())
32658 EVT InVT = In.getValueType();
32659 unsigned NumElems = VT.getVectorNumElements();
32661 EVT ScalarVT = VT.getVectorElementType();
32662 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32663 isPowerOf2_32(NumElems)))
32666 // InScalarVT is the intermediate type in the AVG pattern, and it should be
32667 // wider than the original input type (i8/i16).
32668 EVT InScalarVT = InVT.getVectorElementType();
32669 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32672 if (!Subtarget.hasSSE2())
32674 if (Subtarget.hasBWI()) {
32675 if (VT.getSizeInBits() > 512)
32677 } else if (Subtarget.hasAVX2()) {
32678 if (VT.getSizeInBits() > 256)
32681 if (VT.getSizeInBits() > 128)
32685 // Detect the following pattern:
32687 // %1 = zext <N x i8> %a to <N x i32>
32688 // %2 = zext <N x i8> %b to <N x i32>
32689 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32690 // %4 = add nuw nsw <N x i32> %3, %2
32691 // %5 = lshr <N x i32> %4, <i32 1 x N>
32692 // %6 = trunc <N x i32> %5 to <N x i8>
32694 // In AVX512, the last instruction can also be a trunc store.
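// As a quick sanity check of the semantics: for i8 inputs a == 255 and
// b == 1, (zext(a) + zext(b) + 1) >> 1 == 257 >> 1 == 128, which is what
// PAVGB computes without ever overflowing the i8 domain.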
32696 if (In.getOpcode() != ISD::SRL)
32699 // A lambda checking that the given SDValue is a constant vector and that
32700 // each element is in the range [Min, Max].
32701 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32702 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32703 if (!BV || !BV->isConstant())
32705 for (SDValue Op : V->ops()) {
32706 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32709 uint64_t Val = C->getZExtValue();
32710 if (Val < Min || Val > Max)
32716 // Check if each element of the vector is right-shifted by one.
32717 auto LHS = In.getOperand(0);
32718 auto RHS = In.getOperand(1);
32719 if (!IsConstVectorInRange(RHS, 1, 1))
32721 if (LHS.getOpcode() != ISD::ADD)
32724 // Detect a pattern of a + b + 1 where the order doesn't matter.
32725 SDValue Operands[3];
32726 Operands[0] = LHS.getOperand(0);
32727 Operands[1] = LHS.getOperand(1);
32729 // Take care of the case when one of the operands is a constant vector whose
32730 // element is in the range [1, 256].
32731 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32732 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32733 Operands[0].getOperand(0).getValueType() == VT) {
32734 // The pattern is detected. Subtract one from the constant vector, then
32735 // demote it and emit X86ISD::AVG instruction.
32736 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32737 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32738 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32739 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32743 if (Operands[0].getOpcode() == ISD::ADD)
32744 std::swap(Operands[0], Operands[1]);
32745 else if (Operands[1].getOpcode() != ISD::ADD)
32747 Operands[2] = Operands[1].getOperand(0);
32748 Operands[1] = Operands[1].getOperand(1);
32750 // Now we have three operands of two additions. Check that one of them is a
32751 // constant vector with ones, and the other two are promoted from i8/i16.
32752 for (int i = 0; i < 3; ++i) {
32753 if (!IsConstVectorInRange(Operands[i], 1, 1))
32755 std::swap(Operands[i], Operands[2]);
32757 // Check if Operands[0] and Operands[1] are results of type promotion.
32758 for (int j = 0; j < 2; ++j)
32759 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32760 Operands[j].getOperand(0).getValueType() != VT)
32763 // The pattern is detected, emit X86ISD::AVG instruction.
32764 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32765 Operands[1].getOperand(0));
32771 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32772 TargetLowering::DAGCombinerInfo &DCI,
32773 const X86Subtarget &Subtarget) {
32774 LoadSDNode *Ld = cast<LoadSDNode>(N);
32775 EVT RegVT = Ld->getValueType(0);
32776 EVT MemVT = Ld->getMemoryVT();
32778 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32780 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32781 // into two 16-byte operations. Also split non-temporal aligned loads on
32782 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
32783 ISD::LoadExtType Ext = Ld->getExtensionType();
32785 unsigned AddressSpace = Ld->getAddressSpace();
32786 unsigned Alignment = Ld->getAlignment();
32787 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32788 Ext == ISD::NON_EXTLOAD &&
32789 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32790 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32791 AddressSpace, Alignment, &Fast) && !Fast))) {
32792 unsigned NumElems = RegVT.getVectorNumElements();
32796 SDValue Ptr = Ld->getBasePtr();
32798 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32801 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32802 Alignment, Ld->getMemOperand()->getFlags());
32804 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32806 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32807 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32808 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32810 Load2.getValue(1));
32812 SDValue NewVec = DAG.getUNDEF(RegVT);
32813 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32814 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32815 return DCI.CombineTo(N, NewVec, TF, true);
32821 /// If V is a build vector of boolean constants and exactly one of those
32822 /// constants is true, return the operand index of that true element.
32823 /// Otherwise, return -1.
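/// For example, a v4i1 build vector <0, 0, 1, 0> yields 2, while
/// <0, 1, 1, 0> and an all-zeros vector both yield -1.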
32824 static int getOneTrueElt(SDValue V) {
32825 // This needs to be a build vector of booleans.
32826 // TODO: Checking for the i1 type matches the IR definition for the mask,
32827 // but the mask check could be loosened to i8 or other types. That might
32828 // also require checking more than 'allOnesValue'; e.g., the x86 HW
32829 // instructions only require that the MSB is set for each mask element.
32830 // The ISD::MSTORE comments/definition do not specify how the mask operand
32832 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32833 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32836 int TrueIndex = -1;
32837 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32838 for (unsigned i = 0; i < NumElts; ++i) {
32839 const SDValue &Op = BV->getOperand(i);
32842 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32845 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32846 // If we already found a one, this is too many.
32847 if (TrueIndex >= 0)
32855 /// Given a masked memory load/store operation, return true if it has one mask
32856 /// bit set. If it has one mask bit set, then also return the memory address of
32857 /// the scalar element to load/store, the vector index to insert/extract that
32858 /// scalar element, and the alignment for the scalar memory access.
32859 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32860 SelectionDAG &DAG, SDValue &Addr,
32861 SDValue &Index, unsigned &Alignment) {
32862 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32863 if (TrueMaskElt < 0)
32866 // Get the address of the one scalar element that is specified by the mask
32867 // using the appropriate offset from the base pointer.
32868 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32869 Addr = MaskedOp->getBasePtr();
32870 if (TrueMaskElt != 0) {
32871 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32872 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32875 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32876 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32880 /// If exactly one element of the mask is set for a non-extending masked load,
32881 /// it can be replaced by a scalar load and a vector insert.
32882 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32883 /// mask have already been optimized in IR, so we don't bother with those here.
32885 reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32886 TargetLowering::DAGCombinerInfo &DCI) {
32887 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32888 // However, some target hooks may need to be added to know when the transform
32889 // is profitable. Endianness would also have to be considered.
32891 SDValue Addr, VecIndex;
32892 unsigned Alignment;
32893 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32896 // Load the one scalar element that is specified by the mask using the
32897 // appropriate offset from the base pointer.
32899 EVT VT = ML->getValueType(0);
32900 EVT EltVT = VT.getVectorElementType();
32902 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32903 Alignment, ML->getMemOperand()->getFlags());
32905 // Insert the loaded element into the appropriate place in the vector.
32906 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32908 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
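// For example, a v4f32 masked load with mask <0, 0, 1, 0> becomes a scalar
// f32 load from BasePtr + 8 followed by an INSERT_VECTOR_ELT into the
// pass-through value at index 2.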
32912 combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32913 TargetLowering::DAGCombinerInfo &DCI) {
32914 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32918 EVT VT = ML->getValueType(0);
32920 // If we are loading the first and last elements of a vector, it is safe and
32921 // always faster to load the whole vector. Replace the masked load with a
32922 // vector load and select.
32923 unsigned NumElts = VT.getVectorNumElements();
32924 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
32925 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
32926 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
32927 if (LoadFirstElt && LoadLastElt) {
32928 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32929 ML->getMemOperand());
32930 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
32931 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
32934 // Convert a masked load with a constant mask into a masked load and a select.
32935 // This allows the select operation to use a faster kind of select instruction
32936 // (for example, vblendvps -> vblendps).
32938 // Don't try this if the pass-through operand is already undefined. That would
32939 // cause an infinite loop because that's what we're about to create.
32940 if (ML->getSrc0().isUndef())
32943 // The new masked load has an undef pass-through operand. The select uses the
32944 // original pass-through operand.
32945 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
32946 ML->getMask(), DAG.getUNDEF(VT),
32947 ML->getMemoryVT(), ML->getMemOperand(),
32948 ML->getExtensionType());
32949 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
32951 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
32954 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
32955 TargetLowering::DAGCombinerInfo &DCI,
32956 const X86Subtarget &Subtarget) {
32957 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
32959 // TODO: An expanding load with a constant mask may be optimized as well.
32960 if (Mld->isExpandingLoad())
32963 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
32964 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
32966 // TODO: Do some AVX512 subsets benefit from this transform?
32967 if (!Subtarget.hasAVX512())
32968 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
32972 if (Mld->getExtensionType() != ISD::SEXTLOAD)
32975 // Resolve extending loads.
32976 EVT VT = Mld->getValueType(0);
32977 unsigned NumElems = VT.getVectorNumElements();
32978 EVT LdVT = Mld->getMemoryVT();
32981 assert(LdVT != VT && "Cannot extend to the same type");
32982 unsigned ToSz = VT.getScalarSizeInBits();
32983 unsigned FromSz = LdVT.getScalarSizeInBits();
32984 // From/To sizes and ElemCount must be pow of two.
32985 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
32986 "Unexpected size for extending masked load");
32988 unsigned SizeRatio = ToSz / FromSz;
32989 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
32991 // Create a type on which we perform the shuffle.
32992 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
32993 LdVT.getScalarType(), NumElems*SizeRatio);
32994 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
32996 // Convert Src0 value.
32997 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
32998 if (!Mld->getSrc0().isUndef()) {
32999 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33000 for (unsigned i = 0; i != NumElems; ++i)
33001 ShuffleVec[i] = i * SizeRatio;
33003 // Can't shuffle using an illegal type.
33004 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33005 "WideVecVT should be legal");
33006 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33007 DAG.getUNDEF(WideVecVT), ShuffleVec);
33009 // Prepare the new mask.
33011 SDValue Mask = Mld->getMask();
33012 if (Mask.getValueType() == VT) {
33013 // Mask and original value have the same type.
33014 NewMask = DAG.getBitcast(WideVecVT, Mask);
33015 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33016 for (unsigned i = 0; i != NumElems; ++i)
33017 ShuffleVec[i] = i * SizeRatio;
33018 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33019 ShuffleVec[i] = NumElems * SizeRatio;
33020 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33021 DAG.getConstant(0, dl, WideVecVT),
33024 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33025 unsigned WidenNumElts = NumElems*SizeRatio;
33026 unsigned MaskNumElts = VT.getVectorNumElements();
33027 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33030 unsigned NumConcat = WidenNumElts / MaskNumElts;
33031 SmallVector<SDValue, 16> Ops(NumConcat);
33032 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33034 for (unsigned i = 1; i != NumConcat; ++i)
33037 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33040 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33041 Mld->getBasePtr(), NewMask, WideSrc0,
33042 Mld->getMemoryVT(), Mld->getMemOperand(),
33044 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33045 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33048 /// If exactly one element of the mask is set for a non-truncating masked store,
33049 /// it can be replaced by a vector extract and a scalar store.
33050 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33051 /// mask have already been optimized in IR, so we don't bother with those here.
33052 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33053 SelectionDAG &DAG) {
33054 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33055 // However, some target hooks may need to be added to know when the transform
33056 // is profitable. Endianness would also have to be considered.
33058 SDValue Addr, VecIndex;
33059 unsigned Alignment;
33060 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33063 // Extract the one scalar element that is actually being stored.
33065 EVT VT = MS->getValue().getValueType();
33066 EVT EltVT = VT.getVectorElementType();
33067 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33068 MS->getValue(), VecIndex);
33070 // Store that element at the appropriate offset from the base pointer.
33071 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33072 Alignment, MS->getMemOperand()->getFlags());
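// For example, a v4i32 masked store with mask <0, 1, 0, 0> becomes an
// EXTRACT_VECTOR_ELT at index 1 followed by a plain 4-byte store to
// BasePtr + 4.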
33075 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33076 const X86Subtarget &Subtarget) {
33077 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
33079 if (Mst->isCompressingStore())
33082 if (!Mst->isTruncatingStore())
33083 return reduceMaskedStoreToScalarStore(Mst, DAG);
33085 // Resolve truncating stores.
33086 EVT VT = Mst->getValue().getValueType();
33087 unsigned NumElems = VT.getVectorNumElements();
33088 EVT StVT = Mst->getMemoryVT();
33091 assert(StVT != VT && "Cannot truncate to the same type");
33092 unsigned FromSz = VT.getScalarSizeInBits();
33093 unsigned ToSz = StVT.getScalarSizeInBits();
33095 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33097 // The truncating store is legal in some cases. For example,
33098 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
33099 // are provided by AVX512 as native truncating stores.
33100 // In those cases we don't need any further transformations.
33101 if (TLI.isTruncStoreLegal(VT, StVT))
33104 // From/To sizes and ElemCount must be pow of two.
33105 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33106 "Unexpected size for truncating masked store");
33107 // We are going to use the original vector elt for storing.
33108 // Accumulated smaller vector elements must be a multiple of the store size.
33109 assert (((NumElems * FromSz) % ToSz) == 0 &&
33110 "Unexpected ratio for truncating masked store");
33112 unsigned SizeRatio = FromSz / ToSz;
33113 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33115 // Create a type on which we perform the shuffle.
33116 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33117 StVT.getScalarType(), NumElems*SizeRatio);
33119 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33121 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
33122 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33123 for (unsigned i = 0; i != NumElems; ++i)
33124 ShuffleVec[i] = i * SizeRatio;
33126 // Can't shuffle using an illegal type.
33127 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33128 "WideVecVT should be legal");
33130 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33131 DAG.getUNDEF(WideVecVT),
33135 SDValue Mask = Mst->getMask();
33136 if (Mask.getValueType() == VT) {
33137 // Mask and original value have the same type.
33138 NewMask = DAG.getBitcast(WideVecVT, Mask);
33139 for (unsigned i = 0; i != NumElems; ++i)
33140 ShuffleVec[i] = i * SizeRatio;
33141 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
33142 ShuffleVec[i] = NumElems*SizeRatio;
33143 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33144 DAG.getConstant(0, dl, WideVecVT),
33147 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33148 unsigned WidenNumElts = NumElems*SizeRatio;
33149 unsigned MaskNumElts = VT.getVectorNumElements();
33150 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33153 unsigned NumConcat = WidenNumElts / MaskNumElts;
33154 SmallVector<SDValue, 16> Ops(NumConcat);
33155 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33157 for (unsigned i = 1; i != NumConcat; ++i)
33160 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33163 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
33164 Mst->getBasePtr(), NewMask, StVT,
33165 Mst->getMemOperand(), false);
33168 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33169 const X86Subtarget &Subtarget) {
33170 StoreSDNode *St = cast<StoreSDNode>(N);
33171 EVT VT = St->getValue().getValueType();
33172 EVT StVT = St->getMemoryVT();
33174 SDValue StoredVal = St->getOperand(1);
33175 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33177 // If we are saving a concatenation of two XMM registers and 32-byte stores
33178 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
33180 unsigned AddressSpace = St->getAddressSpace();
33181 unsigned Alignment = St->getAlignment();
33182 if (VT.is256BitVector() && StVT == VT &&
33183 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
33184 AddressSpace, Alignment, &Fast) &&
33186 unsigned NumElems = VT.getVectorNumElements();
33190 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33191 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33193 SDValue Ptr0 = St->getBasePtr();
33194 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33197 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33198 Alignment, St->getMemOperand()->getFlags());
33200 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33201 std::min(16U, Alignment), St->getMemOperand()->getFlags());
33202 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33205 // Optimize trunc store (of multiple scalars) to shuffle and store.
33206 // First, pack all of the elements in one place. Next, store to memory
33207 // in fewer chunks.
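// For example, a truncating store of v8i32 to v8i16 is rewritten as a
// shuffle that packs the sixteen i16 lanes of the bitcast source so the
// eight live values land in the low half, followed by stores of those 128
// bits in the widest legal integer chunks.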
33208 if (St->isTruncatingStore() && VT.isVector()) {
33209 // Check if we can detect an AVG pattern from the truncation. If yes,
33210 // replace the trunc store by a normal store with the result of X86ISD::AVG
33212 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33214 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33215 St->getPointerInfo(), St->getAlignment(),
33216 St->getMemOperand()->getFlags());
33219 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33220 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33221 dl, Val, St->getBasePtr(),
33222 St->getMemoryVT(), St->getMemOperand(), DAG);
33224 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33225 unsigned NumElems = VT.getVectorNumElements();
33226 assert(StVT != VT && "Cannot truncate to the same type");
33227 unsigned FromSz = VT.getScalarSizeInBits();
33228 unsigned ToSz = StVT.getScalarSizeInBits();
33230 // The truncating store is legal in some cases. For example,
33231 // vpmovqb, vpmovqw, vpmovqd, vpmovdb and vpmovdw
33232 // are provided by AVX512 as native truncating stores.
33233 // In those cases we don't need any further transformations.
33234 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33237 // From, To sizes and ElemCount must be pow of two
33238 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33239 // We are going to use the original vector elt for storing.
33240 // Accumulated smaller vector elements must be a multiple of the store size.
33241 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33243 unsigned SizeRatio = FromSz / ToSz;
33245 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33247 // Create a type on which we perform the shuffle
33248 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33249 StVT.getScalarType(), NumElems*SizeRatio);
33251 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33253 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33254 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33255 for (unsigned i = 0; i != NumElems; ++i)
33256 ShuffleVec[i] = i * SizeRatio;
33258 // Can't shuffle using an illegal type.
33259 if (!TLI.isTypeLegal(WideVecVT))
33262 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33263 DAG.getUNDEF(WideVecVT),
33265 // At this point all of the data is stored at the bottom of the
33266 // register. We now need to store it to memory.
33268 // Find the largest store unit
33269 MVT StoreType = MVT::i8;
33270 for (MVT Tp : MVT::integer_valuetypes()) {
33271 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33275 // On 32-bit systems, we can't store 64-bit integers directly. Try bitcasting to f64.
33276 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33277 (64 <= NumElems * ToSz))
33278 StoreType = MVT::f64;
33280 // Bitcast the original vector into a vector of store-size units
33281 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33282 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33283 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33284 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33285 SmallVector<SDValue, 8> Chains;
33286 SDValue Ptr = St->getBasePtr();
33288 // Perform one or more big stores into memory.
33289 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33290 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33291 StoreType, ShuffWide,
33292 DAG.getIntPtrConstant(i, dl));
33294 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33295 St->getAlignment(), St->getMemOperand()->getFlags());
33296 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33297 Chains.push_back(Ch);
33300 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33303 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
33304 // the FP state in cases where an emms may be missing.
33305 // A preferable solution to the general problem is to figure out the right
33306 // places to insert EMMS. This qualifies as a quick hack.
33308 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33309 if (VT.getSizeInBits() != 64)
33312 const Function *F = DAG.getMachineFunction().getFunction();
33313 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33315 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33316 if ((VT.isVector() ||
33317 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33318 isa<LoadSDNode>(St->getValue()) &&
33319 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33320 St->getChain().hasOneUse() && !St->isVolatile()) {
33321 SDNode* LdVal = St->getValue().getNode();
33322 LoadSDNode *Ld = nullptr;
33323 int TokenFactorIndex = -1;
33324 SmallVector<SDValue, 8> Ops;
33325 SDNode* ChainVal = St->getChain().getNode();
33326 // Must be a store of a load. We currently handle two cases: the load
33327 // is a direct child, or it's under an intervening TokenFactor. It is
33328 // possible to dig deeper under nested TokenFactors.
33329 if (ChainVal == LdVal)
33330 Ld = cast<LoadSDNode>(St->getChain());
33331 else if (St->getValue().hasOneUse() &&
33332 ChainVal->getOpcode() == ISD::TokenFactor) {
33333 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33334 if (ChainVal->getOperand(i).getNode() == LdVal) {
33335 TokenFactorIndex = i;
33336 Ld = cast<LoadSDNode>(St->getValue());
33338 Ops.push_back(ChainVal->getOperand(i));
33342 if (!Ld || !ISD::isNormalLoad(Ld))
33345 // If this is not the MMX case, i.e. we are just turning i64 load/store
33346 // into f64 load/store, avoid the transformation if there are multiple
33347 // uses of the loaded value.
33348 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33353 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33354 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33356 if (Subtarget.is64Bit() || F64IsLegal) {
33357 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33358 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33359 Ld->getPointerInfo(), Ld->getAlignment(),
33360 Ld->getMemOperand()->getFlags());
33361 SDValue NewChain = NewLd.getValue(1);
33362 if (TokenFactorIndex >= 0) {
33363 Ops.push_back(NewChain);
33364 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33366 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33367 St->getPointerInfo(), St->getAlignment(),
33368 St->getMemOperand()->getFlags());
33371 // Otherwise, lower to two pairs of 32-bit loads / stores.
33372 SDValue LoAddr = Ld->getBasePtr();
33373 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33375 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33376 Ld->getPointerInfo(), Ld->getAlignment(),
33377 Ld->getMemOperand()->getFlags());
33378 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33379 Ld->getPointerInfo().getWithOffset(4),
33380 MinAlign(Ld->getAlignment(), 4),
33381 Ld->getMemOperand()->getFlags());
33383 SDValue NewChain = LoLd.getValue(1);
33384 if (TokenFactorIndex >= 0) {
33385 Ops.push_back(LoLd);
33386 Ops.push_back(HiLd);
33387 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33390 LoAddr = St->getBasePtr();
33391 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33394 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33395 St->getAlignment(), St->getMemOperand()->getFlags());
33396 SDValue HiSt = DAG.getStore(
33397 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33398 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33399 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33402 // This is similar to the above case, but here we handle a scalar 64-bit
33403 // integer store that is extracted from a vector on a 32-bit target.
33404 // If we have SSE2, then we can treat it like a floating-point double
33405 // to get past legalization. The execution dependencies fixup pass will
33406 // choose the optimal machine instruction for the store if this really is
33407 // an integer or v2f32 rather than an f64.
33408 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
33409 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
33410 SDValue OldExtract = St->getOperand(1);
33411 SDValue ExtOp0 = OldExtract.getOperand(0);
33412 unsigned VecSize = ExtOp0.getValueSizeInBits();
33413 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33414 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33415 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33416 BitCast, OldExtract.getOperand(1));
33417 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33418 St->getPointerInfo(), St->getAlignment(),
33419 St->getMemOperand()->getFlags());
33425 /// Return 'true' if this vector operation is "horizontal"
33426 /// and return the operands for the horizontal operation in LHS and RHS. A
33427 /// horizontal operation performs the binary operation on successive elements
33428 /// of its first operand, then on successive elements of its second operand,
33429 /// returning the resulting values in a vector. For example, if
33430 /// A = < float a0, float a1, float a2, float a3 >
33432 /// B = < float b0, float b1, float b2, float b3 >
33433 /// then the result of doing a horizontal operation on A and B is
33434 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33435 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33436 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33437 /// set to A, RHS to B, and the routine returns 'true'.
33438 /// Note that the binary operation should have the property that if one of the
33439 /// operands is UNDEF then the result is UNDEF.
33440 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33441 // Look for the following pattern: if
33442 // A = < float a0, float a1, float a2, float a3 >
33443 // B = < float b0, float b1, float b2, float b3 >
33445 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33446 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33447 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33448 // which is A horizontal-op B.
33450 // At least one of the operands should be a vector shuffle.
33451 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33452 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33455 MVT VT = LHS.getSimpleValueType();
33457 assert((VT.is128BitVector() || VT.is256BitVector()) &&
33458 "Unsupported vector type for horizontal add/sub");
33460 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33461 // operate independently on 128-bit lanes.
33462 unsigned NumElts = VT.getVectorNumElements();
33463 unsigned NumLanes = VT.getSizeInBits()/128;
33464 unsigned NumLaneElts = NumElts / NumLanes;
33465 assert((NumLaneElts % 2 == 0) &&
33466 "Vector type should have an even number of elements in each lane");
33467 unsigned HalfLaneElts = NumLaneElts/2;
33469 // View LHS in the form
33470 // LHS = VECTOR_SHUFFLE A, B, LMask
33471 // If LHS is not a shuffle then pretend it is the shuffle
33472 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33473 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
33476 SmallVector<int, 16> LMask(NumElts);
33477 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33478 if (!LHS.getOperand(0).isUndef())
33479 A = LHS.getOperand(0);
33480 if (!LHS.getOperand(1).isUndef())
33481 B = LHS.getOperand(1);
33482 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33483 std::copy(Mask.begin(), Mask.end(), LMask.begin());
33485 if (!LHS.isUndef())
33487 for (unsigned i = 0; i != NumElts; ++i)
33491 // Likewise, view RHS in the form
33492 // RHS = VECTOR_SHUFFLE C, D, RMask
33494 SmallVector<int, 16> RMask(NumElts);
33495 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33496 if (!RHS.getOperand(0).isUndef())
33497 C = RHS.getOperand(0);
33498 if (!RHS.getOperand(1).isUndef())
33499 D = RHS.getOperand(1);
33500 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33501 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33503 if (!RHS.isUndef())
33505 for (unsigned i = 0; i != NumElts; ++i)
33509 // Check that the shuffles are both shuffling the same vectors.
33510 if (!(A == C && B == D) && !(A == D && B == C))
33513 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33514 if (!A.getNode() && !B.getNode())
33517 // If A and B occur in reverse order in RHS, then "swap" them (which means
33518 // rewriting the mask).
33520 ShuffleVectorSDNode::commuteMask(RMask);
33522 // At this point LHS and RHS are equivalent to
33523 // LHS = VECTOR_SHUFFLE A, B, LMask
33524 // RHS = VECTOR_SHUFFLE A, B, RMask
33525 // Check that the masks correspond to performing a horizontal operation.
33526 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33527 for (unsigned i = 0; i != NumLaneElts; ++i) {
33528 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33530 // Ignore any UNDEF components.
33531 if (LIdx < 0 || RIdx < 0 ||
33532 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33533 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33536 // Check that successive elements are being operated on. If not, this is
33537 // not a horizontal operation.
33538 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33539 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33540 if (!(LIdx == Index && RIdx == Index + 1) &&
33541 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33546 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33547 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33551 /// Do target-specific dag combines on floating-point adds/subs.
33552 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33553 const X86Subtarget &Subtarget) {
33554 EVT VT = N->getValueType(0);
33555 SDValue LHS = N->getOperand(0);
33556 SDValue RHS = N->getOperand(1);
33557 bool IsFadd = N->getOpcode() == ISD::FADD;
33558 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33560 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33561 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33562 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33563 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33564 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33565 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
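// For example, with v4f32 on SSE3: if LHS == shuffle(A, B, <0, 2, 4, 6>)
// and RHS == shuffle(A, B, <1, 3, 5, 7>), then fadd(LHS, RHS) computes
// <a0+a1, a2+a3, b0+b1, b2+b3>, which is a single HADDPS(A, B).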
33570 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33572 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33573 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
33574 const X86Subtarget &Subtarget,
33576 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33577 SDValue Src = N->getOperand(0);
33578 unsigned Opcode = Src.getOpcode();
33579 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33581 EVT VT = N->getValueType(0);
33582 EVT SrcVT = Src.getValueType();
33584 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33585 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33587 // Repeated operand, so we are only trading one output truncation for
33588 // one input truncation.
33592 // See if either operand has been extended from a smaller/equal size to
33593 // the truncation size, allowing a truncation to combine with the extend.
33594 unsigned Opcode0 = Op0.getOpcode();
33595 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33596 Opcode0 == ISD::ZERO_EXTEND) &&
33597 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33600 unsigned Opcode1 = Op1.getOpcode();
33601 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33602 Opcode1 == ISD::ZERO_EXTEND) &&
33603 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33606 // See if either operand is a single use constant which can be constant
33607 // folded.
33608 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33609 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33610 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33611 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33614 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33615 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33616 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33617 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
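// For example, trunc(mul(x, y)) from v4i64 to v4i32 becomes
// mul(trunc(x), trunc(y)) when the v4i32 multiply is legal, replacing an
// expensive 64-bit multiply with two truncates and a cheap 32-bit multiply.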
33620 // Don't combine if the operation has other uses.
33621 if (!N->isOnlyUserOf(Src.getNode()))
33624 // Only support vector truncation for now.
33625 // TODO: i64 scalar math would benefit as well.
33626 if (!VT.isVector())
33629 // In most cases it's only worth pre-truncating if we're facing the cost
33630 // of just one truncation,
33631 // i.e. if one of the inputs will constant fold or the input is repeated.
33636 SDValue Op0 = Src.getOperand(0);
33637 SDValue Op1 = Src.getOperand(1);
33638 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33639 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33640 return TruncateArithmetic(Op0, Op1);
33645 // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33646 // better to truncate if we have the chance.
33647 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33648 !TLI.isOperationLegal(Opcode, SrcVT))
33649 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33652 SDValue Op0 = Src.getOperand(0);
33653 SDValue Op1 = Src.getOperand(1);
33654 if (TLI.isOperationLegal(Opcode, VT) &&
33655 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33656 return TruncateArithmetic(Op0, Op1);
33664 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
33666 combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33667 SmallVector<SDValue, 8> &Regs) {
33668 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33669 Regs[0].getValueType() == MVT::v2i64));
33670 EVT OutVT = N->getValueType(0);
33671 EVT OutSVT = OutVT.getVectorElementType();
33672 EVT InVT = Regs[0].getValueType();
33673 EVT InSVT = InVT.getVectorElementType();
33676 // First, use a mask to unset all bits that won't appear in the result.
33677 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33678 "OutSVT can only be i8 or i16.");
33680 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33681 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33682 for (auto &Reg : Regs)
33683 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33685 MVT UnpackedVT, PackedVT;
33686 if (OutSVT == MVT::i8) {
33687 UnpackedVT = MVT::v8i16;
33688 PackedVT = MVT::v16i8;
33690 UnpackedVT = MVT::v4i32;
33691 PackedVT = MVT::v8i16;
33694 // In each iteration, narrow the element type to half its size.
33695 auto RegNum = Regs.size();
33696 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33697 j < e; j *= 2, RegNum /= 2) {
33698 for (unsigned i = 0; i < RegNum; i++)
33699 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33700 for (unsigned i = 0; i < RegNum / 2; i++)
33701 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33705 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS,
33706 // and then extract a subvector as the result, since v8i8 is not a legal type.
33707 if (OutVT == MVT::v8i8) {
33708 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33709 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33710 DAG.getIntPtrConstant(0, DL));
33712 } else if (RegNum > 1) {
33713 Regs.resize(RegNum);
33714 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33719 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33721 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33723 SmallVector<SDValue, 8> &Regs) {
33724 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33725 EVT OutVT = N->getValueType(0);
33728 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
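// The double shift sign-extends bit 15 of every i32 lane, so each value is
// already in i16 range and PACKSS's signed saturation narrows the lanes
// without changing any value.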
33729 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33730 for (auto &Reg : Regs) {
33731 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33733 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33737 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33738 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33741 if (Regs.size() > 2) {
33742 Regs.resize(Regs.size() / 2);
33743 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33748 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33749 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33750 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33751 /// element extracted from a vector and then truncated, and it is difficult
33752 /// to perform this optimization in that form.
33753 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33754 const X86Subtarget &Subtarget) {
33755 EVT OutVT = N->getValueType(0);
33756 if (!OutVT.isVector())
33759 SDValue In = N->getOperand(0);
33760 if (!In.getValueType().isSimple())
33763 EVT InVT = In.getValueType();
33764 unsigned NumElems = OutVT.getVectorNumElements();
33766 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33767 // SSE2, and we need to take care of it specially.
33768 // AVX512 provides vpmovdb.
33769 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33772 EVT OutSVT = OutVT.getVectorElementType();
33773 EVT InSVT = InVT.getVectorElementType();
33774 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33775 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33779 // SSSE3's pshufb results in fewer instructions in the cases below.
33780 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33781 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33782 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33783 return SDValue();
33785 SDLoc DL(N);
33787 // Split a long vector into vectors of legal type.
33788 unsigned RegNum = InVT.getSizeInBits() / 128;
33789 SmallVector<SDValue, 8> SubVec(RegNum);
33790 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33791 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33793 for (unsigned i = 0; i < RegNum; i++)
33794 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33795 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33797 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33798 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33799 // truncate 2 x v4i32 to v8i16.
33800 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33801 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33802 else if (InSVT == MVT::i32)
33803 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33804 else
33805 return SDValue();
33806 }
33808 /// This function transforms vector truncation of 'all or none' bits values
33809 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
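/// For example (illustrative): each lane of a v8i32 SETCC result is all-ones
/// or all-zeros, so every bit equals the sign bit and PACKSS can narrow it to
/// v8i16 losslessly.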
33810 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33811 SelectionDAG &DAG,
33812 const X86Subtarget &Subtarget) {
33813 // Requires SSE2 but AVX512 has fast truncate.
33814 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33815 return SDValue();
33817 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33818 return SDValue();
33820 SDValue In = N->getOperand(0);
33821 if (!In.getValueType().isSimple())
33822 return SDValue();
33824 MVT VT = N->getValueType(0).getSimpleVT();
33825 MVT SVT = VT.getScalarType();
33827 MVT InVT = In.getValueType().getSimpleVT();
33828 MVT InSVT = InVT.getScalarType();
33830 // Use PACKSS if the input is a splatted sign bit.
33831 // e.g. Comparison result, sext_in_reg, etc.
33832 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33833 if (NumSignBits != InSVT.getSizeInBits())
33834 return SDValue();
33836 // Check we have a truncation suited for PACKSS.
33837 if (!VT.is128BitVector() && !VT.is256BitVector())
33838 return SDValue();
33839 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33840 return SDValue();
33841 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33842 return SDValue();
33844 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33845 }
33847 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33848 const X86Subtarget &Subtarget) {
33849 EVT VT = N->getValueType(0);
33850 SDValue Src = N->getOperand(0);
33851 SDLoc DL(N);
33853 // Attempt to pre-truncate inputs to arithmetic ops instead.
33854 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33855 return V;
33857 // Try to detect AVG pattern first.
33858 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33859 return Avg;
33861 // Try to combine truncation with unsigned saturation.
33862 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33863 return Val;
33865 // The bitcast source is a direct mmx result.
33866 // Detect bitcasts from x86mmx to i32.
33867 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33868 SDValue BCSrc = Src.getOperand(0);
33869 if (BCSrc.getValueType() == MVT::x86mmx)
33870 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33871 }
33873 // Try to truncate extended sign bits with PACKSS.
33874 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33875 return V;
33877 return combineVectorTruncation(N, DAG, Subtarget);
33878 }
33880 /// Returns the negated value if the node \p N flips the sign of an FP value.
33882 /// FP-negation node may have different forms: FNEG(x) or FXOR (x, 0x80000000).
33883 /// AVX512F does not have FXOR, so FNEG is lowered as
33884 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33885 /// In this case we look through all bitcasts.
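/// For example (illustrative): on such targets (fneg (v4f32 X)) may appear as
/// (v4f32 (bitcast (xor (v4i32 (bitcast X)), (splat 0x80000000)))), which is
/// why the sign-mask constant below is matched through bitcasts.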
33886 static SDValue isFNEG(SDNode *N) {
33887 if (N->getOpcode() == ISD::FNEG)
33888 return N->getOperand(0);
33890 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33891 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33892 return SDValue();
33894 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33895 if (!Op1.getValueType().isFloatingPoint())
33896 return SDValue();
33898 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33900 unsigned EltBits = Op1.getScalarValueSizeInBits();
33901 auto isSignMask = [&](const ConstantFP *C) {
33902 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33903 };
33905 // There is more than one way to represent the same constant on
33906 // different X86 targets. The type of the node may also depend on its size.
33907 // - load scalar value and broadcast
33908 // - BUILD_VECTOR node
33909 // - load from a constant pool.
33910 // We check all variants here.
33911 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33912 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33913 if (isSignMask(cast<ConstantFP>(C)))
33914 return Op0;
33916 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33917 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33918 if (isSignMask(CN->getConstantFPValue()))
33919 return Op0;
33921 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33922 if (C->getType()->isVectorTy()) {
33923 if (auto *SplatV = C->getSplatValue())
33924 if (isSignMask(cast<ConstantFP>(SplatV)))
33925 return Op0;
33926 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
33927 if (isSignMask(FPConst))
33928 return Op0;
33929 }
33931 return SDValue();
33932 }
33933 /// Do target-specific dag combines on floating point negations.
33934 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
33935 const X86Subtarget &Subtarget) {
33936 EVT OrigVT = N->getValueType(0);
33937 SDValue Arg = isFNEG(N);
33938 assert(Arg.getNode() && "N is expected to be an FNEG node");
33940 EVT VT = Arg.getValueType();
33941 EVT SVT = VT.getScalarType();
33942 SDLoc DL(N);
33944 // Let legalize expand this if it isn't a legal type yet.
33945 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
33946 return SDValue();
33948 // If we're negating a FMUL node on a target with FMA, then we can avoid the
33949 // use of a constant by performing (-0 - A*B) instead.
33950 // FIXME: Check rounding control flags as well once it becomes available.
33951 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
33952 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
33953 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
33954 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
33955 Arg.getOperand(1), Zero);
33956 return DAG.getBitcast(OrigVT, NewNode);
33957 }
33959 // If we're negating an FMA node, then we can adjust the
33960 // instruction to include the extra negation.
33961 unsigned NewOpcode = 0;
33962 if (Arg.hasOneUse()) {
33963 switch (Arg.getOpcode()) {
33964 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
33965 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
33966 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
33967 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
33968 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
33969 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
33970 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
33971 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
33972 // We can't handle scalar intrinsic node here because it would only
33973 // invert one element and not the whole vector. But we could try to handle
33974 // a negation of the lower element only.
33975 }
33976 }
33977 if (NewOpcode)
33978 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
33979 Arg.getNode()->ops()));
33981 return SDValue();
33982 }
33984 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
33985 const X86Subtarget &Subtarget) {
33986 MVT VT = N->getSimpleValueType(0);
33987 // If we have integer vector types available, use the integer opcodes.
33988 if (VT.isVector() && Subtarget.hasSSE2()) {
33989 SDLoc dl(N);
33991 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
33993 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
33994 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
33995 unsigned IntOpcode;
33996 switch (N->getOpcode()) {
33997 default: llvm_unreachable("Unexpected FP logic op");
33998 case X86ISD::FOR: IntOpcode = ISD::OR; break;
33999 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34000 case X86ISD::FAND: IntOpcode = ISD::AND; break;
34001 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34002 }
34003 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34004 return DAG.getBitcast(VT, IntOp);
34005 }
34007 return SDValue();
34008 }
34009 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34010 TargetLowering::DAGCombinerInfo &DCI,
34011 const X86Subtarget &Subtarget) {
34012 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
34013 return Cmp;
34015 if (DCI.isBeforeLegalizeOps())
34016 return SDValue();
34018 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
34019 return RV;
34021 if (Subtarget.hasCMov())
34022 if (SDValue RV = combineIntegerAbs(N, DAG))
34023 return RV;
34025 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34026 return FPLogic;
34028 if (isFNEG(N))
34029 return combineFneg(N, DAG, Subtarget);
34030 return SDValue();
34031 }
34034 static bool isNullFPScalarOrVectorConst(SDValue V) {
34035 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
34036 }
34038 /// If a value is a scalar FP zero or a vector FP zero (potentially including
34039 /// undefined elements), return a zero constant that may be used to fold away
34040 /// that value. In the case of a vector, the returned constant will not contain
34041 /// undefined elements even if the input parameter does. This makes it suitable
34042 /// to be used as a replacement operand with operations (e.g., bitwise-and) where
34043 /// an undef should not propagate.
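/// For example (illustrative): given V = (v4f32 &lt;0.0, undef, 0.0, undef&gt;),
/// this returns a full &lt;0.0, 0.0, 0.0, 0.0&gt; splat so a following bitwise op
/// cannot propagate the undef lanes.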
34044 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
34045 const X86Subtarget &Subtarget) {
34046 if (!isNullFPScalarOrVectorConst(V))
34047 return SDValue();
34049 if (V.getValueType().isVector())
34050 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
34052 return V;
34053 }
34055 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
34056 const X86Subtarget &Subtarget) {
34057 SDValue N0 = N->getOperand(0);
34058 SDValue N1 = N->getOperand(1);
34059 EVT VT = N->getValueType(0);
34060 SDLoc DL(N);
34062 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
34063 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
34064 (VT == MVT::f64 && Subtarget.hasSSE2())))
34065 return SDValue();
34067 auto isAllOnesConstantFP = [](SDValue V) {
34068 auto *C = dyn_cast<ConstantFPSDNode>(V);
34069 return C && C->getConstantFPValue()->isAllOnesValue();
34070 };
34072 // fand (fxor X, -1), Y --> fandn X, Y
34073 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
34074 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
34076 // fand X, (fxor Y, -1) --> fandn Y, X
34077 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
34078 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
34080 return SDValue();
34081 }
34083 /// Do target-specific dag combines on X86ISD::FAND nodes.
34084 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
34085 const X86Subtarget &Subtarget) {
34086 // FAND(0.0, x) -> 0.0
34087 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
34088 return V;
34090 // FAND(x, 0.0) -> 0.0
34091 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34092 return V;
34094 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
34095 return V;
34097 return lowerX86FPLogicOp(N, DAG, Subtarget);
34098 }
34100 /// Do target-specific dag combines on X86ISD::FANDN nodes.
34101 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
34102 const X86Subtarget &Subtarget) {
34103 // FANDN(0.0, x) -> x
34104 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34105 return N->getOperand(1);
34107 // FANDN(x, 0.0) -> 0.0
34108 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34109 return V;
34111 return lowerX86FPLogicOp(N, DAG, Subtarget);
34112 }
34114 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
34115 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
34116 const X86Subtarget &Subtarget) {
34117 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34119 // F[X]OR(0.0, x) -> x
34120 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34121 return N->getOperand(1);
34123 // F[X]OR(x, 0.0) -> x
34124 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
34125 return N->getOperand(0);
34128 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
34129 return NewVal;
34131 return lowerX86FPLogicOp(N, DAG, Subtarget);
34132 }
34134 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
34135 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
34136 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34138 // Only perform these optimizations if unsafe-math is enabled.
34139 if (!DAG.getTarget().Options.UnsafeFPMath)
34140 return SDValue();
34142 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34143 // into FMINC and FMAXC, which are commutative operations.
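// (Illustrative note: MAXPS/MINPS pick one fixed source operand when an
// input is NaN -- see the table in combineFMinNumFMaxNum below -- so plain
// FMAX/FMIN are order-sensitive; the *C forms assert order does not matter.)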
34144 unsigned NewOp = 0;
34145 switch (N->getOpcode()) {
34146 default: llvm_unreachable("unknown opcode");
34147 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
34148 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
34151 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
34152 N->getOperand(0), N->getOperand(1));
34153 }
34155 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
34156 const X86Subtarget &Subtarget) {
34157 if (Subtarget.useSoftFloat())
34158 return SDValue();
34160 // TODO: Check for global or instruction-level "nnan". In that case, we
34161 // should be able to lower to FMAX/FMIN alone.
34162 // TODO: If an operand is already known to be a NaN or not a NaN, this
34163 // should be an optional swap and FMAX/FMIN.
34165 EVT VT = N->getValueType(0);
34166 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
34167 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
34168 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
34169 return SDValue();
34171 // This takes at least 3 instructions, so favor a library call when operating
34172 // on a scalar and minimizing code size.
34173 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
34174 return SDValue();
34176 SDValue Op0 = N->getOperand(0);
34177 SDValue Op1 = N->getOperand(1);
34178 SDLoc DL(N);
34179 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
34180 DAG.getDataLayout(), *DAG.getContext(), VT);
34182 // There are 4 possibilities involving NaN inputs, and these are the required
34183 // results:
34184 //                   Op1
34185 //                Num     NaN
34186 //             ----------------
34187 //      Num  |  Max  |  Op0 |
34188 // Op0       ----------------
34189 //      NaN  |  Op1  |  NaN |
34190 //             ----------------
34192 // The SSE FP max/min instructions were not designed for this case, but rather
34193 // to implement:
34194 // Min = Op1 < Op0 ? Op1 : Op0
34195 // Max = Op1 > Op0 ? Op1 : Op0
34197 // So they always return Op0 if either input is a NaN. However, we can still
34198 // use those instructions for fmaxnum by selecting away a NaN input.
34200 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
34201 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
34202 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
34203 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
34205 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
34206 // are NaN, the NaN value of Op1 is the result.
34207 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
34208 }
34210 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
34211 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
34212 TargetLowering::DAGCombinerInfo &DCI,
34213 const X86Subtarget &Subtarget) {
34214 // ANDNP(0, x) -> x
34215 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
34216 return N->getOperand(1);
34218 // ANDNP(x, 0) -> 0
34219 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
34220 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
34222 EVT VT = N->getValueType(0);
34224 // Attempt to recursively combine a bitmask ANDNP with shuffles.
34225 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34226 SDValue Op(N, 0);
34227 SmallVector<int, 1> NonceMask; // Just a placeholder.
34228 NonceMask.push_back(0);
34229 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
34230 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
34231 DCI, Subtarget))
34232 return SDValue(); // This routine will use CombineTo to replace N.
34233 }
34235 return SDValue();
34236 }
34238 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
34239 TargetLowering::DAGCombinerInfo &DCI) {
34240 // BT ignores high bits in the bit index operand.
34241 SDValue Op1 = N->getOperand(1);
34242 if (Op1.hasOneUse()) {
34243 unsigned BitWidth = Op1.getValueSizeInBits();
34244 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
34245 KnownBits Known;
34246 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34247 !DCI.isBeforeLegalizeOps());
34248 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34249 if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
34250 TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
34251 DCI.CommitTargetLoweringOpt(TLO);
34252 }
34254 return SDValue();
34255 }
34256 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
34257 const X86Subtarget &Subtarget) {
34258 EVT VT = N->getValueType(0);
34259 if (!VT.isVector())
34260 return SDValue();
34262 SDValue N0 = N->getOperand(0);
34263 SDValue N1 = N->getOperand(1);
34264 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
34265 SDLoc dl(N);
34267 // SIGN_EXTEND_INREG to v4i64 is an expensive operation on
34268 // both SSE and AVX2 since there is no sign-extended shift right
34269 // operation on a vector with 64-bit elements.
34270 // (sext_in_reg (v4i64 anyext (v4i32 x)), ExtraVT) ->
34271 //   (v4i64 sext (v4i32 sext_in_reg (v4i32 x, ExtraVT)))
34272 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
34273 N0.getOpcode() == ISD::SIGN_EXTEND)) {
34274 SDValue N00 = N0.getOperand(0);
34276 // EXTLOAD has a better solution on AVX2,
34277 // it may be replaced with X86ISD::VSEXT node.
34278 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
34279 if (!ISD::isNormalLoad(N00.getNode()))
34280 return SDValue();
34282 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
34283 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
34284 N00, N1);
34285 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
34286 }
34287 }
34289 return SDValue();
34290 }
34291 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
34292 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
34293 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
34294 /// opportunities to combine math ops, use an LEA, or use a complex addressing
34295 /// mode. This can eliminate extend, add, and shift instructions.
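/// For example (illustrative): (i64 sext (i32 add nsw X, 40)) becomes
/// (i64 add (i64 sext X), 40), and an add or shl user can then fold the
/// constant into an LEA displacement.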
34296 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
34297 const X86Subtarget &Subtarget) {
34298 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
34299 Ext->getOpcode() != ISD::ZERO_EXTEND)
34300 return SDValue();
34302 // TODO: This should be valid for other integer types.
34303 EVT VT = Ext->getValueType(0);
34304 if (VT != MVT::i64)
34305 return SDValue();
34307 SDValue Add = Ext->getOperand(0);
34308 if (Add.getOpcode() != ISD::ADD)
34309 return SDValue();
34311 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
34312 bool NSW = Add->getFlags().hasNoSignedWrap();
34313 bool NUW = Add->getFlags().hasNoUnsignedWrap();
34315 // We need an 'add nsw' feeding into the 'sext' or 'add nuw' feeding
34316 // into the 'zext'.
34317 if ((Sext && !NSW) || (!Sext && !NUW))
34318 return SDValue();
34320 // Having a constant operand to the 'add' ensures that we are not increasing
34321 // the instruction count because the constant is extended for free below.
34322 // A constant operand can also become the displacement field of an LEA.
34323 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
34324 if (!AddOp1)
34325 return SDValue();
34327 // Don't make the 'add' bigger if there's no hope of combining it with some
34328 // other 'add' or 'shl' instruction.
34329 // TODO: It may be profitable to generate simpler LEA instructions in place
34330 // of single 'add' instructions, but the cost model for selecting an LEA
34331 // currently has a high threshold.
34332 bool HasLEAPotential = false;
34333 for (auto *User : Ext->uses()) {
34334 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
34335 HasLEAPotential = true;
34336 break;
34337 }
34338 }
34339 if (!HasLEAPotential)
34340 return SDValue();
34342 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
34343 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
34344 SDValue AddOp0 = Add.getOperand(0);
34345 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
34346 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
34348 // The wider add is guaranteed to not wrap because both operands are
34349 // sign-extended or zero-extended.
34350 SDNodeFlags Flags;
34351 Flags.setNoSignedWrap(NSW);
34352 Flags.setNoUnsignedWrap(NUW);
34353 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
34355 }
34356 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
34357 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
34358 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
34359 /// extends from AH (which we otherwise need to do contortions to access).
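/// For example (illustrative): (i32 sext (i8 srem X, Y)) can then be selected
/// as idivb plus a single sign-extending move out of AH.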
34360 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
34361 SDValue N0 = N->getOperand(0);
34362 auto OpcodeN = N->getOpcode();
34363 auto OpcodeN0 = N0.getOpcode();
34364 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
34365 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
34366 return SDValue();
34368 EVT VT = N->getValueType(0);
34369 EVT InVT = N0.getValueType();
34370 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
34371 return SDValue();
34373 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
34374 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
34375 : X86ISD::UDIVREM8_ZEXT_HREG;
34376 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
34377 N0.getOperand(1));
34378 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
34379 return R.getValue(1);
34380 }
34382 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34383 /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
34384 /// with UNDEFs) of the input to vectors of the same size as the target type
34385 /// which then extends the lowest elements.
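/// For example (illustrative): (v4i32 zext (v4i8 X)) widens X to a v16i8
/// whose low four lanes hold X, then emits zero_extend_vector_inreg, which
/// can match a single pmovzxbd.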
34386 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34387 TargetLowering::DAGCombinerInfo &DCI,
34388 const X86Subtarget &Subtarget) {
34389 unsigned Opcode = N->getOpcode();
34390 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34391 return SDValue();
34392 if (!DCI.isBeforeLegalizeOps())
34393 return SDValue();
34394 if (!Subtarget.hasSSE2())
34395 return SDValue();
34397 SDValue N0 = N->getOperand(0);
34398 EVT VT = N->getValueType(0);
34399 EVT SVT = VT.getScalarType();
34400 EVT InVT = N0.getValueType();
34401 EVT InSVT = InVT.getScalarType();
34403 // Input type must be a vector and we must be extending legal integer types.
34404 if (!VT.isVector())
34405 return SDValue();
34406 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34407 return SDValue();
34408 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34409 return SDValue();
34411 // On AVX2+ targets, if the input/output types are both legal then we will be
34412 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34413 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34414 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34415 return SDValue();
34417 SDLoc DL(N);
34419 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34420 EVT InVT = N.getValueType();
34421 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34422 Size / InVT.getScalarSizeInBits());
34423 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34424 DAG.getUNDEF(InVT));
34425 Opnds[0] = N;
34426 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34427 };
34429 // If target-size is less than 128-bits, extend to a type that would extend
34430 // to 128 bits, extend that and extract the original target vector.
34431 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34432 unsigned Scale = 128 / VT.getSizeInBits();
34433 EVT ExVT =
34434 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34435 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34436 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34437 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34438 DAG.getIntPtrConstant(0, DL));
34439 }
34441 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34442 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34443 // Also use this if we don't have SSE41 to allow the legalizer do its job.
34444 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34445 (VT.is256BitVector() && Subtarget.hasInt256()) ||
34446 (VT.is512BitVector() && Subtarget.hasAVX512())) {
34447 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34448 return Opcode == ISD::SIGN_EXTEND
34449 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34450 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34451 }
34453 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34454 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34455 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34456 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34457 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34459 SmallVector<SDValue, 8> Opnds;
34460 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34461 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34462 DAG.getIntPtrConstant(Offset, DL));
34463 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34464 SrcVec = Opcode == ISD::SIGN_EXTEND
34465 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34466 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34467 Opnds.push_back(SrcVec);
34468 }
34469 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34470 };
34472 // On pre-AVX2 targets, split into 128-bit nodes of
34473 // ISD::*_EXTEND_VECTOR_INREG.
34474 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34475 return SplitAndExtendInReg(128);
34477 // On pre-AVX512 targets, split into 256-bit nodes of
34478 // ISD::*_EXTEND_VECTOR_INREG.
34479 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34480 return SplitAndExtendInReg(256);
34482 return SDValue();
34483 }
34485 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34486 TargetLowering::DAGCombinerInfo &DCI,
34487 const X86Subtarget &Subtarget) {
34488 SDValue N0 = N->getOperand(0);
34489 EVT VT = N->getValueType(0);
34490 EVT InVT = N0.getValueType();
34491 SDLoc DL(N);
34493 if (SDValue DivRem8 = getDivRem8(N, DAG))
34494 return DivRem8;
34496 if (!DCI.isBeforeLegalizeOps()) {
34497 if (InVT == MVT::i1) {
34498 SDValue Zero = DAG.getConstant(0, DL, VT);
34499 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34500 return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34501 }
34502 return SDValue();
34503 }
34505 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34506 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34507 // Inverting and sign-extending a boolean is the same as zero-extending and
34508 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34509 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34510 // sext (xor Bool, -1) --> sub (zext Bool), 1
34511 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34512 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34513 }
34515 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34516 return V;
34518 if (Subtarget.hasAVX() && VT.is256BitVector())
34519 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34520 return R;
34522 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34523 return NewAdd;
34525 return SDValue();
34526 }
34528 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34529 const X86Subtarget &Subtarget) {
34530 SDLoc dl(N);
34531 EVT VT = N->getValueType(0);
34533 // Let legalize expand this if it isn't a legal type yet.
34534 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34535 return SDValue();
34537 EVT ScalarVT = VT.getScalarType();
34538 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34539 return SDValue();
34541 SDValue A = N->getOperand(0);
34542 SDValue B = N->getOperand(1);
34543 SDValue C = N->getOperand(2);
34545 auto invertIfNegative = [](SDValue &V) {
34546 if (SDValue NegVal = isFNEG(V.getNode())) {
34547 V = NegVal;
34548 return true;
34549 }
34550 return false;
34551 };
34553 // Do not convert the passthru input of scalar intrinsics.
34554 // FIXME: We could allow negations of the lower element only.
34555 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34556 bool NegB = invertIfNegative(B);
34557 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34559 // The multiplication is negated when exactly one of NegA and NegB is set.
34560 bool NegMul = (NegA != NegB);
34562 unsigned NewOpcode;
34563 if (!NegMul)
34564 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34565 else
34566 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34569 if (N->getOpcode() == X86ISD::FMADD_RND) {
34570 switch (NewOpcode) {
34571 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34572 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34573 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34574 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34575 }
34576 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34577 switch (NewOpcode) {
34578 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34579 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34580 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34581 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34582 }
34583 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34584 switch (NewOpcode) {
34585 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34586 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34587 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34588 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34589 }
34590 } else {
34591 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34592 "Unexpected opcode!");
34593 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34594 }
34596 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34597 }
34599 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34600 TargetLowering::DAGCombinerInfo &DCI,
34601 const X86Subtarget &Subtarget) {
34602 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34603 // (and (i32 x86isd::setcc_carry), 1)
34604 // This eliminates the zext. This transformation is necessary because
34605 // ISD::SETCC is always legalized to i8.
34606 SDLoc dl(N);
34607 SDValue N0 = N->getOperand(0);
34608 EVT VT = N->getValueType(0);
34610 if (N0.getOpcode() == ISD::AND &&
34611 N0.hasOneUse() &&
34612 N0.getOperand(0).hasOneUse()) {
34613 SDValue N00 = N0.getOperand(0);
34614 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34615 if (!isOneConstant(N0.getOperand(1)))
34616 return SDValue();
34617 return DAG.getNode(ISD::AND, dl, VT,
34618 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34619 N00.getOperand(0), N00.getOperand(1)),
34620 DAG.getConstant(1, dl, VT));
34621 }
34622 }
34624 if (N0.getOpcode() == ISD::TRUNCATE &&
34625 N0.hasOneUse() &&
34626 N0.getOperand(0).hasOneUse()) {
34627 SDValue N00 = N0.getOperand(0);
34628 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34629 return DAG.getNode(ISD::AND, dl, VT,
34630 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34631 N00.getOperand(0), N00.getOperand(1)),
34632 DAG.getConstant(1, dl, VT));
34633 }
34634 }
34636 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34637 return V;
34639 if (VT.is256BitVector())
34640 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34641 return R;
34643 if (SDValue DivRem8 = getDivRem8(N, DAG))
34644 return DivRem8;
34646 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34647 return NewAdd;
34649 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34650 return R;
34652 return SDValue();
34653 }
34655 /// Try to map a 128-bit or larger integer comparison to vector instructions
34656 /// before type legalization splits it up into chunks.
34657 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34658 const X86Subtarget &Subtarget) {
34659 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34660 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34662 // We're looking for an oversized integer equality comparison, but ignore a
34663 // comparison with zero because that gets special treatment in EmitTest().
34664 SDValue X = SetCC->getOperand(0);
34665 SDValue Y = SetCC->getOperand(1);
34666 EVT OpVT = X.getValueType();
34667 unsigned OpSize = OpVT.getSizeInBits();
34668 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34669 return SDValue();
34671 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34672 // TODO: Add support for AVX-512.
34673 EVT VT = SetCC->getValueType(0);
34674 SDLoc DL(SetCC);
34675 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34676 (OpSize == 256 && Subtarget.hasAVX2())) {
34677 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34678 SDValue VecX = DAG.getBitcast(VecVT, X);
34679 SDValue VecY = DAG.getBitcast(VecVT, Y);
34681 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34682 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34683 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34684 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34685 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34686 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34687 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34688 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34689 MVT::i32);
34690 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34691 }
34693 return SDValue();
34694 }
34696 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34697 const X86Subtarget &Subtarget) {
34698 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34699 SDValue LHS = N->getOperand(0);
34700 SDValue RHS = N->getOperand(1);
34701 EVT VT = N->getValueType(0);
34702 SDLoc DL(N);
34704 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34705 EVT OpVT = LHS.getValueType();
34706 // 0-x == y --> x+y == 0
34707 // 0-x != y --> x+y != 0
34708 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34709 LHS.hasOneUse()) {
34710 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34711 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34712 }
34713 // x == 0-y --> x+y == 0
34714 // x != 0-y --> x+y != 0
34715 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34716 RHS.hasOneUse()) {
34717 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34718 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34719 }
34720 }
34721 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34722 return V;
34725 if (VT.getScalarType() == MVT::i1 &&
34726 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34727 bool IsSEXT0 =
34728 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34729 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34730 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34732 if (!IsSEXT0 || !IsVZero1) {
34733 // Swap the operands and update the condition code.
34734 std::swap(LHS, RHS);
34735 CC = ISD::getSetCCSwappedOperands(CC);
34737 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34738 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34739 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34740 }
34742 if (IsSEXT0 && IsVZero1) {
34743 assert(VT == LHS.getOperand(0).getValueType() &&
34744 "Uexpected operand type");
34745 if (CC == ISD::SETGT)
34746 return DAG.getConstant(0, DL, VT);
34747 if (CC == ISD::SETLE)
34748 return DAG.getConstant(1, DL, VT);
34749 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34750 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34752 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34753 "Unexpected condition code!");
34754 return LHS.getOperand(0);
34755 }
34756 }
34758 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34759 // to avoid scalarization via legalization because v4i32 is not a legal type.
34760 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34761 LHS.getValueType() == MVT::v4f32)
34762 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34764 return SDValue();
34765 }
34767 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34769 // Gather and Scatter instructions use k-registers for masks. The type of
34770 // the masks is v*i1. So the mask will be truncated anyway.
34771 // The SIGN_EXTEND_INREG may therefore be dropped.
34772 SDValue Mask = N->getOperand(2);
34773 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34774 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34775 NewOps[2] = Mask.getOperand(0);
34776 DAG.UpdateNodeOperands(N, NewOps);
34777 }
34778 return SDValue();
34779 }
34781 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34782 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34783 const X86Subtarget &Subtarget) {
34784 SDLoc DL(N);
34785 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34786 SDValue EFLAGS = N->getOperand(1);
34788 // Try to simplify the EFLAGS and condition code operands.
34789 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34790 return getSETCC(CC, Flags, DL, DAG);
34792 return SDValue();
34793 }
34795 /// Optimize branch condition evaluation.
34796 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34797 const X86Subtarget &Subtarget) {
34798 SDLoc DL(N);
34799 SDValue EFLAGS = N->getOperand(3);
34800 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34802 // Try to simplify the EFLAGS and condition code operands.
34803 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34804 // RAUW them under us.
34805 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34806 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34807 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34808 N->getOperand(1), Cond, Flags);
34809 }
34811 return SDValue();
34812 }
34814 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34815 SelectionDAG &DAG) {
34816 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34817 // optimize away operation when it's from a constant.
34819 // The general transformation is:
34820 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34821 // AND(VECTOR_CMP(x,y), constant2)
34822 // constant2 = UNARYOP(constant)
34824 // Early exit if this isn't a vector operation, the operand of the
34825 // unary operation isn't a bitwise AND, or if the sizes of the operations
34826 // aren't the same.
34827 EVT VT = N->getValueType(0);
34828 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34829 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34830 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34831 return SDValue();
34833 // Now check that the other operand of the AND is a constant. We could
34834 // make the transformation for non-constant splats as well, but it's unclear
34835 // that would be a benefit as it would not eliminate any operations, just
34836 // perform one more step in scalar code before moving to the vector unit.
34837 if (BuildVectorSDNode *BV =
34838 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34839 // Bail out if the vector isn't a constant.
34840 if (!BV->isConstant())
34841 return SDValue();
34843 // Everything checks out. Build up the new and improved node.
34844 SDLoc DL(N);
34845 EVT IntVT = BV->getValueType(0);
34846 // Create a new constant of the appropriate type for the transformed
34847 // DAG.
34848 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34849 // The AND node needs bitcasts to/from an integer vector type around it.
34850 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34851 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34852 N->getOperand(0)->getOperand(0), MaskConst);
34853 SDValue Res = DAG.getBitcast(VT, NewAnd);
34854 return Res;
34855 }
34857 return SDValue();
34858 }
34860 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34861 const X86Subtarget &Subtarget) {
34862 SDValue Op0 = N->getOperand(0);
34863 EVT VT = N->getValueType(0);
34864 EVT InVT = Op0.getValueType();
34865 EVT InSVT = InVT.getScalarType();
34866 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34868 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34869 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34870 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34871 SDLoc dl(N);
34872 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34873 InVT.getVectorNumElements());
34874 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34876 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34877 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34879 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34880 }
34882 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
34883 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34884 // the optimization here.
34885 if (DAG.SignBitIsZero(Op0))
34886 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34888 return SDValue();
34889 }
34891 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34892 const X86Subtarget &Subtarget) {
34893 // First try to optimize away the conversion entirely when it's
34894 // conditionally from a constant. Vectors only.
34895 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34898 // Now move on to more general possibilities.
34899 SDValue Op0 = N->getOperand(0);
34900 EVT VT = N->getValueType(0);
34901 EVT InVT = Op0.getValueType();
34902 EVT InSVT = InVT.getScalarType();
34904 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34905 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34906 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34907 if (InVT.isVector() &&
34908 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34909 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34910 SDLoc dl(N);
34911 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34912 InVT.getVectorNumElements());
34913 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34914 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34915 }
34917 // Without AVX512DQ we only support i64 to float scalar conversion. For both
34918 // vectors and scalars, see if we know that the upper bits are all the sign
34919 // bit, in which case we can truncate the input to i32 and convert from that.
34920 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
34921 unsigned BitWidth = InVT.getScalarSizeInBits();
34922 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
34923 if (NumSignBits >= (BitWidth - 31)) {
34924 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
34925 if (InVT.isVector())
34926 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
34927 InVT.getVectorNumElements());
34928 SDLoc dl(N);
34929 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
34930 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
34931 }
34932 }
34934 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
34935 // a 32-bit target where SSE doesn't support i64->FP operations.
34936 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
34937 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
34938 EVT LdVT = Ld->getValueType(0);
34940 // This transformation is not supported if the result type is f16 or f128.
34941 if (VT == MVT::f16 || VT == MVT::f128)
34942 return SDValue();
34944 if (!Ld->isVolatile() && !VT.isVector() &&
34945 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
34946 !Subtarget.is64Bit() && LdVT == MVT::i64) {
34947 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
34948 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
34949 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
34950 return FILDChain;
34951 }
34952 }
34954 return SDValue();
34955 }
34956 // Optimize RES, EFLAGS = X86ISD::ADD LHS, RHS
34957 static SDValue combineX86ADD(SDNode *N, SelectionDAG &DAG,
34958 X86TargetLowering::DAGCombinerInfo &DCI) {
34959 // When legalizing carry, we create carries via add X, -1
34960 // If that comes from an actual carry, via setcc, we use the
34961 // carry directly.
34962 if (isAllOnesConstant(N->getOperand(1)) && N->hasAnyUseOfValue(1)) {
34963 SDValue Carry = N->getOperand(0);
34964 while (Carry.getOpcode() == ISD::TRUNCATE ||
34965 Carry.getOpcode() == ISD::ZERO_EXTEND ||
34966 Carry.getOpcode() == ISD::SIGN_EXTEND ||
34967 Carry.getOpcode() == ISD::ANY_EXTEND ||
34968 (Carry.getOpcode() == ISD::AND &&
34969 isOneConstant(Carry.getOperand(1))))
34970 Carry = Carry.getOperand(0);
34972 if (Carry.getOpcode() == X86ISD::SETCC ||
34973 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
34974 if (Carry.getConstantOperandVal(0) == X86::COND_B)
34975 return DCI.CombineTo(N, SDValue(N, 0), Carry.getOperand(1));
34976 }
34977 }
34979 return SDValue();
34980 }
34982 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
34983 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
34984 X86TargetLowering::DAGCombinerInfo &DCI) {
34985 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
34986 // the result is either zero or one (depending on the input carry bit).
34987 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
34988 if (X86::isZeroNode(N->getOperand(0)) &&
34989 X86::isZeroNode(N->getOperand(1)) &&
34990 // We don't have a good way to replace an EFLAGS use, so only do this when
34991 // it won't be used.
34992 SDValue(N, 1).use_empty()) {
34993 SDLoc DL(N);
34994 EVT VT = N->getValueType(0);
34995 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
34996 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
34997 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
34998 DAG.getConstant(X86::COND_B, DL,
34999 MVT::i8),
35000 N->getOperand(2)),
35001 DAG.getConstant(1, DL, VT));
35002 return DCI.CombineTo(N, Res1, CarryOut);
35003 }
35005 return SDValue();
35006 }
35008 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35009 /// which is more useful than 0/1 in some cases.
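/// For example (illustrative):
///   sbb %eax, %eax   ; EAX = CF ? -1 : 0
///   and $1, %eax     ; EAX = CF ? 1 : 0 (only when an i8 0/1 is required)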
35010 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35011 SDLoc DL(N);
35012 // "Condition code B" is also known as "the carry flag" (CF).
35013 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
35014 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35015 MVT VT = N->getSimpleValueType(0);
35016 if (VT == MVT::i8)
35017 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35019 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35020 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35021 }
35023 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
35024 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35025 /// with CMP+{ADC, SBB}.
35026 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35027 bool IsSub = N->getOpcode() == ISD::SUB;
35028 SDValue X = N->getOperand(0);
35029 SDValue Y = N->getOperand(1);
35031 // If this is an add, canonicalize a zext operand to the RHS.
35032 // TODO: Incomplete? What if both sides are zexts?
35033 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35034 Y.getOpcode() != ISD::ZERO_EXTEND)
35035 std::swap(X, Y);
35037 // Look through a one-use zext.
35038 bool PeekedThroughZext = false;
35039 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35040 Y = Y.getOperand(0);
35041 PeekedThroughZext = true;
35044 // If this is an add, canonicalize a setcc operand to the RHS.
35045 // TODO: Incomplete? What if both sides are setcc?
35046 // TODO: Should we allow peeking through a zext of the other operand?
35047 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35048 Y.getOpcode() != X86ISD::SETCC)
35049 std::swap(X, Y);
35051 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35052 return SDValue();
35054 SDLoc DL(N);
35055 EVT VT = N->getValueType(0);
35056 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35058 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35059 // the general case below.
35060 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35061 if (ConstantX) {
35062 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35063 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35064 // This is a complicated way to get -1 or 0 from the carry flag:
35065 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35066 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35067 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35068 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35069 Y.getOperand(1));
35070 }
35072 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35073 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35074 SDValue EFLAGS = Y->getOperand(1);
35075 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35076 EFLAGS.getValueType().isInteger() &&
35077 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35078 // Swap the operands of a SUB, and we have the same pattern as above.
35079 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35080 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
35081 SDValue NewSub = DAG.getNode(
35082 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35083 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35084 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35085 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35086 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35087 NewEFLAGS);
35088 }
35089 }
35090 }
35092 if (CC == X86::COND_B) {
35093 // X + SETB Z --> X + (mask SBB Z, Z)
35094 // X - SETB Z --> X - (mask SBB Z, Z)
35095 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35096 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
35097 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35098 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35099 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35100 }
35102 if (CC == X86::COND_A) {
35103 SDValue EFLAGS = Y->getOperand(1);
35104 // Try to convert COND_A into COND_B in an attempt to facilitate
35105 // materializing "setb reg".
35107 // Do not flip "e > c", where "c" is a constant, because Cmp instruction
35108 // cannot take an immediate as its first operand.
35110 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35111 EFLAGS.getValueType().isInteger() &&
35112 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35113 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35114 EFLAGS.getNode()->getVTList(),
35115 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35116 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35117 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35118 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35119 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35120 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35121 }
35122 }
35124 if (CC != X86::COND_E && CC != X86::COND_NE)
35125 return SDValue();
35127 SDValue Cmp = Y.getOperand(1);
35128 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35129 !X86::isZeroNode(Cmp.getOperand(1)) ||
35130 !Cmp.getOperand(0).getValueType().isInteger())
35131 return SDValue();
35133 SDValue Z = Cmp.getOperand(0);
35134 EVT ZVT = Z.getValueType();
35136 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35137 // the general case below.
35138 if (ConstantX) {
35139 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35140 // fake operands:
35141 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35142 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35143 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35144 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35145 SDValue Zero = DAG.getConstant(0, DL, ZVT);
35146 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
35147 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35148 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35149 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35150 SDValue(Neg.getNode(), 1));
35151 }
35153 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35154 // with fake operands:
35155 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35156 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35157 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35158 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35159 SDValue One = DAG.getConstant(1, DL, ZVT);
35160 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35161 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35162 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35163 }
35164 }
35166 // (cmp Z, 1) sets the carry flag if Z is 0.
35167 SDValue One = DAG.getConstant(1, DL, ZVT);
35168 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35170 // Add the flags type for ADC/SBB nodes.
35171 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35173 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35174 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35175 if (CC == X86::COND_NE)
35176 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35177 DAG.getConstant(-1ULL, DL, VT), Cmp1);
35179 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
35180 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
35181 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35182 DAG.getConstant(0, DL, VT), Cmp1);
35183 }
35185 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35186 const X86Subtarget &Subtarget) {
35187 SDValue MulOp = N->getOperand(0);
35188 SDValue Phi = N->getOperand(1);
35190 if (MulOp.getOpcode() != ISD::MUL)
35191 std::swap(MulOp, Phi);
35192 if (MulOp.getOpcode() != ISD::MUL)
35193 return SDValue();
35195 ShrinkMode Mode;
35196 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35197 return SDValue();
35199 EVT VT = N->getValueType(0);
35201 unsigned RegSize = 128;
35202 if (Subtarget.hasBWI())
35203 RegSize = 512;
35204 else if (Subtarget.hasAVX2())
35205 RegSize = 256;
35206 unsigned VectorSize = VT.getVectorNumElements() * 16;
35207 // If the vector size is less than 128, or greater than the supported RegSize,
35208 // do not use PMADD.
35209 if (VectorSize < 128 || VectorSize > RegSize)
35210 return SDValue();
35212 SDLoc DL(N);
35213 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35214 VT.getVectorNumElements());
35215 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35216 VT.getVectorNumElements() / 2);
35218 // Shrink the operands of mul.
35219 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35220 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35222 // Madd vector size is half of the original vector size
35223 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35224 // Fill the rest of the output with 0
35225 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35226 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35227 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35228 }
35230 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35231 const X86Subtarget &Subtarget) {
35232 SDLoc DL(N);
35233 EVT VT = N->getValueType(0);
35234 SDValue Op0 = N->getOperand(0);
35235 SDValue Op1 = N->getOperand(1);
35237 // TODO: There's nothing special about i32, any integer type above i16 should
35238 // work just as well.
35239 if (!VT.isVector() || !VT.isSimple() ||
35240 !(VT.getVectorElementType() == MVT::i32))
35241 return SDValue();
35243 unsigned RegSize = 128;
35244 if (Subtarget.hasBWI())
35245 RegSize = 512;
35246 else if (Subtarget.hasAVX2())
35247 RegSize = 256;
35249 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35250 // TODO: We should be able to handle larger vectors by splitting them before
35251 // feeding them into several SADs, and then reducing over those.
35252 if (VT.getSizeInBits() / 4 > RegSize)
35253 return SDValue();
35255 // We know N is a reduction add, which means one of its operands is a phi.
35256 // To match SAD, we need the other operand to be a vector select.
35257 SDValue SelectOp, Phi;
35258 if (Op0.getOpcode() == ISD::VSELECT) {
35259 SelectOp = Op0;
35260 Phi = Op1;
35261 } else if (Op1.getOpcode() == ISD::VSELECT) {
35262 SelectOp = Op1;
35263 Phi = Op0;
35264 } else
35265 return SDValue();
35267 // Check whether we have an abs-diff pattern feeding into the select.
35268 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
35269 return SDValue();
35271 // SAD pattern detected. Now build a SAD instruction and an addition for
35272 // reduction. Note that the number of elements of the result of SAD is less
35273 // than the number of elements of its input. Therefore, we update only
35274 // part of the elements in the reduction vector.
35275 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35277 // The output of PSADBW is a vector of i64.
35278 // We need to turn the vector of i64 into a vector of i32.
35279 // If the reduction vector is at least as wide as the psadbw result, just
35280 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35281 // anyway.
35282 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35283 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35284 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35285 else
35286 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35288 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35289 // Update part of elements of the reduction vector. This is done by first
35290 // extracting a sub-vector from it, updating this sub-vector, and inserting
35291 // it back.
35292 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
35293 DAG.getIntPtrConstant(0, DL));
35294 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
35295 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
35296 DAG.getIntPtrConstant(0, DL));
35297 }
35298 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
35299 }
35301 /// Convert vector increment or decrement to sub/add with an all-ones constant:
35302 /// add X, <1, 1...> --> sub X, <-1, -1...>
35303 /// sub X, <1, 1...> --> add X, <-1, -1...>
35304 /// The all-ones vector constant can be materialized using a pcmpeq instruction
35305 /// that is commonly recognized as an idiom (has no register dependency), so
35306 /// that's better/smaller than loading a splat 1 constant.
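/// For example (illustrative):
///   add X, &lt;1, 1, 1, 1&gt;  -->  sub X, (pcmpeq V, V)
/// where (pcmpeq V, V) materializes the &lt;-1, -1, -1, -1&gt; splat.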
35307 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35308 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35309 "Unexpected opcode for increment/decrement transform");
35311 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35312 // out and wait for legalization if we have an unsupported vector length.
35313 EVT VT = N->getValueType(0);
35314 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35315 return SDValue();
35317 SDNode *N1 = N->getOperand(1).getNode();
35319 if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue())
35322 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35323 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35324 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
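// Illustrative sketch (added commentary): with SSE2, incrementing a v4i32
// value in %xmm0 can then be emitted as
//   pcmpeqd %xmm1, %xmm1   ; all-ones, no constant-pool load, no input dep
//   psubd   %xmm1, %xmm0   ; X - (-1) == X + 1
// instead of loading a <1,1,1,1> splat from memory and adding it.
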
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  const SDNodeFlags Flags = N->getFlags();
  if (Flags.hasVectorReduction()) {
    if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
      return Sad;
    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
      return MAdd;
  }
  EVT VT = N->getValueType(0);
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // Try to synthesize horizontal adds from adds of shuffles.
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, true))
    return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
                          const X86Subtarget &Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  // X86 can't encode an immediate LHS of a sub. See if we can push the
  // negation into a preceding instruction.
  if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
    // If the RHS of the sub is a XOR with one use and a constant, invert the
    // immediate. Then add one to the LHS of the sub so we can turn
    // X-Y -> X+~Y+1, saving one register.
    if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
        isa<ConstantSDNode>(Op1.getOperand(1))) {
      APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
      EVT VT = Op0.getValueType();
      SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
                                   Op1.getOperand(0),
                                   DAG.getConstant(~XorC, SDLoc(Op1), VT));
      return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
                         DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
    }
  }

  // Try to synthesize horizontal subs from subs of shuffles.
  EVT VT = N->getValueType(0);
  if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
       (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
      isHorizontalBinOp(Op0, Op1, false))
    return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);

  if (SDValue V = combineIncDecVector(N, DAG))
    return V;

  return combineAddOrSubToADCOrSBB(N, DAG);
}

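// Worked example for the X-Y -> X+~Y+1 fold above (illustrative, added
// commentary): for (sub 5, (xor X, 3)), the identity
//   -(X ^ 3) == (X ^ ~3) + 1
// gives (add (xor X, ~3), 6), so the immediate 5 no longer needs to be
// materialized in a register as the LHS of a sub.
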
static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                             TargetLowering::DAGCombinerInfo &DCI,
                             const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalize())
    return SDValue();

  SDLoc DL(N);
  unsigned Opcode = N->getOpcode();
  MVT VT = N->getSimpleValueType(0);
  MVT SVT = VT.getVectorElementType();
  unsigned NumElts = VT.getVectorNumElements();
  unsigned EltSizeInBits = SVT.getSizeInBits();

  SDValue Op = N->getOperand(0);
  MVT OpVT = Op.getSimpleValueType();
  MVT OpEltVT = OpVT.getVectorElementType();
  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
  unsigned InputBits = OpEltSizeInBits * NumElts;

  // Perform any constant folding.
  // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
  APInt UndefElts;
  SmallVector<APInt, 64> EltBits;
  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
    APInt Undefs(NumElts, 0);
    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
    bool IsZEXT =
        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
    for (unsigned i = 0; i != NumElts; ++i) {
      if (UndefElts[i]) {
        Undefs.setBit(i);
        continue;
      }
      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
                       : EltBits[i].sextOrTrunc(EltSizeInBits);
    }
    return getConstVector(Vals, Undefs, VT, DAG, DL);
  }

  // (vzext (bitcast (vzext (x)) -> (vzext x)
  // TODO: (vsext (bitcast (vsext (x)) -> (vsext x)
  SDValue V = peekThroughBitcasts(Op);
  if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
    MVT InnerVT = V.getSimpleValueType();
    MVT InnerEltVT = InnerVT.getVectorElementType();

    // If the element sizes match exactly, we can just do one larger vzext.
    // This is always an exact type match as vzext operates on integer types.
    if (OpEltVT == InnerEltVT) {
      assert(OpVT == InnerVT && "Types must match for vzext!");
      return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
    }

    // The only other way we can combine them is if only a single element of
    // the inner vzext is used in the input to the outer vzext.
    if (InnerEltVT.getSizeInBits() < InputBits)
      return SDValue();

    // In this case, the inner vzext is completely dead because we're going to
    // only look at bits inside of the low element. Just do the outer vzext on
    // a bitcast of the input to the inner.
    return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
  }

  // Check if we can bypass extracting and re-inserting an element of an input
  // vector. Essentially:
  // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x)
  // TODO: Add X86ISD::VSEXT support
  if (Opcode == X86ISD::VZEXT &&
      V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
      V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
      V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
    SDValue ExtractedV = V.getOperand(0);
    SDValue OrigV = ExtractedV.getOperand(0);
    if (isNullConstant(ExtractedV.getOperand(1))) {
      MVT OrigVT = OrigV.getSimpleValueType();
      // Extract a subvector if necessary...
      if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
        int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
        OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
                                  OrigVT.getVectorNumElements() / Ratio);
        OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
                            DAG.getIntPtrConstant(0, DL));
      }
      Op = DAG.getBitcast(OpVT, OrigV);
      return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
    }
  }

  return SDValue();
}

/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
                              const X86Subtarget &Subtarget) {
  SDValue Chain = N->getOperand(0);
  SDValue LHS = N->getOperand(1);
  SDValue RHS = N->getOperand(2);
  MVT VT = RHS.getSimpleValueType();
  SDLoc DL(N);

  auto *C = dyn_cast<ConstantSDNode>(RHS);
  if (!C || C->getZExtValue() != 1)
    return SDValue();

  RHS = DAG.getConstant(-1, DL, VT);
  MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
  return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
                                 DAG.getVTList(MVT::i32, MVT::Other),
                                 {Chain, LHS, RHS}, VT, MMO);
}

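// Illustrative example (added commentary): a locked decrement such as
// "lock subl $1, (%rdi)" is re-emitted as "lock addl $-1, (%rdi)";
// canonicalizing on the LADD form means later code only has to
// pattern-match a single locked read-modify-write add shape.
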
// TEST (AND a, b), (AND a, b) -> TEST a, b
static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);

  if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDLoc DL(N);

  return DAG.getNode(X86ISD::TESTM, DL, VT,
                     Op0->getOperand(0), Op0->getOperand(1));
}

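// Illustrative restatement (added commentary): TESTM already performs a
// lane-wise AND of its operands when producing the mask, so feeding it two
// copies of (and a, b) is redundant; (testm a, b) sets exactly the same
// mask bits, since ((a & b) & (a & b)) != 0 iff (a & b) != 0.
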
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
                                    const X86Subtarget &Subtarget) {
  MVT VT = N->getSimpleValueType(0);
  SDLoc DL(N);

  if (N->getOperand(0) == N->getOperand(1)) {
    if (N->getOpcode() == X86ISD::PCMPEQ)
      return getOnesVector(VT, DAG, DL);
    if (N->getOpcode() == X86ISD::PCMPGT)
      return getZeroVector(VT, Subtarget, DAG, DL);
  }

  return SDValue();
}

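// Illustrative note (added commentary): comparing a value with itself has a
// known lane-wise result, e.g. (v4i32 pcmpeq X, X) is the standard all-ones
// idiom, while (v4i32 pcmpgt X, X) can never be true and folds to zero.
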
static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const X86Subtarget &Subtarget) {
  if (DCI.isBeforeLegalizeOps())
    return SDValue();

  SDLoc dl(N);
  SDValue Vec = N->getOperand(0);
  SDValue SubVec = N->getOperand(1);
  SDValue Idx = N->getOperand(2);

  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
  MVT OpVT = N->getSimpleValueType(0);
  MVT SubVecVT = SubVec.getSimpleValueType();

  // If this is an insert of an extract, combine to a shuffle. Don't do this
  // if the insert or extract can be represented with a subvector operation.
  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
      (IdxVal != 0 || !Vec.isUndef())) {
    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
    if (ExtIdxVal != 0) {
      int VecNumElts = OpVT.getVectorNumElements();
      int SubVecNumElts = SubVecVT.getVectorNumElements();
      SmallVector<int, 64> Mask(VecNumElts);
      // First create an identity shuffle mask.
      for (int i = 0; i != VecNumElts; ++i)
        Mask[i] = i;
      // Now insert the extracted portion.
      for (int i = 0; i != SubVecNumElts; ++i)
        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;

      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
    }
  }

  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
  // load:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr + 16), Elts/2)
  // --> load32 addr
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr + 32), Elts/2)
  // --> load64 addr
  // or a 16-byte or 32-byte broadcast:
  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
  //                   (load16 addr), Elts/2)
  // --> X86SubVBroadcast(load16 addr)
  // or:
  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
  //                   (load32 addr), Elts/2)
  // --> X86SubVBroadcast(load32 addr)
  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
    if (Idx2 && Idx2->getZExtValue() == 0) {
      SDValue SubVec2 = Vec.getOperand(1);
      // If needed, look through bitcasts to get to the load.
      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
        bool Fast;
        unsigned Alignment = FirstLd->getAlignment();
        unsigned AS = FirstLd->getAddressSpace();
        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
                                    OpVT, AS, Alignment, &Fast) && Fast) {
          SDValue Ops[] = {SubVec2, SubVec};
          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
            return Ld;
        }
      }
      // If lower/upper loads are the same and the only users of the load, then
      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
        }
      }

      // If this is subv_broadcast insert into both halves, use a larger
      // subv_broadcast.
      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
                           SubVec.getOperand(0));
      }
    }
  }

  return SDValue();
}

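// Illustrative example (added commentary): building a v8f32 by inserting a
// 16-byte (load addr) at index 0 and (load addr+16) at index 4 becomes one
// 32-byte load when the target reports fast unaligned 256-bit access; if
// both halves are the same 16-byte load, it becomes a vbroadcastf128-style
// X86ISD::SUBV_BROADCAST instead.
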
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  switch (N->getOpcode()) {
  default: break;
  case ISD::EXTRACT_VECTOR_ELT:
    return combineExtractVectorElt(N, DAG, DCI, Subtarget);
  case X86ISD::PEXTRW:
  case X86ISD::PEXTRB:
    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
  case ISD::INSERT_SUBVECTOR:
    return combineInsertSubvector(N, DAG, DCI, Subtarget);
  case ISD::VSELECT:
  case ISD::SELECT:
  case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
  case ISD::BITCAST:        return combineBitcast(N, DAG, DCI, Subtarget);
  case X86ISD::CMOV:        return combineCMov(N, DAG, DCI, Subtarget);
  case ISD::ADD:            return combineAdd(N, DAG, Subtarget);
  case ISD::SUB:            return combineSub(N, DAG, Subtarget);
  case X86ISD::ADD:         return combineX86ADD(N, DAG, DCI);
  case X86ISD::ADC:         return combineADC(N, DAG, DCI);
  case ISD::MUL:            return combineMul(N, DAG, DCI, Subtarget);
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL:            return combineShift(N, DAG, DCI, Subtarget);
  case ISD::AND:            return combineAnd(N, DAG, DCI, Subtarget);
  case ISD::OR:             return combineOr(N, DAG, DCI, Subtarget);
  case ISD::XOR:            return combineXor(N, DAG, DCI, Subtarget);
  case ISD::LOAD:           return combineLoad(N, DAG, DCI, Subtarget);
  case ISD::MLOAD:          return combineMaskedLoad(N, DAG, DCI, Subtarget);
  case ISD::STORE:          return combineStore(N, DAG, Subtarget);
  case ISD::MSTORE:         return combineMaskedStore(N, DAG, Subtarget);
  case ISD::SINT_TO_FP:     return combineSIntToFP(N, DAG, Subtarget);
  case ISD::UINT_TO_FP:     return combineUIntToFP(N, DAG, Subtarget);
  case ISD::FADD:
  case ISD::FSUB:           return combineFaddFsub(N, DAG, Subtarget);
  case ISD::FNEG:           return combineFneg(N, DAG, Subtarget);
  case ISD::TRUNCATE:       return combineTruncate(N, DAG, Subtarget);
  case X86ISD::ANDNP:       return combineAndnp(N, DAG, DCI, Subtarget);
  case X86ISD::FAND:        return combineFAnd(N, DAG, Subtarget);
  case X86ISD::FANDN:       return combineFAndn(N, DAG, Subtarget);
  case X86ISD::FXOR:
  case X86ISD::FOR:         return combineFOr(N, DAG, Subtarget);
  case X86ISD::FMIN:
  case X86ISD::FMAX:        return combineFMinFMax(N, DAG);
  case ISD::FMINNUM:
  case ISD::FMAXNUM:        return combineFMinNumFMaxNum(N, DAG, Subtarget);
  case X86ISD::BT:          return combineBT(N, DAG, DCI);
  case ISD::ANY_EXTEND:
  case ISD::ZERO_EXTEND:    return combineZext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
  case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
  case X86ISD::VSHLI:
  case X86ISD::VSRAI:
  case X86ISD::VSRLI:
    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
  case ISD::SIGN_EXTEND_VECTOR_INREG:
  case ISD::ZERO_EXTEND_VECTOR_INREG:
  case X86ISD::VSEXT:
  case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
  case X86ISD::PINSRB:
  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
  case X86ISD::SHUFP:       // Handle all target specific shuffles
  case X86ISD::INSERTPS:
  case X86ISD::EXTRQI:
  case X86ISD::INSERTQI:
  case X86ISD::PALIGNR:
  case X86ISD::VSHLDQ:
  case X86ISD::VSRLDQ:
  case X86ISD::BLENDI:
  case X86ISD::UNPCKH:
  case X86ISD::UNPCKL:
  case X86ISD::MOVHLPS:
  case X86ISD::MOVLHPS:
  case X86ISD::PSHUFB:
  case X86ISD::PSHUFD:
  case X86ISD::PSHUFHW:
  case X86ISD::PSHUFLW:
  case X86ISD::MOVSHDUP:
  case X86ISD::MOVSLDUP:
  case X86ISD::MOVDDUP:
  case X86ISD::MOVSS:
  case X86ISD::MOVSD:
  case X86ISD::VPPERM:
  case X86ISD::VPERMI:
  case X86ISD::VPERMV:
  case X86ISD::VPERMV3:
  case X86ISD::VPERMIV3:
  case X86ISD::VPERMIL2:
  case X86ISD::VPERMILPI:
  case X86ISD::VPERMILPV:
  case X86ISD::VPERM2X128:
  case X86ISD::VZEXT_MOVL:
  case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
  case X86ISD::FMADD:
  case X86ISD::FMADD_RND:
  case X86ISD::FMADDS1_RND:
  case X86ISD::FMADDS3_RND:
  case ISD::FMA:            return combineFMA(N, DAG, Subtarget);
  case ISD::MGATHER:
  case ISD::MSCATTER:       return combineGatherScatter(N, DAG);
  case X86ISD::LSUB:        return combineLockSub(N, DAG, Subtarget);
  case X86ISD::TESTM:       return combineTestM(N, DAG);
  case X86ISD::PCMPEQ:
  case X86ISD::PCMPGT:      return combineVectorCompare(N, DAG, Subtarget);
  }

  return SDValue();
}

/// Return true if the target has native support for the specified value type
/// and it is 'desirable' to use the type for the given node type. e.g. On x86
/// i16 is legal, but undesirable since i16 instruction encodings are longer
/// and some i16 instructions are slow.
bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
  if (!isTypeLegal(VT))
    return false;
  if (VT != MVT::i16)
    return true;

  switch (Opc) {
  default:
    return true;
  case ISD::LOAD:
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
  case ISD::SHL:
  case ISD::SRL:
  case ISD::SUB:
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    return false;
  }
}

/// This function checks if any of the users of EFLAGS copies the EFLAGS. We
/// know that the code that lowers COPY of EFLAGS has to use the stack, and if
/// we don't adjust the stack we clobber the first frame index.
/// See X86InstrInfo::copyPhysReg.
static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
  const MachineRegisterInfo &MRI = MF.getRegInfo();
  return any_of(MRI.reg_instructions(X86::EFLAGS),
                [](const MachineInstr &RI) { return RI.isCopy(); });
}

void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
  if (hasCopyImplyingStackAdjustment(MF)) {
    MachineFrameInfo &MFI = MF.getFrameInfo();
    MFI.setHasCopyImplyingStackAdjustment(true);
  }

  TargetLoweringBase::finalizeLowering(MF);
}

/// This method queries the target whether it is beneficial for dag combiner
/// to promote the specified node. If true, it should return the desired
/// promotion type by reference.
bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
  EVT VT = Op.getValueType();
  if (VT != MVT::i16)
    return false;

  bool Promote = false;
  bool Commute = false;
  switch (Op.getOpcode()) {
  default: break;
  case ISD::SIGN_EXTEND:
  case ISD::ZERO_EXTEND:
  case ISD::ANY_EXTEND:
    Promote = true;
    break;
  case ISD::SHL:
  case ISD::SRL: {
    SDValue N0 = Op.getOperand(0);
    // Look out for (store (shl (load), x)).
    if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
      return false;
    Promote = true;
    break;
  }
  case ISD::ADD:
  case ISD::MUL:
  case ISD::AND:
  case ISD::OR:
  case ISD::XOR:
    Commute = true;
    LLVM_FALLTHROUGH;
  case ISD::SUB: {
    SDValue N0 = Op.getOperand(0);
    SDValue N1 = Op.getOperand(1);
    if (!Commute && MayFoldLoad(N1))
      return false;
    // Avoid disabling potential load folding opportunities.
    if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
      return false;
    if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
      return false;
    Promote = true;
  }
  }

  PVT = MVT::i32;
  return Promote;
}

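// Illustrative note (added commentary): promoting e.g. (i16 add X, Y) to i32
// trades the 0x66 operand-size prefix of 16-bit instructions for a plain
// 32-bit ALU op, so "addw %cx, %ax" becomes the cheaper-to-decode
// "addl %ecx, %eax"; the MayFoldLoad/MayFoldIntoStore checks above keep the
// promotion from breaking profitable load/store folding.
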
//===----------------------------------------------------------------------===//
//                           X86 Inline Assembly Support
//===----------------------------------------------------------------------===//

// Helper to match a string separated by whitespace.
static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
  S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.

  for (StringRef Piece : Pieces) {
    if (!S.startswith(Piece)) // Check if the piece matches.
      return false;

    S = S.substr(Piece.size());
    StringRef::size_type Pos = S.find_first_not_of(" \t");
    if (Pos == 0) // We matched a prefix.
      return false;

    S = S.substr(Pos);
  }

  return S.empty();
}

static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
  if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
    if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
        std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
      if (AsmPieces.size() == 3)
        return true;
      else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
        return true;
    }
  }
  return false;
}

bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
  InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());

  const std::string &AsmStr = IA->getAsmString();

  IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
  if (!Ty || Ty->getBitWidth() % 16 != 0)
    return false;

  // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
  SmallVector<StringRef, 4> AsmPieces;
  SplitString(AsmStr, AsmPieces, ";\n");

  switch (AsmPieces.size()) {
  default: return false;
  case 1:
    // FIXME: this should verify that we are targeting a 486 or better. If not,
    // we will turn this bswap into something that will be lowered to logical
    // ops instead of emitting the bswap asm. For now, we don't support 486 or
    // lower so don't worry about this.
    // bswap $0
    if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
        matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
        matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
      // No need to check constraints, nothing other than the equivalent of
      // "=r,0" would be valid here.
      return IntrinsicLowering::LowerToByteSwap(CI);
    }

    // rorw $$8, ${0:w} --> llvm.bswap.i16
    if (CI->getType()->isIntegerTy(16) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
         matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }
    break;
  case 3:
    if (CI->getType()->isIntegerTy(32) &&
        IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
        matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
        matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
        matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
      AsmPieces.clear();
      StringRef ConstraintsStr = IA->getConstraintString();
      SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
      array_pod_sort(AsmPieces.begin(), AsmPieces.end());
      if (clobbersFlagRegisters(AsmPieces))
        return IntrinsicLowering::LowerToByteSwap(CI);
    }

    if (CI->getType()->isIntegerTy(64)) {
      InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
      if (Constraints.size() >= 2 &&
          Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
          Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
        // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
        if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
            matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
            matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
          return IntrinsicLowering::LowerToByteSwap(CI);
      }
    }
    break;
  }
  return false;
}

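// Illustrative example (added commentary): given IR along the lines of
//   %r = call i32 asm "bswapl $0", "=r,0"(i32 %x)
// IntrinsicLowering::LowerToByteSwap() rewrites the call to
//   %r = call i32 @llvm.bswap.i32(i32 %x)
// which then selects to a plain BSWAP instruction, with no opaque inline-asm
// block left to inhibit optimization.
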
/// Given a constraint letter, return the type of constraint for this target.
X86TargetLowering::ConstraintType
X86TargetLowering::getConstraintType(StringRef Constraint) const {
  if (Constraint.size() == 1) {
    switch (Constraint[0]) {
    case 'R':
    case 'q':
    case 'Q':
    case 'f':
    case 't':
    case 'u':
    case 'y':
    case 'x':
    case 'v':
    case 'Y':
    case 'l':
      return C_RegisterClass;
    case 'k': // AVX512 masking registers.
    case 'a':
    case 'b':
    case 'c':
    case 'd':
    case 'S':
    case 'D':
    case 'A':
      return C_Register;
    case 'I':
    case 'J':
    case 'K':
    case 'L':
    case 'M':
    case 'N':
    case 'G':
    case 'C':
    case 'e':
    case 'Z':
      return C_Other;
    default:
      break;
    }
  } else if (Constraint.size() == 2) {
    switch (Constraint[0]) {
    default:
      break;
    case 'Y':
      switch (Constraint[1]) {
      default:
        break;
      case 'k':
        return C_Register;
      }
    }
  }
  return TargetLowering::getConstraintType(Constraint);
}

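// Illustrative summary (added commentary): "x" (any SSE register) is
// C_RegisterClass, "a" (the A register) is C_Register, and "I" (a constant
// in [0,31]) is C_Other; the two-letter "Yk" names an AVX512 mask register
// and is therefore also C_Register.
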
/// Examine constraint type and operand type and determine a weight value.
/// This object must already have been set up with the operand type
/// and the current alternative constraint selected.
TargetLowering::ConstraintWeight
X86TargetLowering::getSingleConstraintMatchWeight(
    AsmOperandInfo &info, const char *constraint) const {
  ConstraintWeight weight = CW_Invalid;
  Value *CallOperandVal = info.CallOperandVal;
  // If we don't have a value, we can't do a match,
  // but allow it at the lowest weight.
  if (!CallOperandVal)
    return CW_Default;
  Type *type = CallOperandVal->getType();
  // Look at the constraint type.
  switch (*constraint) {
  default:
    weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
    LLVM_FALLTHROUGH;
  case 'R':
  case 'q':
  case 'Q':
  case 'a':
  case 'b':
  case 'c':
  case 'd':
  case 'S':
  case 'D':
  case 'A':
    if (CallOperandVal->getType()->isIntegerTy())
      weight = CW_SpecificReg;
    break;
  case 'f':
  case 't':
  case 'u':
    if (type->isFloatingPointTy())
      weight = CW_SpecificReg;
    break;
  case 'y':
    if (type->isX86_MMXTy() && Subtarget.hasMMX())
      weight = CW_SpecificReg;
    break;
  case 'Y':
    // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
    if (constraint[1] == 'k') {
      // Support for 'Yk' (similarly to the 'k' variant below).
      weight = CW_SpecificReg;
      break;
    }
    // Else fall through (handle "Y" constraint).
    LLVM_FALLTHROUGH;
  case 'v':
    if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
      weight = CW_Register;
    LLVM_FALLTHROUGH;
  case 'x':
    if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
        ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
      weight = CW_Register;
    break;
  case 'k':
    // Enable conditional vector operations using %k<#> registers.
    weight = CW_SpecificReg;
    break;
  case 'I':
    if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
      if (C->getZExtValue() <= 31)
        weight = CW_Constant;
    }
    break;
  case 'J':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 63)
        weight = CW_Constant;
    }
    break;
  case 'K':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
        weight = CW_Constant;
    }
    break;
  case 'L':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
        weight = CW_Constant;
    }
    break;
  case 'M':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 3)
        weight = CW_Constant;
    }
    break;
  case 'N':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xff)
        weight = CW_Constant;
    }
    break;
  case 'G':
  case 'C':
    if (isa<ConstantFP>(CallOperandVal)) {
      weight = CW_Constant;
    }
    break;
  case 'e':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if ((C->getSExtValue() >= -0x80000000LL) &&
          (C->getSExtValue() <= 0x7fffffffLL))
        weight = CW_Constant;
    }
    break;
  case 'Z':
    if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
      if (C->getZExtValue() <= 0xffffffff)
        weight = CW_Constant;
    }
    break;
  }
  return weight;
}

/// Try to replace an X constraint, which matches anything, with another that
/// has more specific requirements based on the type of the corresponding
/// operand.
const char *X86TargetLowering::
LowerXConstraint(EVT ConstraintVT) const {
  // FP X constraints get lowered to SSE1/2 registers if available, otherwise
  // 'f' like normal targets.
  if (ConstraintVT.isFloatingPoint()) {
    if (Subtarget.hasSSE2())
      return "Y";
    if (Subtarget.hasSSE1())
      return "x";
  }

  return TargetLowering::LowerXConstraint(ConstraintVT);
}

/// Lower the specified operand into the Ops vector.
/// If it is invalid, don't add anything to Ops.
void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
                                                     std::string &Constraint,
                                                     std::vector<SDValue> &Ops,
                                                     SelectionDAG &DAG) const {
  SDValue Result;

  // Only support length 1 constraints for now.
  if (Constraint.length() > 1) return;

  char ConstraintLetter = Constraint[0];
  switch (ConstraintLetter) {
  default: break;
  case 'I':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 31) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'J':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 63) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'K':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (isInt<8>(C->getSExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'L':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
          (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'M':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 3) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'N':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 255) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'O':
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (C->getZExtValue() <= 127) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    return;
  case 'e': {
    // 32-bit signed value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getSExtValue())) {
        // Widen to 64 bits here to get it sign extended.
        Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
        break;
      }
      // FIXME gcc accepts some relocatable values here too, but only in
      // certain memory models; it's complicated.
    }
    return;
  }
  case 'Z': {
    // 32-bit unsigned value
    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
      if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
                                           C->getZExtValue())) {
        Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
                                       Op.getValueType());
        break;
      }
    }
    // FIXME gcc accepts some relocatable values here too, but only in certain
    // memory models; it's complicated.
    return;
  }
  case 'i': {
    // Literal immediates are always ok.
    if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
      // Widen to 64 bits here to get it sign extended.
      Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
      break;
    }

    // In any sort of PIC mode addresses need to be computed at runtime by
    // adding in a register or some sort of table lookup. These can't
    // be used as immediates.
    if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
      return;

    // If we are in non-pic codegen mode, we allow the address of a global
    // (with an optional displacement) to be used with 'i'.
    GlobalAddressSDNode *GA = nullptr;
    int64_t Offset = 0;

    // Match either (GA), (GA+C), (GA+C1+C2), etc.
    while (1) {
      if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
        Offset += GA->getOffset();
        break;
      } else if (Op.getOpcode() == ISD::ADD) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      } else if (Op.getOpcode() == ISD::SUB) {
        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
          Offset += -C->getZExtValue();
          Op = Op.getOperand(0);
          continue;
        }
      }

      // Otherwise, this isn't something we can handle, reject it.
      return;
    }

    const GlobalValue *GV = GA->getGlobal();
    // If we require an extra load to get this address, as in PIC mode, we
    // can't accept it.
    if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
      return;

    Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
                                        GA->getValueType(0), Offset);
    break;
  }
  }

  if (Result.getNode()) {
    Ops.push_back(Result);
    return;
  }
  return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
}

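// Illustrative example (added commentary): with the "I" constraint, which
// only accepts constants in [0, 31],
//   asm("shll %1, %0" : "+r"(x) : "I"(3));
// materializes the 3 as a target constant above, whereas an out-of-range
// value such as 42 adds nothing to Ops and is subsequently reported as an
// invalid operand for the inline asm.
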
/// Check if \p RC is a general purpose register class.
/// I.e., GR* or one of their variants.
static bool isGRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::GR8RegClass) ||
         RC.hasSuperClassEq(&X86::GR16RegClass) ||
         RC.hasSuperClassEq(&X86::GR32RegClass) ||
         RC.hasSuperClassEq(&X86::GR64RegClass) ||
         RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
}

/// Check if \p RC is a vector register class.
/// I.e., FR* / VR* or one of their variants.
static bool isFRClass(const TargetRegisterClass &RC) {
  return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
         RC.hasSuperClassEq(&X86::FR64XRegClass) ||
         RC.hasSuperClassEq(&X86::VR128XRegClass) ||
         RC.hasSuperClassEq(&X86::VR256XRegClass) ||
         RC.hasSuperClassEq(&X86::VR512RegClass);
}

std::pair<unsigned, const TargetRegisterClass *>
X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
                                                StringRef Constraint,
                                                MVT VT) const {
  // First, see if this is a constraint that directly corresponds to an LLVM
  // register class.
  if (Constraint.size() == 1) {
    // GCC Constraint Letters
    switch (Constraint[0]) {
    default: break;
    // TODO: Slight differences here in allocation order and leaving
    // RIP in the class. Do they matter any more here than they do
    // in the normal allocation?
    case 'k':
      if (Subtarget.hasAVX512()) {
        // Only supported in AVX512 or later.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32RegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16RegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8RegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1RegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64RegClass);
        }
      }
      break;
    case 'q':   // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
      if (Subtarget.is64Bit()) {
        if (VT == MVT::i32 || VT == MVT::f32)
          return std::make_pair(0U, &X86::GR32RegClass);
        if (VT == MVT::i16)
          return std::make_pair(0U, &X86::GR16RegClass);
        if (VT == MVT::i8 || VT == MVT::i1)
          return std::make_pair(0U, &X86::GR8RegClass);
        if (VT == MVT::i64 || VT == MVT::f64)
          return std::make_pair(0U, &X86::GR64RegClass);
        break;
      }
      LLVM_FALLTHROUGH;
      // 32-bit fallthrough
    case 'Q':   // Q_REGS
      if (VT == MVT::i32 || VT == MVT::f32)
        return std::make_pair(0U, &X86::GR32_ABCDRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_ABCDRegClass);
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
      if (VT == MVT::i64)
        return std::make_pair(0U, &X86::GR64_ABCDRegClass);
      break;
    case 'r':   // GENERAL_REGS
    case 'l':   // INDEX_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8RegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16RegClass);
      if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32RegClass);
      return std::make_pair(0U, &X86::GR64RegClass);
    case 'R':   // LEGACY_REGS
      if (VT == MVT::i8 || VT == MVT::i1)
        return std::make_pair(0U, &X86::GR8_NOREXRegClass);
      if (VT == MVT::i16)
        return std::make_pair(0U, &X86::GR16_NOREXRegClass);
      if (VT == MVT::i32 || !Subtarget.is64Bit())
        return std::make_pair(0U, &X86::GR32_NOREXRegClass);
      return std::make_pair(0U, &X86::GR64_NOREXRegClass);
    case 'f':   // FP Stack registers.
      // If SSE is enabled for this VT, use f80 to ensure the isel moves the
      // value to the correct fpstack register class.
      if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP32RegClass);
      if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
        return std::make_pair(0U, &X86::RFP64RegClass);
      return std::make_pair(0U, &X86::RFP80RegClass);
    case 'y':   // MMX_REGS if MMX allowed.
      if (!Subtarget.hasMMX()) break;
      return std::make_pair(0U, &X86::VR64RegClass);
    case 'Y':   // SSE_REGS if SSE2 allowed
      if (!Subtarget.hasSSE2()) break;
      LLVM_FALLTHROUGH;
    case 'v':
    case 'x':   // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
      if (!Subtarget.hasSSE1()) break;
      bool VConstraint = (Constraint[0] == 'v');

      switch (VT.SimpleTy) {
      default: break;
      // Scalar SSE types.
      case MVT::f32:
      case MVT::i32:
        if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR32XRegClass);
        return std::make_pair(0U, &X86::FR32RegClass);
      case MVT::f64:
      case MVT::i64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::FR64XRegClass);
        return std::make_pair(0U, &X86::FR64RegClass);
      // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
      // Vector types.
      case MVT::v16i8:
      case MVT::v8i16:
      case MVT::v4i32:
      case MVT::v2i64:
      case MVT::v4f32:
      case MVT::v2f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR128XRegClass);
        return std::make_pair(0U, &X86::VR128RegClass);
      // AVX types.
      case MVT::v32i8:
      case MVT::v16i16:
      case MVT::v8i32:
      case MVT::v4i64:
      case MVT::v8f32:
      case MVT::v4f64:
        if (VConstraint && Subtarget.hasVLX())
          return std::make_pair(0U, &X86::VR256XRegClass);
        return std::make_pair(0U, &X86::VR256RegClass);
      case MVT::v8f64:
      case MVT::v16f32:
      case MVT::v16i32:
      case MVT::v8i64:
        return std::make_pair(0U, &X86::VR512RegClass);
      }
      break;
    }
  } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
    switch (Constraint[1]) {
    default:
      break;
    case 'k':
      // This register class doesn't allocate k0 for masked vector operation.
      if (Subtarget.hasAVX512()) { // Only supported in AVX512.
        switch (VT.SimpleTy) {
        default: break;
        case MVT::i32:
          return std::make_pair(0U, &X86::VK32WMRegClass);
        case MVT::i16:
          return std::make_pair(0U, &X86::VK16WMRegClass);
        case MVT::i8:
          return std::make_pair(0U, &X86::VK8WMRegClass);
        case MVT::i1:
          return std::make_pair(0U, &X86::VK1WMRegClass);
        case MVT::i64:
          return std::make_pair(0U, &X86::VK64WMRegClass);
        }
      }
      break;
    }
  }

  // Use the default implementation in TargetLowering to convert the register
  // constraint into a member of a register class.
  std::pair<unsigned, const TargetRegisterClass*> Res;
  Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);

  // Not found as a standard register?
  if (!Res.second) {
    // Map st(0) -> st(7) -> ST0
    if (Constraint.size() == 7 && Constraint[0] == '{' &&
        tolower(Constraint[1]) == 's' &&
        tolower(Constraint[2]) == 't' &&
        Constraint[3] == '(' &&
        (Constraint[4] >= '0' && Constraint[4] <= '7') &&
        Constraint[5] == ')' &&
        Constraint[6] == '}') {
      Res.first = X86::FP0 + Constraint[4] - '0';
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // GCC allows "st(0)" to be called just plain "st".
    if (StringRef("{st}").equals_lower(Constraint)) {
      Res.first = X86::FP0;
      Res.second = &X86::RFP80RegClass;
      return Res;
    }

    // flags -> EFLAGS
    if (StringRef("{flags}").equals_lower(Constraint)) {
      Res.first = X86::EFLAGS;
      Res.second = &X86::CCRRegClass;
      return Res;
    }

    // 'A' means [ER]AX + [ER]DX.
    if (Constraint == "A") {
      if (Subtarget.is64Bit()) {
        Res.first = X86::RAX;
        Res.second = &X86::GR64_ADRegClass;
      } else {
        assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
               "Expecting 64, 32 or 16 bit subtarget");
        Res.first = X86::EAX;
        Res.second = &X86::GR32_ADRegClass;
      }
      return Res;
    }
    return Res;
  }

  // Otherwise, check to see if this is a register class of the wrong value
  // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
  // turn into {ax},{dx}.
  // MVT::Other is used to specify clobber names.
  if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
    return Res;   // Correct type already, nothing to do.

  // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
  // return "eax". This should even work for things like getting 64bit integer
  // registers when given an f64 type.
  const TargetRegisterClass *Class = Res.second;
  // The generic code will match the first register class that contains the
  // given register. Thus, based on the ordering of the tablegened file,
  // the "plain" GR classes might not come first.
  // Therefore, use a helper method.
  if (isGRClass(*Class)) {
    unsigned Size = VT.getSizeInBits();
    if (Size == 1) Size = 8;
    unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
    if (DestReg > 0) {
      Res.first = DestReg;
      Res.second = Size == 8 ? &X86::GR8RegClass
                 : Size == 16 ? &X86::GR16RegClass
                 : Size == 32 ? &X86::GR32RegClass
                 : &X86::GR64RegClass;
      assert(Res.second->contains(Res.first) && "Register in register class");
    } else {
      // No register found/type mismatch.
      Res.first = 0;
      Res.second = nullptr;
    }
  } else if (isFRClass(*Class)) {
    // Handle references to XMM physical registers that got mapped into the
    // wrong class. This can happen with constraints like {xmm0} where the
    // target independent register mapper will just pick the first match it can
    // find, ignoring the required type.

    // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
    if (VT == MVT::f32 || VT == MVT::i32)
      Res.second = &X86::FR32RegClass;
    else if (VT == MVT::f64 || VT == MVT::i64)
      Res.second = &X86::FR64RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
      Res.second = &X86::VR128RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
      Res.second = &X86::VR256RegClass;
    else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
      Res.second = &X86::VR512RegClass;
    else {
      // Type mismatch and not a clobber: Return an error.
      Res.first = 0;
      Res.second = nullptr;
    }
  }

  return Res;
}

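// Illustrative example (added commentary): the explicit register constraint
// "{ax}" with an i32 operand first resolves to AX, then the isGRClass()
// fix-up above re-sizes it to EAX in GR32; similarly, an "{xmm0}" operand of
// vector type has its class widened by the isFRClass() path so that the
// type is legal for the returned register class.
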
int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
                                            const AddrMode &AM, Type *Ty,
                                            unsigned AS) const {
  // Scaling factors are not free at all.
  // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
  // will take 2 allocations in the out of order engine instead of 1
  // for plain addressing mode, i.e. inst (reg1).
  // E.g.,
  // vaddps (%rsi,%rdx), %ymm0, %ymm1
  // Requires two allocations (one for the load, one for the computation)
  // whereas:
  // vaddps (%rsi), %ymm0, %ymm1
  // Requires just 1 allocation, i.e., freeing allocations for other operations
  // and having fewer micro operations to execute.
  //
  // For some X86 architectures, this is even worse because for instance for
  // stores, the complex addressing mode forces the instruction to use the
  // "load" ports instead of the dedicated "store" port.
  // E.g., on Haswell:
  // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
  // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
  if (isLegalAddressingMode(DL, AM, Ty, AS))
    // Scale represents reg2 * scale, thus account for 1
    // as soon as we use a second register.
    return AM.Scale != 0;
  return -1;
}

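// For example (illustrative): "vaddps (%rsi,%rdx,4), %ymm0, %ymm1" has
// AM.Scale == 4 and so costs 1, plain "vaddps (%rsi), %ymm0, %ymm1" has
// AM.Scale == 0 and costs 0, and an illegal addressing mode reports -1.
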
bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
  // Integer division on x86 is expensive. However, when aggressively
  // optimizing for code size, we prefer to use a div instruction, as it is
  // usually smaller than the alternative sequence.
  // The exception to this is vector division. Since x86 doesn't have vector
  // integer division, leaving the division as-is is a loss even in terms of
  // size, because it will have to be scalarized, while the alternative code
  // sequence can be performed in vector form.
  bool OptSize =
      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
  return OptSize && !VT.isVector();
}

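// Illustrative consequence (added commentary): under minsize, a scalar
// "x / 10" keeps the small idiv instruction, while without minsize (and for
// all vector types) the division is instead expanded into the usual
// multiply-by-magic-constant sequence.
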
void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
  if (!Subtarget.is64Bit())
    return;

  // Update IsSplitCSR in X86MachineFunctionInfo.
  X86MachineFunctionInfo *AFI =
      Entry->getParent()->getInfo<X86MachineFunctionInfo>();
  AFI->setIsSplitCSR(true);
}

void X86TargetLowering::insertCopiesSplitCSR(
    MachineBasicBlock *Entry,
    const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
  const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
  if (!IStart)
    return;

  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
  MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
  MachineBasicBlock::iterator MBBI = Entry->begin();
  for (const MCPhysReg *I = IStart; *I; ++I) {
    const TargetRegisterClass *RC = nullptr;
    if (X86::GR64RegClass.contains(*I))
      RC = &X86::GR64RegClass;
    else
      llvm_unreachable("Unexpected register class in CSRsViaCopy!");

    unsigned NewVR = MRI->createVirtualRegister(RC);
    // Create copy from CSR to a virtual register.
    // FIXME: this currently does not emit CFI pseudo-instructions, it works
    // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
    // nounwind. If we want to generalize this later, we may need to emit
    // CFI pseudo-instructions.
    assert(Entry->getParent()->getFunction()->hasFnAttribute(
               Attribute::NoUnwind) &&
           "Function should be nounwind in insertCopiesSplitCSR!");
    Entry->addLiveIn(*I);
    BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
        .addReg(*I);

    // Insert the copy-back instructions right before the terminator.
    for (auto *Exit : Exits)
      BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
              TII->get(TargetOpcode::COPY), *I)
          .addReg(NewVR);
  }
}

bool X86TargetLowering::supportSwiftError() const {
  return Subtarget.is64Bit();
}

/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
  // If the function specifically requests stack probes, emit them.
  if (MF.getFunction()->hasFnAttribute("probe-stack"))
    return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();

  // Generally, if we aren't on Windows, the platform ABI does not include
  // support for stack probes, so don't emit them.
  if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
    return "";

  // We need a stack probe to conform to the Windows ABI. Choose the right
  // symbol.
  if (Subtarget.is64Bit())
    return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
  return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";
}
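
// Illustrative mapping (added commentary): 64-bit MSVC targets probe via
// "__chkstk", 64-bit MinGW via "___chkstk_ms", 32-bit MSVC via "_chkstk",
// and 32-bit MinGW via "_alloca"; non-Windows targets return the empty
// string and emit no probe unless "probe-stack" is set on the function.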