//===-- X86ISelLowering.cpp - X86 DAG Lowering Implementation -------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This file defines the interfaces that X86 uses to lower LLVM code into a
// selection DAG.
//
//===----------------------------------------------------------------------===//
15 #include "X86ISelLowering.h"
16 #include "Utils/X86ShuffleDecode.h"
17 #include "X86CallingConv.h"
18 #include "X86FrameLowering.h"
19 #include "X86InstrBuilder.h"
20 #include "X86IntrinsicsInfo.h"
21 #include "X86MachineFunctionInfo.h"
22 #include "X86ShuffleDecodeConstantPool.h"
23 #include "X86TargetMachine.h"
24 #include "X86TargetObjectFile.h"
25 #include "llvm/ADT/SmallBitVector.h"
26 #include "llvm/ADT/SmallSet.h"
27 #include "llvm/ADT/Statistic.h"
28 #include "llvm/ADT/StringExtras.h"
29 #include "llvm/ADT/StringSwitch.h"
30 #include "llvm/Analysis/EHPersonalities.h"
31 #include "llvm/CodeGen/IntrinsicLowering.h"
32 #include "llvm/CodeGen/MachineFrameInfo.h"
33 #include "llvm/CodeGen/MachineFunction.h"
34 #include "llvm/CodeGen/MachineInstrBuilder.h"
35 #include "llvm/CodeGen/MachineJumpTableInfo.h"
36 #include "llvm/CodeGen/MachineModuleInfo.h"
37 #include "llvm/CodeGen/MachineRegisterInfo.h"
38 #include "llvm/CodeGen/WinEHFuncInfo.h"
39 #include "llvm/IR/CallSite.h"
40 #include "llvm/IR/CallingConv.h"
41 #include "llvm/IR/Constants.h"
42 #include "llvm/IR/DerivedTypes.h"
43 #include "llvm/IR/DiagnosticInfo.h"
44 #include "llvm/IR/Function.h"
45 #include "llvm/IR/GlobalAlias.h"
46 #include "llvm/IR/GlobalVariable.h"
47 #include "llvm/IR/Instructions.h"
48 #include "llvm/IR/Intrinsics.h"
49 #include "llvm/MC/MCAsmInfo.h"
50 #include "llvm/MC/MCContext.h"
51 #include "llvm/MC/MCExpr.h"
52 #include "llvm/MC/MCSymbol.h"
53 #include "llvm/Support/CommandLine.h"
54 #include "llvm/Support/Debug.h"
55 #include "llvm/Support/ErrorHandling.h"
56 #include "llvm/Support/KnownBits.h"
57 #include "llvm/Support/MathExtras.h"
58 #include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
#include <cctype>
#include <numeric>

using namespace llvm;

#define DEBUG_TYPE "x86-isel"
68 STATISTIC(NumTailCalls, "Number of tail calls");
static cl::opt<bool> ExperimentalVectorWideningLegalization(
    "x86-experimental-vector-widening-legalization", cl::init(false),
    cl::desc("Enable an experimental vector type legalization through widening "
             "rather than promotion."),
    cl::Hidden);
static cl::opt<int> ExperimentalPrefLoopAlignment(
    "x86-experimental-pref-loop-alignment", cl::init(4),
    cl::desc("Sets the preferable loop alignment for experiments "
             "(the last x86-experimental-pref-loop-alignment bits"
             " of the loop header PC will be 0)."),
    cl::Hidden);
static cl::opt<bool> MulConstantOptimization(
    "mul-constant-optimization", cl::init(true),
    cl::desc("Replace 'mul x, Const' with more effective instructions like "
             "SHIFT, LEA, etc."),
    cl::Hidden);
/// Call this when the user attempts to do something unsupported, like
/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
/// report_fatal_error, so calling code should attempt to recover without
/// crashing.
static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
                             const char *Msg) {
  MachineFunction &MF = DAG.getMachineFunction();
  DAG.getContext()->diagnose(
      DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
}
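// Illustrative use (not part of the original source): a lowering routine that
// cannot honor a calling convention might report it roughly as
//   errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
// which surfaces a DiagnosticInfoUnsupported instead of aborting the compile.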
100 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
101 const X86Subtarget &STI)
102 : TargetLowering(TM), Subtarget(STI) {
103 bool UseX87 = !Subtarget.useSoftFloat() && Subtarget.hasX87();
104 X86ScalarSSEf64 = Subtarget.hasSSE2();
105 X86ScalarSSEf32 = Subtarget.hasSSE1();
106 MVT PtrVT = MVT::getIntegerVT(8 * TM.getPointerSize());
108 // Set up the TargetLowering object.
110 // X86 is weird. It always uses i8 for shift amounts and setcc results.
111 setBooleanContents(ZeroOrOneBooleanContent);
112 // X86-SSE is even stranger. It uses -1 or 0 for vector masks.
113 setBooleanVectorContents(ZeroOrNegativeOneBooleanContent);
115 // For 64-bit, since we have so many registers, use the ILP scheduler.
116 // For 32-bit, use the register pressure specific scheduling.
117 // For Atom, always use ILP scheduling.
118 if (Subtarget.isAtom())
119 setSchedulingPreference(Sched::ILP);
120 else if (Subtarget.is64Bit())
    setSchedulingPreference(Sched::ILP);
  else
    setSchedulingPreference(Sched::RegPressure);
124 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
125 setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister());
127 // Bypass expensive divides and use cheaper ones.
128 if (TM.getOptLevel() >= CodeGenOpt::Default) {
129 if (Subtarget.hasSlowDivide32())
130 addBypassSlowDiv(32, 8);
131 if (Subtarget.hasSlowDivide64() && Subtarget.is64Bit())
      addBypassSlowDiv(64, 32);
  }
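// Rough sketch of what the bypass above buys (illustrative only): for a
// 32-bit unsigned divide whose operands happen to fit in 8 bits at run time,
// CodeGenPrepare emits approximately
//   if (((a | b) >> 8) == 0)  q = (uint8_t)a / (uint8_t)b;   // cheap div r/m8
//   else                      q = a / b;                     // full divide
// so hot paths with small values skip the slow wide divider.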
135 if (Subtarget.isTargetKnownWindowsMSVC() ||
136 Subtarget.isTargetWindowsItanium()) {
137 // Setup Windows compiler runtime calls.
138 setLibcallName(RTLIB::SDIV_I64, "_alldiv");
139 setLibcallName(RTLIB::UDIV_I64, "_aulldiv");
140 setLibcallName(RTLIB::SREM_I64, "_allrem");
141 setLibcallName(RTLIB::UREM_I64, "_aullrem");
142 setLibcallName(RTLIB::MUL_I64, "_allmul");
143 setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::X86_StdCall);
144 setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::X86_StdCall);
145 setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::X86_StdCall);
146 setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::X86_StdCall);
    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
  }
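// Illustrative effect of the setup above (not in the original source): on a
// 32-bit MSVC-style target, an i64 division that cannot be selected directly
// becomes a call to the CRT helper registered here, roughly
//   %q = sdiv i64 %a, %b   ==>   call x86_stdcallcc i64 @_alldiv(i64 %a, i64 %b)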
150 if (Subtarget.isTargetDarwin()) {
151 // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
152 setUseUnderscoreSetJmp(false);
153 setUseUnderscoreLongJmp(false);
154 } else if (Subtarget.isTargetWindowsGNU()) {
155 // MS runtime is weird: it exports _setjmp, but longjmp!
156 setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(false);
  } else {
    setUseUnderscoreSetJmp(true);
    setUseUnderscoreLongJmp(true);
  }
163 // Set up the register classes.
164 addRegisterClass(MVT::i8, &X86::GR8RegClass);
165 addRegisterClass(MVT::i16, &X86::GR16RegClass);
166 addRegisterClass(MVT::i32, &X86::GR32RegClass);
167 if (Subtarget.is64Bit())
168 addRegisterClass(MVT::i64, &X86::GR64RegClass);
170 for (MVT VT : MVT::integer_valuetypes())
171 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
173 // We don't accept any truncstore of integer registers.
174 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
175 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
176 setTruncStoreAction(MVT::i64, MVT::i8 , Expand);
177 setTruncStoreAction(MVT::i32, MVT::i16, Expand);
178 setTruncStoreAction(MVT::i32, MVT::i8 , Expand);
179 setTruncStoreAction(MVT::i16, MVT::i8, Expand);
181 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
183 // SETOEQ and SETUNE require checking two conditions.
184 setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
185 setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
186 setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
187 setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
188 setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
189 setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
  // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
  // operation.
193 setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
194 setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
195 setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
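// Rough picture of the promotion above (illustrative only): the unsigned
// small-integer converts are widened so the signed hardware convert can be
// reused, e.g.
//   uitofp i16 %x to double  ==>  %w = zext i16 %x to i32
//                                 sitofp i32 %w to double   ; cvtsi2sd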
197 if (Subtarget.is64Bit()) {
198 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512())
199 // f32/f64 are legal, f80 is custom.
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
    else
      setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Promote);
203 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
204 } else if (!Subtarget.useSoftFloat()) {
205 // We have an algorithm for SSE2->double, and we turn this into a
206 // 64-bit FILD followed by conditional FADD for other targets.
207 setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
208 // We have an algorithm for SSE2, and we turn this into a 64-bit
209 // FILD or VCVTUSI2SS/SD for other targets.
    setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
  }
  // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
  // this operation.
215 setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
216 setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
218 if (!Subtarget.useSoftFloat()) {
219 // SSE has no i16 to fp conversion, only i32.
220 if (X86ScalarSSEf32) {
221 setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
222 // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
      setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
    setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Promote);
  }
  // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINT's, as X86 doesn't have
  // this operation.
235 setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
236 setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
238 if (!Subtarget.useSoftFloat()) {
239 // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
240 // are Legal, f80 is custom lowered.
241 setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
242 setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
244 if (X86ScalarSSEf32) {
245 setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
246 // f32 and f64 cases are Legal, f80 case is not
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
      setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
    }
  } else {
    setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
    setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
    setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
  }
  // Handle FP_TO_UINT by promoting the destination to a larger signed
  // conversion.
260 setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
261 setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
262 setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
264 if (Subtarget.is64Bit()) {
265 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
266 // FP_TO_UINT-i32/i64 is legal for f32/f64, but custom for f80.
267 setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
    } else {
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Promote);
      setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Expand);
    }
273 } else if (!Subtarget.useSoftFloat()) {
274 // Since AVX is a superset of SSE3, only check for SSE here.
275 if (Subtarget.hasSSE1() && !Subtarget.hasSSE3())
276 // Expand FP_TO_UINT into a select.
277 // FIXME: We would like to use a Custom expander here eventually to do
278 // the optimal thing for SSE vs. the default expansion in the legalizer.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Expand);
    else
      // With AVX512 we can use vcvts[ds]2usi for f32/f64->i32, f80 is custom.
      // With SSE3 we can use fisttpll to convert to a signed i64; without
      // SSE, we're stuck with a fistpll.
      setOperationAction(ISD::FP_TO_UINT , MVT::i32 , Custom);

    setOperationAction(ISD::FP_TO_UINT , MVT::i64 , Custom);
  }
289 // TODO: when we have SSE, these could be more efficient, by using movd/movq.
290 if (!X86ScalarSSEf64) {
291 setOperationAction(ISD::BITCAST , MVT::f32 , Expand);
292 setOperationAction(ISD::BITCAST , MVT::i32 , Expand);
293 if (Subtarget.is64Bit()) {
294 setOperationAction(ISD::BITCAST , MVT::f64 , Expand);
295 // Without SSE, i64->f64 goes through memory.
      setOperationAction(ISD::BITCAST , MVT::i64 , Expand);
    }
  } else if (!Subtarget.is64Bit())
299 setOperationAction(ISD::BITCAST , MVT::i64 , Custom);
301 // Scalar integer divide and remainder are lowered to use operations that
302 // produce two results, to match the available instructions. This exposes
303 // the two-result form to trivial CSE, which is able to combine x/y and x%y
304 // into a single instruction.
306 // Scalar integer multiply-high is also lowered to use two-result
307 // operations, to match the available instructions. However, plain multiply
308 // (low) operations are left as Legal, as there are single-result
309 // instructions for this in x86. Using the two-result multiply instructions
310 // when both high and low results are needed must be arranged by dagcombine.
311 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
312 setOperationAction(ISD::MULHS, VT, Expand);
313 setOperationAction(ISD::MULHU, VT, Expand);
314 setOperationAction(ISD::SDIV, VT, Expand);
315 setOperationAction(ISD::UDIV, VT, Expand);
316 setOperationAction(ISD::SREM, VT, Expand);
    setOperationAction(ISD::UREM, VT, Expand);
  }
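// Sketch of the two-result benefit described above (illustrative only): with
// SDIV/SREM expanded to ISD::SDIVREM, a pair such as
//   %q = sdiv i32 %a, %b
//   %r = srem i32 %a, %b
// is CSE'd into one divide node and selected to a single 'idiv', leaving the
// quotient in EAX and the remainder in EDX.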
320 setOperationAction(ISD::BR_JT , MVT::Other, Expand);
321 setOperationAction(ISD::BRCOND , MVT::Other, Custom);
322 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128,
323 MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
324 setOperationAction(ISD::BR_CC, VT, Expand);
    setOperationAction(ISD::SELECT_CC, VT, Expand);
  }
327 if (Subtarget.is64Bit())
328 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i32, Legal);
329 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i16 , Legal);
330 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i8 , Legal);
331 setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1 , Expand);
332 setOperationAction(ISD::FP_ROUND_INREG , MVT::f32 , Expand);
334 setOperationAction(ISD::FREM , MVT::f32 , Expand);
335 setOperationAction(ISD::FREM , MVT::f64 , Expand);
336 setOperationAction(ISD::FREM , MVT::f80 , Expand);
337 setOperationAction(ISD::FLT_ROUNDS_ , MVT::i32 , Custom);
  // Promote the i8 variants and force them on up to i32 which has a shorter
  // encoding.
341 setOperationPromotedToType(ISD::CTTZ , MVT::i8 , MVT::i32);
342 setOperationPromotedToType(ISD::CTTZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
343 if (!Subtarget.hasBMI()) {
344 setOperationAction(ISD::CTTZ , MVT::i16 , Custom);
345 setOperationAction(ISD::CTTZ , MVT::i32 , Custom);
346 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i16 , Legal);
347 setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i32 , Legal);
348 if (Subtarget.is64Bit()) {
349 setOperationAction(ISD::CTTZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTTZ_ZERO_UNDEF, MVT::i64, Legal);
    }
  }
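// Illustrative note (not in the original source): without BMI, plain CTTZ is
// Custom so the lowering can paper over BSF's undefined result for a zero
// input, roughly 'bsf' plus a CMOV that substitutes the bit width, while
// CTTZ_ZERO_UNDEF maps directly onto BSF.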
354 if (Subtarget.hasLZCNT()) {
    // When promoting the i8 variants, force them to i32 for a shorter
    // encoding.
    setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
    setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
  } else {
    setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
361 setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
362 setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
363 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
364 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
365 setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
366 if (Subtarget.is64Bit()) {
367 setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
      setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
    }
  }
372 // Special handling for half-precision floating point conversions.
373 // If we don't have F16C support, then lower half float conversions
374 // into library calls.
375 if (Subtarget.useSoftFloat() ||
376 (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
377 setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
    setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
  }

  // There's never any support for operations beyond MVT::f32.
382 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
383 setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
384 setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
385 setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
387 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
388 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
389 setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
390 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
391 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
392 setTruncStoreAction(MVT::f80, MVT::f16, Expand);
394 if (Subtarget.hasPOPCNT()) {
    setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
  } else {
    setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
    setOperationAction(ISD::CTPOP , MVT::i32 , Expand);
    if (Subtarget.is64Bit())
      setOperationAction(ISD::CTPOP , MVT::i64 , Expand);
  }
404 setOperationAction(ISD::READCYCLECOUNTER , MVT::i64 , Custom);
406 if (!Subtarget.hasMOVBE())
407 setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
409 // These should be promoted to a larger select which is supported.
410 setOperationAction(ISD::SELECT , MVT::i1 , Promote);
411 // X86 wants to expand cmov itself.
412 for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
413 setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
  for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SELECT, VT, Custom);
    setOperationAction(ISD::SETCC, VT, Custom);
  }
423 // Custom action for SELECT MMX and expand action for SELECT_CC MMX
424 setOperationAction(ISD::SELECT, MVT::x86mmx, Custom);
425 setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
427 setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
428 // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
429 // SjLj exception handling but a light-weight setjmp/longjmp replacement to
  // support continuation, user-level threading, and so on. As a result, no
  // other SjLj exception interfaces are implemented, so please don't build
432 // your own exception handling based on them.
433 // LLVM/Clang supports zero-cost DWARF exception handling.
434 setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
435 setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
436 setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
437 if (TM.Options.ExceptionModel == ExceptionHandling::SjLj)
438 setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume");
  // Darwin ABI issue.
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::ConstantPool , VT, Custom);
    setOperationAction(ISD::JumpTable , VT, Custom);
    setOperationAction(ISD::GlobalAddress , VT, Custom);
    setOperationAction(ISD::GlobalTLSAddress, VT, Custom);
    setOperationAction(ISD::ExternalSymbol , VT, Custom);
    setOperationAction(ISD::BlockAddress , VT, Custom);
  }
452 // 64-bit shl, sra, srl (iff 32-bit x86)
  for (auto VT : { MVT::i32, MVT::i64 }) {
    if (VT == MVT::i64 && !Subtarget.is64Bit())
      continue;
    setOperationAction(ISD::SHL_PARTS, VT, Custom);
    setOperationAction(ISD::SRA_PARTS, VT, Custom);
    setOperationAction(ISD::SRL_PARTS, VT, Custom);
  }
461 if (Subtarget.hasSSE1())
462 setOperationAction(ISD::PREFETCH , MVT::Other, Legal);
464 setOperationAction(ISD::ATOMIC_FENCE , MVT::Other, Custom);
466 // Expand certain atomics
467 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
468 setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Custom);
469 setOperationAction(ISD::ATOMIC_LOAD_SUB, VT, Custom);
470 setOperationAction(ISD::ATOMIC_LOAD_ADD, VT, Custom);
471 setOperationAction(ISD::ATOMIC_LOAD_OR, VT, Custom);
472 setOperationAction(ISD::ATOMIC_LOAD_XOR, VT, Custom);
473 setOperationAction(ISD::ATOMIC_LOAD_AND, VT, Custom);
    setOperationAction(ISD::ATOMIC_STORE, VT, Custom);
  }
477 if (Subtarget.hasCmpxchg16b()) {
    setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, MVT::i128, Custom);
  }
481 // FIXME - use subtarget debug flags
482 if (!Subtarget.isTargetDarwin() && !Subtarget.isTargetELF() &&
483 !Subtarget.isTargetCygMing() && !Subtarget.isTargetWin64() &&
484 TM.Options.ExceptionModel != ExceptionHandling::SjLj) {
    setOperationAction(ISD::EH_LABEL, MVT::Other, Expand);
  }
488 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom);
489 setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i64, Custom);
491 setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
492 setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
494 setOperationAction(ISD::TRAP, MVT::Other, Legal);
495 setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
497 // VASTART needs to be custom lowered to use the VarArgsFrameIndex
498 setOperationAction(ISD::VASTART , MVT::Other, Custom);
499 setOperationAction(ISD::VAEND , MVT::Other, Expand);
500 bool Is64Bit = Subtarget.is64Bit();
501 setOperationAction(ISD::VAARG, MVT::Other, Is64Bit ? Custom : Expand);
502 setOperationAction(ISD::VACOPY, MVT::Other, Is64Bit ? Custom : Expand);
504 setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
505 setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
507 setOperationAction(ISD::DYNAMIC_STACKALLOC, PtrVT, Custom);
509 // GC_TRANSITION_START and GC_TRANSITION_END need custom lowering.
510 setOperationAction(ISD::GC_TRANSITION_START, MVT::Other, Custom);
511 setOperationAction(ISD::GC_TRANSITION_END, MVT::Other, Custom);
513 if (!Subtarget.useSoftFloat() && X86ScalarSSEf64) {
514 // f32 and f64 use SSE.
515 // Set up the FP register classes.
516 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
517 : &X86::FR32RegClass);
518 addRegisterClass(MVT::f64, Subtarget.hasAVX512() ? &X86::FR64XRegClass
519 : &X86::FR64RegClass);
521 for (auto VT : { MVT::f32, MVT::f64 }) {
522 // Use ANDPD to simulate FABS.
523 setOperationAction(ISD::FABS, VT, Custom);
525 // Use XORP to simulate FNEG.
526 setOperationAction(ISD::FNEG, VT, Custom);
528 // Use ANDPD and ORPD to simulate FCOPYSIGN.
529 setOperationAction(ISD::FCOPYSIGN, VT, Custom);
531 // We don't support sin/cos/fmod
532 setOperationAction(ISD::FSIN , VT, Expand);
533 setOperationAction(ISD::FCOS , VT, Expand);
      setOperationAction(ISD::FSINCOS, VT, Expand);
    }
537 // Lower this to MOVMSK plus an AND.
538 setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
539 setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
    // Expand FP immediates into loads from the stack, except for the special
    // cases we handle.
543 addLegalFPImmediate(APFloat(+0.0)); // xorpd
544 addLegalFPImmediate(APFloat(+0.0f)); // xorps
545 } else if (UseX87 && X86ScalarSSEf32) {
546 // Use SSE for f32, x87 for f64.
547 // Set up the FP register classes.
548 addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
549 : &X86::FR32RegClass);
550 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
552 // Use ANDPS to simulate FABS.
553 setOperationAction(ISD::FABS , MVT::f32, Custom);
555 // Use XORP to simulate FNEG.
556 setOperationAction(ISD::FNEG , MVT::f32, Custom);
558 setOperationAction(ISD::UNDEF, MVT::f64, Expand);
560 // Use ANDPS and ORPS to simulate FCOPYSIGN.
561 setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
562 setOperationAction(ISD::FCOPYSIGN, MVT::f32, Custom);
564 // We don't support sin/cos/fmod
565 setOperationAction(ISD::FSIN , MVT::f32, Expand);
566 setOperationAction(ISD::FCOS , MVT::f32, Expand);
567 setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
569 // Special cases we handle for FP constants.
570 addLegalFPImmediate(APFloat(+0.0f)); // xorps
571 addLegalFPImmediate(APFloat(+0.0)); // FLD0
572 addLegalFPImmediate(APFloat(+1.0)); // FLD1
573 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
574 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
576 if (!TM.Options.UnsafeFPMath) {
577 setOperationAction(ISD::FSIN , MVT::f64, Expand);
578 setOperationAction(ISD::FCOS , MVT::f64, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
    }
  } else if (UseX87) {
    // f32 and f64 in x87.
583 // Set up the FP register classes.
584 addRegisterClass(MVT::f64, &X86::RFP64RegClass);
585 addRegisterClass(MVT::f32, &X86::RFP32RegClass);
587 for (auto VT : { MVT::f32, MVT::f64 }) {
588 setOperationAction(ISD::UNDEF, VT, Expand);
589 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
591 if (!TM.Options.UnsafeFPMath) {
592 setOperationAction(ISD::FSIN , VT, Expand);
593 setOperationAction(ISD::FCOS , VT, Expand);
        setOperationAction(ISD::FSINCOS, VT, Expand);
      }
    }
597 addLegalFPImmediate(APFloat(+0.0)); // FLD0
598 addLegalFPImmediate(APFloat(+1.0)); // FLD1
599 addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
600 addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
601 addLegalFPImmediate(APFloat(+0.0f)); // FLD0
602 addLegalFPImmediate(APFloat(+1.0f)); // FLD1
603 addLegalFPImmediate(APFloat(-0.0f)); // FLD0/FCHS
    addLegalFPImmediate(APFloat(-1.0f)); // FLD1/FCHS
  }
607 // We don't support FMA.
608 setOperationAction(ISD::FMA, MVT::f64, Expand);
609 setOperationAction(ISD::FMA, MVT::f32, Expand);
  // Long double always uses X87, except f128 in MMX.
  if (UseX87) {
613 if (Subtarget.is64Bit() && Subtarget.hasMMX()) {
614 addRegisterClass(MVT::f128, &X86::FR128RegClass);
615 ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
616 setOperationAction(ISD::FABS , MVT::f128, Custom);
617 setOperationAction(ISD::FNEG , MVT::f128, Custom);
      setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
    }
621 addRegisterClass(MVT::f80, &X86::RFP80RegClass);
622 setOperationAction(ISD::UNDEF, MVT::f80, Expand);
623 setOperationAction(ISD::FCOPYSIGN, MVT::f80, Expand);
    {
      APFloat TmpFlt = APFloat::getZero(APFloat::x87DoubleExtended());
      addLegalFPImmediate(TmpFlt); // FLD0
      TmpFlt.changeSign();
      addLegalFPImmediate(TmpFlt); // FLD0/FCHS

      bool ignored;
      APFloat TmpFlt2(+1.0);
      TmpFlt2.convert(APFloat::x87DoubleExtended(), APFloat::rmNearestTiesToEven,
                      &ignored);
      addLegalFPImmediate(TmpFlt2); // FLD1
      TmpFlt2.changeSign();
      addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
    }
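// Why these immediates are registered (illustrative sketch only): an f80
// constant not listed here is materialized through a constant-pool load,
// whereas +/-0.0 and +/-1.0 can be built register-only with FLD0/FLD1
// followed by FCHS for the negative variants.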
639 if (!TM.Options.UnsafeFPMath) {
640 setOperationAction(ISD::FSIN , MVT::f80, Expand);
641 setOperationAction(ISD::FCOS , MVT::f80, Expand);
      setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
    }
645 setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
646 setOperationAction(ISD::FCEIL, MVT::f80, Expand);
647 setOperationAction(ISD::FTRUNC, MVT::f80, Expand);
648 setOperationAction(ISD::FRINT, MVT::f80, Expand);
649 setOperationAction(ISD::FNEARBYINT, MVT::f80, Expand);
    setOperationAction(ISD::FMA, MVT::f80, Expand);
  }
653 // Always use a library call for pow.
654 setOperationAction(ISD::FPOW , MVT::f32 , Expand);
655 setOperationAction(ISD::FPOW , MVT::f64 , Expand);
656 setOperationAction(ISD::FPOW , MVT::f80 , Expand);
658 setOperationAction(ISD::FLOG, MVT::f80, Expand);
659 setOperationAction(ISD::FLOG2, MVT::f80, Expand);
660 setOperationAction(ISD::FLOG10, MVT::f80, Expand);
661 setOperationAction(ISD::FEXP, MVT::f80, Expand);
662 setOperationAction(ISD::FEXP2, MVT::f80, Expand);
663 setOperationAction(ISD::FMINNUM, MVT::f80, Expand);
664 setOperationAction(ISD::FMAXNUM, MVT::f80, Expand);
666 // Some FP actions are always expanded for vector types.
667 for (auto VT : { MVT::v4f32, MVT::v8f32, MVT::v16f32,
668 MVT::v2f64, MVT::v4f64, MVT::v8f64 }) {
669 setOperationAction(ISD::FSIN, VT, Expand);
670 setOperationAction(ISD::FSINCOS, VT, Expand);
671 setOperationAction(ISD::FCOS, VT, Expand);
672 setOperationAction(ISD::FREM, VT, Expand);
673 setOperationAction(ISD::FCOPYSIGN, VT, Expand);
674 setOperationAction(ISD::FPOW, VT, Expand);
675 setOperationAction(ISD::FLOG, VT, Expand);
676 setOperationAction(ISD::FLOG2, VT, Expand);
677 setOperationAction(ISD::FLOG10, VT, Expand);
678 setOperationAction(ISD::FEXP, VT, Expand);
    setOperationAction(ISD::FEXP2, VT, Expand);
  }
682 // First set operation action for all vector types to either promote
683 // (for widening) or expand (for scalarization). Then we will selectively
684 // turn on ones that can be effectively codegen'd.
685 for (MVT VT : MVT::vector_valuetypes()) {
686 setOperationAction(ISD::SDIV, VT, Expand);
687 setOperationAction(ISD::UDIV, VT, Expand);
688 setOperationAction(ISD::SREM, VT, Expand);
689 setOperationAction(ISD::UREM, VT, Expand);
690 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT,Expand);
691 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Expand);
692 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT,Expand);
693 setOperationAction(ISD::INSERT_SUBVECTOR, VT,Expand);
694 setOperationAction(ISD::FMA, VT, Expand);
695 setOperationAction(ISD::FFLOOR, VT, Expand);
696 setOperationAction(ISD::FCEIL, VT, Expand);
697 setOperationAction(ISD::FTRUNC, VT, Expand);
698 setOperationAction(ISD::FRINT, VT, Expand);
699 setOperationAction(ISD::FNEARBYINT, VT, Expand);
700 setOperationAction(ISD::SMUL_LOHI, VT, Expand);
701 setOperationAction(ISD::MULHS, VT, Expand);
702 setOperationAction(ISD::UMUL_LOHI, VT, Expand);
703 setOperationAction(ISD::MULHU, VT, Expand);
704 setOperationAction(ISD::SDIVREM, VT, Expand);
705 setOperationAction(ISD::UDIVREM, VT, Expand);
706 setOperationAction(ISD::CTPOP, VT, Expand);
707 setOperationAction(ISD::CTTZ, VT, Expand);
708 setOperationAction(ISD::CTLZ, VT, Expand);
709 setOperationAction(ISD::ROTL, VT, Expand);
710 setOperationAction(ISD::ROTR, VT, Expand);
711 setOperationAction(ISD::BSWAP, VT, Expand);
712 setOperationAction(ISD::SETCC, VT, Expand);
713 setOperationAction(ISD::FP_TO_UINT, VT, Expand);
714 setOperationAction(ISD::FP_TO_SINT, VT, Expand);
715 setOperationAction(ISD::UINT_TO_FP, VT, Expand);
716 setOperationAction(ISD::SINT_TO_FP, VT, Expand);
717 setOperationAction(ISD::SIGN_EXTEND_INREG, VT,Expand);
718 setOperationAction(ISD::TRUNCATE, VT, Expand);
719 setOperationAction(ISD::SIGN_EXTEND, VT, Expand);
720 setOperationAction(ISD::ZERO_EXTEND, VT, Expand);
721 setOperationAction(ISD::ANY_EXTEND, VT, Expand);
722 setOperationAction(ISD::SELECT_CC, VT, Expand);
723 for (MVT InnerVT : MVT::vector_valuetypes()) {
724 setTruncStoreAction(InnerVT, VT, Expand);
726 setLoadExtAction(ISD::SEXTLOAD, InnerVT, VT, Expand);
727 setLoadExtAction(ISD::ZEXTLOAD, InnerVT, VT, Expand);
729 // N.b. ISD::EXTLOAD legality is basically ignored except for i1-like
730 // types, we have to deal with them whether we ask for Expansion or not.
      // Setting Expand causes its own optimisation problems though, so leave
      // them unexpanded.
733 if (VT.getVectorElementType() == MVT::i1)
734 setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
736 // EXTLOAD for MVT::f16 vectors is not legal because f16 vectors are
737 // split/scalarized right now.
738 if (VT.getVectorElementType() == MVT::f16)
        setLoadExtAction(ISD::EXTLOAD, InnerVT, VT, Expand);
    }
  }
743 // FIXME: In order to prevent SSE instructions being expanded to MMX ones
744 // with -msoft-float, disable use of MMX as well.
745 if (!Subtarget.useSoftFloat() && Subtarget.hasMMX()) {
746 addRegisterClass(MVT::x86mmx, &X86::VR64RegClass);
    // No operations on x86mmx supported, everything uses intrinsics.
  }
750 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE1()) {
751 addRegisterClass(MVT::v4f32, Subtarget.hasVLX() ? &X86::VR128XRegClass
752 : &X86::VR128RegClass);
754 setOperationAction(ISD::FNEG, MVT::v4f32, Custom);
755 setOperationAction(ISD::FABS, MVT::v4f32, Custom);
756 setOperationAction(ISD::FCOPYSIGN, MVT::v4f32, Custom);
757 setOperationAction(ISD::BUILD_VECTOR, MVT::v4f32, Custom);
758 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v4f32, Custom);
759 setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
760 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
761 setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
    setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
  }
765 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
766 addRegisterClass(MVT::v2f64, Subtarget.hasVLX() ? &X86::VR128XRegClass
767 : &X86::VR128RegClass);
769 // FIXME: Unfortunately, -soft-float and -no-implicit-float mean XMM
770 // registers cannot be used even for integer operations.
771 addRegisterClass(MVT::v16i8, Subtarget.hasVLX() ? &X86::VR128XRegClass
772 : &X86::VR128RegClass);
773 addRegisterClass(MVT::v8i16, Subtarget.hasVLX() ? &X86::VR128XRegClass
774 : &X86::VR128RegClass);
775 addRegisterClass(MVT::v4i32, Subtarget.hasVLX() ? &X86::VR128XRegClass
776 : &X86::VR128RegClass);
777 addRegisterClass(MVT::v2i64, Subtarget.hasVLX() ? &X86::VR128XRegClass
778 : &X86::VR128RegClass);
780 setOperationAction(ISD::MUL, MVT::v16i8, Custom);
781 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
782 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
783 setOperationAction(ISD::UMUL_LOHI, MVT::v4i32, Custom);
784 setOperationAction(ISD::SMUL_LOHI, MVT::v4i32, Custom);
785 setOperationAction(ISD::MULHU, MVT::v16i8, Custom);
786 setOperationAction(ISD::MULHS, MVT::v16i8, Custom);
787 setOperationAction(ISD::MULHU, MVT::v8i16, Legal);
788 setOperationAction(ISD::MULHS, MVT::v8i16, Legal);
789 setOperationAction(ISD::MUL, MVT::v8i16, Legal);
790 setOperationAction(ISD::FNEG, MVT::v2f64, Custom);
791 setOperationAction(ISD::FABS, MVT::v2f64, Custom);
792 setOperationAction(ISD::FCOPYSIGN, MVT::v2f64, Custom);
794 setOperationAction(ISD::SMAX, MVT::v8i16, Legal);
795 setOperationAction(ISD::UMAX, MVT::v16i8, Legal);
796 setOperationAction(ISD::SMIN, MVT::v8i16, Legal);
797 setOperationAction(ISD::UMIN, MVT::v16i8, Legal);
799 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v8i16, Custom);
800 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
801 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
803 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
804 setOperationAction(ISD::SETCC, VT, Custom);
805 setOperationAction(ISD::CTPOP, VT, Custom);
      setOperationAction(ISD::CTTZ, VT, Custom);
    }
809 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
810 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
811 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
812 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
813 setOperationAction(ISD::VSELECT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
817 // We support custom legalizing of sext and anyext loads for specific
818 // memory vector types which we can load as a scalar (or sequence of
819 // scalars) and extend in-register to a legal 128-bit vector type. For sext
820 // loads these must work with a single scalar load.
821 for (MVT VT : MVT::integer_vector_valuetypes()) {
822 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i8, Custom);
823 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v4i16, Custom);
824 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v8i8, Custom);
825 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i8, Custom);
826 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i16, Custom);
827 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2i32, Custom);
828 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i8, Custom);
829 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4i16, Custom);
      setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8i8, Custom);
    }
    for (auto VT : { MVT::v2f64, MVT::v2i64 }) {
      setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
      setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
      setOperationAction(ISD::VSELECT, VT, Custom);

      if (VT == MVT::v2i64 && !Subtarget.is64Bit())
        continue;

      setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
      setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
    }
845 // Promote v16i8, v8i16, v4i32 load, select, and, or, xor to v2i64.
846 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
847 setOperationPromotedToType(ISD::AND, VT, MVT::v2i64);
848 setOperationPromotedToType(ISD::OR, VT, MVT::v2i64);
849 setOperationPromotedToType(ISD::XOR, VT, MVT::v2i64);
850 setOperationPromotedToType(ISD::LOAD, VT, MVT::v2i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v2i64);
    }
854 // Custom lower v2i64 and v2f64 selects.
855 setOperationAction(ISD::SELECT, MVT::v2f64, Custom);
856 setOperationAction(ISD::SELECT, MVT::v2i64, Custom);
858 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
859 setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
861 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
862 setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
864 setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
865 setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
866 setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
868 // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
869 setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
871 setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
872 setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
874 for (MVT VT : MVT::fp_vector_valuetypes())
875 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v2f32, Legal);
877 setOperationAction(ISD::BITCAST, MVT::v2i32, Custom);
878 setOperationAction(ISD::BITCAST, MVT::v4i16, Custom);
879 setOperationAction(ISD::BITCAST, MVT::v8i8, Custom);
881 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Custom);
882 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Custom);
883 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Custom);
885 // In the customized shift lowering, the legal v4i32/v2i64 cases
886 // in AVX2 will be recognized.
887 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
888 setOperationAction(ISD::SRL, VT, Custom);
889 setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
  }
894 if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
895 setOperationAction(ISD::ABS, MVT::v16i8, Legal);
896 setOperationAction(ISD::ABS, MVT::v8i16, Legal);
897 setOperationAction(ISD::ABS, MVT::v4i32, Legal);
898 setOperationAction(ISD::BITREVERSE, MVT::v16i8, Custom);
899 setOperationAction(ISD::CTLZ, MVT::v16i8, Custom);
900 setOperationAction(ISD::CTLZ, MVT::v8i16, Custom);
901 setOperationAction(ISD::CTLZ, MVT::v4i32, Custom);
    setOperationAction(ISD::CTLZ, MVT::v2i64, Custom);
  }
905 if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
906 for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
907 setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
908 setOperationAction(ISD::FCEIL, RoundedTy, Legal);
909 setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
910 setOperationAction(ISD::FRINT, RoundedTy, Legal);
      setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
    }
914 setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
915 setOperationAction(ISD::SMAX, MVT::v4i32, Legal);
916 setOperationAction(ISD::UMAX, MVT::v8i16, Legal);
917 setOperationAction(ISD::UMAX, MVT::v4i32, Legal);
918 setOperationAction(ISD::SMIN, MVT::v16i8, Legal);
919 setOperationAction(ISD::SMIN, MVT::v4i32, Legal);
920 setOperationAction(ISD::UMIN, MVT::v8i16, Legal);
921 setOperationAction(ISD::UMIN, MVT::v4i32, Legal);
923 // FIXME: Do we need to handle scalar-to-vector here?
924 setOperationAction(ISD::MUL, MVT::v4i32, Legal);
    // We directly match byte blends in the backend as they match the VSELECT
    // condition form.
928 setOperationAction(ISD::VSELECT, MVT::v16i8, Legal);
930 // SSE41 brings specific instructions for doing vector sign extend even in
931 // cases where we don't have SRA.
932 for (auto VT : { MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
933 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Legal);
      setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Legal);
    }
937 for (MVT VT : MVT::integer_vector_valuetypes()) {
938 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
939 setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
      setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i32, Custom);
    }
943 // SSE41 also has vector sign/zero extending loads, PMOV[SZ]X
944 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
945 setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
946 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
947 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
948 setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
949 setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i32, Legal);
    }
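// Illustrative example of the extending loads marked Legal above (not part of
// the original source): a sign-extending load such as
//   %v = load <4 x i8>, <4 x i8>* %p
//   %e = sext <4 x i8> %v to <4 x i32>
// can select to a single 'pmovsxbd (%rdi), %xmm0' instead of a scalar load
// plus shuffles.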
    // i8 vectors are custom because the source register and source
    // memory operand types are not the same width.
    setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
  }
958 if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
959 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
960 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
961 setOperationAction(ISD::ROTL, VT, Custom);
963 // XOP can efficiently perform BITREVERSE with VPPERM.
964 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 })
965 setOperationAction(ISD::BITREVERSE, VT, Custom);
967 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
968 MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 })
      setOperationAction(ISD::BITREVERSE, VT, Custom);
  }
972 if (!Subtarget.useSoftFloat() && Subtarget.hasFp256()) {
973 bool HasInt256 = Subtarget.hasInt256();
975 addRegisterClass(MVT::v32i8, Subtarget.hasVLX() ? &X86::VR256XRegClass
976 : &X86::VR256RegClass);
977 addRegisterClass(MVT::v16i16, Subtarget.hasVLX() ? &X86::VR256XRegClass
978 : &X86::VR256RegClass);
979 addRegisterClass(MVT::v8i32, Subtarget.hasVLX() ? &X86::VR256XRegClass
980 : &X86::VR256RegClass);
981 addRegisterClass(MVT::v8f32, Subtarget.hasVLX() ? &X86::VR256XRegClass
982 : &X86::VR256RegClass);
983 addRegisterClass(MVT::v4i64, Subtarget.hasVLX() ? &X86::VR256XRegClass
984 : &X86::VR256RegClass);
985 addRegisterClass(MVT::v4f64, Subtarget.hasVLX() ? &X86::VR256XRegClass
986 : &X86::VR256RegClass);
988 for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
989 setOperationAction(ISD::FFLOOR, VT, Legal);
990 setOperationAction(ISD::FCEIL, VT, Legal);
991 setOperationAction(ISD::FTRUNC, VT, Legal);
992 setOperationAction(ISD::FRINT, VT, Legal);
993 setOperationAction(ISD::FNEARBYINT, VT, Legal);
994 setOperationAction(ISD::FNEG, VT, Custom);
995 setOperationAction(ISD::FABS, VT, Custom);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
999 // (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
1000 // even though v8i16 is a legal type.
1001 setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Promote);
1002 setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
1003 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1005 setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
1006 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1007 setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
1009 setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
1010 setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
1012 for (MVT VT : MVT::fp_vector_valuetypes())
1013 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
1015 // In the customized shift lowering, the legal v8i32/v4i64 cases
1016 // in AVX2 will be recognized.
1017 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1018 setOperationAction(ISD::SRL, VT, Custom);
1019 setOperationAction(ISD::SHL, VT, Custom);
      setOperationAction(ISD::SRA, VT, Custom);
    }
1023 setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
1024 setOperationAction(ISD::SELECT, MVT::v4i64, Custom);
1025 setOperationAction(ISD::SELECT, MVT::v8f32, Custom);
1027 for (auto VT : { MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1028 setOperationAction(ISD::SIGN_EXTEND, VT, Custom);
1029 setOperationAction(ISD::ZERO_EXTEND, VT, Custom);
      setOperationAction(ISD::ANY_EXTEND, VT, Custom);
    }
1033 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1034 setOperationAction(ISD::TRUNCATE, MVT::v8i16, Custom);
1035 setOperationAction(ISD::TRUNCATE, MVT::v4i32, Custom);
1036 setOperationAction(ISD::BITREVERSE, MVT::v32i8, Custom);
1038 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1039 setOperationAction(ISD::SETCC, VT, Custom);
1040 setOperationAction(ISD::CTPOP, VT, Custom);
1041 setOperationAction(ISD::CTTZ, VT, Custom);
      setOperationAction(ISD::CTLZ, VT, Custom);
    }
1045 if (Subtarget.hasAnyFMA()) {
1046 for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
1047 MVT::v2f64, MVT::v4f64 })
        setOperationAction(ISD::FMA, VT, Legal);
    }
1051 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
1052 setOperationAction(ISD::ADD, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::SUB, VT, HasInt256 ? Legal : Custom);
    }
1056 setOperationAction(ISD::MUL, MVT::v4i64, Custom);
1057 setOperationAction(ISD::MUL, MVT::v8i32, HasInt256 ? Legal : Custom);
1058 setOperationAction(ISD::MUL, MVT::v16i16, HasInt256 ? Legal : Custom);
1059 setOperationAction(ISD::MUL, MVT::v32i8, Custom);
1061 setOperationAction(ISD::UMUL_LOHI, MVT::v8i32, Custom);
1062 setOperationAction(ISD::SMUL_LOHI, MVT::v8i32, Custom);
1064 setOperationAction(ISD::MULHU, MVT::v16i16, HasInt256 ? Legal : Custom);
1065 setOperationAction(ISD::MULHS, MVT::v16i16, HasInt256 ? Legal : Custom);
1066 setOperationAction(ISD::MULHU, MVT::v32i8, Custom);
1067 setOperationAction(ISD::MULHS, MVT::v32i8, Custom);
1069 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1070 setOperationAction(ISD::ABS, VT, HasInt256 ? Legal : Custom);
1071 setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
1072 setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
1073 setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
      setOperationAction(ISD::UMIN, VT, HasInt256 ? Legal : Custom);
    }

    if (HasInt256) {
1078 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i64, Custom);
1079 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i32, Custom);
1080 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i16, Custom);
1082 // The custom lowering for UINT_TO_FP for v8i32 becomes interesting
1083 // when we have a 256bit-wide blend with immediate.
1084 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
1086 // AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
1087 for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
1088 setLoadExtAction(LoadExtOp, MVT::v16i16, MVT::v16i8, Legal);
1089 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i8, Legal);
1090 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i8, Legal);
1091 setLoadExtAction(LoadExtOp, MVT::v8i32, MVT::v8i16, Legal);
1092 setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i16, Legal);
        setLoadExtAction(LoadExtOp, MVT::v4i64, MVT::v4i32, Legal);
      }
    }
1097 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1098 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1099 setOperationAction(ISD::MLOAD, VT, Legal);
      setOperationAction(ISD::MSTORE, VT, Legal);
    }
1103 // Extract subvector is special because the value type
1104 // (result) is 128-bit but the source is 256-bit wide.
1105 for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
1106 MVT::v4f32, MVT::v2f64 }) {
      setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
    }
1110 // Custom lower several nodes for 256-bit types.
1111 for (MVT VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1112 MVT::v8f32, MVT::v4f64 }) {
1113 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1114 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1115 setOperationAction(ISD::VSELECT, VT, Custom);
1116 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1117 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1118 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1119 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
      setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
    }

    if (HasInt256)
      setOperationAction(ISD::VSELECT, MVT::v32i8, Legal);
1126 // Promote v32i8, v16i16, v8i32 select, and, or, xor to v4i64.
1127 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
1128 setOperationPromotedToType(ISD::AND, VT, MVT::v4i64);
1129 setOperationPromotedToType(ISD::OR, VT, MVT::v4i64);
1130 setOperationPromotedToType(ISD::XOR, VT, MVT::v4i64);
1131 setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
      setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
    }
  }
1136 if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
1137 addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
1138 addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
1139 addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
1140 addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
1142 addRegisterClass(MVT::v1i1, &X86::VK1RegClass);
1143 addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
1144 addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
1146 for (MVT VT : MVT::fp_vector_valuetypes())
1147 setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
1149 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1150 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
1151 setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
1152 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1153 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
1154 setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
      setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
    }
1158 for (MVT VT : {MVT::v2i64, MVT::v4i32, MVT::v8i32, MVT::v4i64, MVT::v8i16,
1159 MVT::v16i8, MVT::v16i16, MVT::v32i8, MVT::v16i32,
1160 MVT::v8i64, MVT::v32i16, MVT::v64i8}) {
1161 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
1162 setLoadExtAction(ISD::SEXTLOAD, VT, MaskVT, Custom);
1163 setLoadExtAction(ISD::ZEXTLOAD, VT, MaskVT, Custom);
1164 setLoadExtAction(ISD::EXTLOAD, VT, MaskVT, Custom);
      setTruncStoreAction(VT, MaskVT, Custom);
    }
1168 for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
1169 setOperationAction(ISD::FNEG, VT, Custom);
1170 setOperationAction(ISD::FABS, VT, Custom);
1171 setOperationAction(ISD::FMA, VT, Legal);
      setOperationAction(ISD::FCOPYSIGN, VT, Custom);
    }
1175 setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
1176 setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
1177 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1178 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1179 setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
1180 setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
1181 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1182 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1183 setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
1184 setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
1185 setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
1186 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1187 setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
1188 setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
1189 setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
1190 setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
1191 setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
1192 setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
1193 setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
1194 setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
1195 setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
1196 setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
1197 setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
1198 setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
1199 setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
1201 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
1202 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
1203 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
1204 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
1205 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
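// Illustrative example (not in the original source): with the truncating
// stores above Legal, IR like
//   %t = trunc <8 x i64> %v to <8 x i32>
//   store <8 x i32> %t, <8 x i32>* %p
// can select to a single 'vpmovqd %zmm0, (%rdi)' under AVX-512.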
1206 if (Subtarget.hasVLX()){
1207 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
1208 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
1209 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
1210 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
1211 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
1213 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
1214 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
1215 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
1216 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
      setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
    } else {
      for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
                      MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
        setOperationAction(ISD::MLOAD, VT, Custom);
        setOperationAction(ISD::MSTORE, VT, Custom);
      }
    }
1225 setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
1226 setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
1228 if (Subtarget.hasDQI()) {
1229 for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
1230 setOperationAction(ISD::SINT_TO_FP, VT, Legal);
1231 setOperationAction(ISD::UINT_TO_FP, VT, Legal);
1232 setOperationAction(ISD::FP_TO_SINT, VT, Legal);
        setOperationAction(ISD::FP_TO_UINT, VT, Legal);
      }
      if (Subtarget.hasVLX()) {
        // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
        setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
        setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
      }
    }
1242 if (Subtarget.hasVLX()) {
1243 setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
1244 setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
1245 setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
1246 setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
1247 setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
1248 setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
1249 setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
1250 setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
1251 setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
1252 setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
1253 setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
      // FIXME: These instructions are available on SSE/AVX2; add relevant patterns.
1256 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
1257 setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
1258 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
1259 setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
1260 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
1261 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
1262 setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
1263 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
1264 setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
      setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
    }
1268 setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
1269 setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
1270 setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
1271 setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
1272 setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
1273 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
1274 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
1275 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
1276 setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
1277 setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
1279 for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
1280 setOperationAction(ISD::FFLOOR, VT, Legal);
1281 setOperationAction(ISD::FCEIL, VT, Legal);
1282 setOperationAction(ISD::FTRUNC, VT, Legal);
1283 setOperationAction(ISD::FRINT, VT, Legal);
      setOperationAction(ISD::FNEARBYINT, VT, Legal);
    }
1287 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i64, Custom);
1288 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v16i32, Custom);
1290 // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
1291 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1292 setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v64i8, Custom);
1294 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
1295 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
1296 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
1297 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
1298 setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
1300 setOperationAction(ISD::MUL, MVT::v8i64, Custom);
1302 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
1303 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
1304 setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
1305 setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
1306 setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
1307 setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
1309 setOperationAction(ISD::MUL, MVT::v16i32, Legal);
1311 // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
1312 setOperationAction(ISD::ABS, MVT::v4i64, Legal);
1313 setOperationAction(ISD::ABS, MVT::v2i64, Legal);
1315 for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
1316 setOperationAction(ISD::ADD, VT, Custom);
1317 setOperationAction(ISD::SUB, VT, Custom);
1318 setOperationAction(ISD::MUL, VT, Custom);
1319 setOperationAction(ISD::SETCC, VT, Custom);
1320 setOperationAction(ISD::SELECT, VT, Custom);
1321 setOperationAction(ISD::TRUNCATE, VT, Custom);
1323 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1324 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1325 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1326 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1327 setOperationAction(ISD::VSELECT, VT, Expand);
1330 for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
1331 setOperationAction(ISD::SMAX, VT, Legal);
1332 setOperationAction(ISD::UMAX, VT, Legal);
1333 setOperationAction(ISD::SMIN, VT, Legal);
1334 setOperationAction(ISD::UMIN, VT, Legal);
1335 setOperationAction(ISD::ABS, VT, Legal);
1336 setOperationAction(ISD::SRL, VT, Custom);
1337 setOperationAction(ISD::SHL, VT, Custom);
1338 setOperationAction(ISD::SRA, VT, Custom);
1339 setOperationAction(ISD::CTPOP, VT, Custom);
1340 setOperationAction(ISD::CTTZ, VT, Custom);
1343 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1344 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
1346 setOperationAction(ISD::ROTL, VT, Custom);
1347 setOperationAction(ISD::ROTR, VT, Custom);
1350 // Need to promote to 64-bit even though we have 32-bit masked instructions
1351 // because the IR optimizers rearrange bitcasts around logic ops leaving
1352 // too many variations to handle if we don't promote them.
1353 setOperationPromotedToType(ISD::AND, MVT::v16i32, MVT::v8i64);
1354 setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
1355 setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
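// With this promotion a v16i32 logic op is, e.g., bitcast to v8i64, selected
// as the 512-bit q-word form (VPANDQ/VPORQ/VPXORQ), and the result bitcast
// back to v16i32.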
1357 if (Subtarget.hasCDI()) {
1358 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1359 for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
1360 MVT::v4i64, MVT::v8i64}) {
1361 setOperationAction(ISD::CTLZ, VT, Legal);
1362 setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
1364 } // Subtarget.hasCDI()
1366 if (Subtarget.hasDQI()) {
1367 // Non-VLX subtargets extend 128/256-bit vectors to use the 512-bit versions.
1368 setOperationAction(ISD::MUL, MVT::v2i64, Legal);
1369 setOperationAction(ISD::MUL, MVT::v4i64, Legal);
1370 setOperationAction(ISD::MUL, MVT::v8i64, Legal);
1373 if (Subtarget.hasVPOPCNTDQ()) {
1374 // VPOPCNTDQ subtargets extend 128/256-bit vectors to use the AVX-512
1375 // version of popcntd/q.
1376 for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
1377 MVT::v4i32, MVT::v2i64})
1378 setOperationAction(ISD::CTPOP, VT, Legal);
1381 // Custom lower several nodes.
1382 for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
1383 MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
1384 setOperationAction(ISD::MGATHER, VT, Custom);
1385 setOperationAction(ISD::MSCATTER, VT, Custom);
1387 // Extract subvector is special because the value type
1388 // (result) is 256-bit but the source is 512-bit wide.
1389 // 128-bit was made Custom under AVX1.
1390 for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
1391 MVT::v8f32, MVT::v4f64, MVT::v1i1 })
1392 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
1393 for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
1394 MVT::v16i1, MVT::v32i1, MVT::v64i1 })
1395 setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
1397 for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
1398 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1399 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1400 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1401 setOperationAction(ISD::VSELECT, VT, Custom);
1402 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1403 setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
1404 setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
1405 setOperationAction(ISD::MLOAD, VT, Legal);
1406 setOperationAction(ISD::MSTORE, VT, Legal);
1407 setOperationAction(ISD::MGATHER, VT, Legal);
1408 setOperationAction(ISD::MSCATTER, VT, Custom);
1410 for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
1411 setOperationPromotedToType(ISD::LOAD, VT, MVT::v8i64);
1412 setOperationPromotedToType(ISD::SELECT, VT, MVT::v8i64);
1416 if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
1417 addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
1418 addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
1420 addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
1421 addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
1423 setOperationAction(ISD::ADD, MVT::v32i1, Custom);
1424 setOperationAction(ISD::ADD, MVT::v64i1, Custom);
1425 setOperationAction(ISD::SUB, MVT::v32i1, Custom);
1426 setOperationAction(ISD::SUB, MVT::v64i1, Custom);
1427 setOperationAction(ISD::MUL, MVT::v32i1, Custom);
1428 setOperationAction(ISD::MUL, MVT::v64i1, Custom);
1430 setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
1431 setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
1432 setOperationAction(ISD::MUL, MVT::v32i16, Legal);
1433 setOperationAction(ISD::MUL, MVT::v64i8, Custom);
1434 setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
1435 setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
1436 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
1437 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
1438 setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
1439 setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
1440 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
1441 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
1442 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
1443 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
1444 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
1445 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
1446 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
1447 setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
1448 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
1449 setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
1450 setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
1451 setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
1452 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
1453 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
1454 setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
1455 setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
1456 setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
1457 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
1458 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
1459 setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
1460 setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
1461 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
1462 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
1463 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
1464 setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
1465 setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
1466 setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
1467 setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
1468 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
1469 setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
1470 setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
1471 setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
1472 setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
1473 setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
1474 setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
1476 setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
1478 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
1479 if (Subtarget.hasVLX()) {
1480 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
1481 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
1484 LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
1485 for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
1486 setOperationAction(ISD::MLOAD, VT, Action);
1487 setOperationAction(ISD::MSTORE, VT, Action);
1490 if (Subtarget.hasCDI()) {
1491 setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
1492 setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
1495 for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
1496 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1497 setOperationAction(ISD::VSELECT, VT, Custom);
1498 setOperationAction(ISD::ABS, VT, Legal);
1499 setOperationAction(ISD::SRL, VT, Custom);
1500 setOperationAction(ISD::SHL, VT, Custom);
1501 setOperationAction(ISD::SRA, VT, Custom);
1502 setOperationAction(ISD::MLOAD, VT, Legal);
1503 setOperationAction(ISD::MSTORE, VT, Legal);
1504 setOperationAction(ISD::CTPOP, VT, Custom);
1505 setOperationAction(ISD::CTTZ, VT, Custom);
1506 setOperationAction(ISD::SMAX, VT, Legal);
1507 setOperationAction(ISD::UMAX, VT, Legal);
1508 setOperationAction(ISD::SMIN, VT, Legal);
1509 setOperationAction(ISD::UMIN, VT, Legal);
1511 setOperationPromotedToType(ISD::AND, VT, MVT::v8i64);
1512 setOperationPromotedToType(ISD::OR, VT, MVT::v8i64);
1513 setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
1516 for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
1517 setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
1518 if (Subtarget.hasVLX()) {
1519 // FIXME. These instructions are available on SSE/AVX2; add the relevant patterns.
1520 setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
1521 setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
1526 if (!Subtarget.useSoftFloat() && Subtarget.hasVLX()) {
1527 addRegisterClass(MVT::v4i1, &X86::VK4RegClass);
1528 addRegisterClass(MVT::v2i1, &X86::VK2RegClass);
1530 for (auto VT : { MVT::v2i1, MVT::v4i1 }) {
1531 setOperationAction(ISD::ADD, VT, Custom);
1532 setOperationAction(ISD::SUB, VT, Custom);
1533 setOperationAction(ISD::MUL, VT, Custom);
1534 setOperationAction(ISD::VSELECT, VT, Expand);
1536 setOperationAction(ISD::TRUNCATE, VT, Custom);
1537 setOperationAction(ISD::SETCC, VT, Custom);
1538 setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
1539 setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
1540 setOperationAction(ISD::SELECT, VT, Custom);
1541 setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
1542 setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
1545 setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
1546 setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
1547 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
1548 setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
1550 for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
1551 setOperationAction(ISD::SMAX, VT, Legal);
1552 setOperationAction(ISD::UMAX, VT, Legal);
1553 setOperationAction(ISD::SMIN, VT, Legal);
1554 setOperationAction(ISD::UMIN, VT, Legal);
1558 // We want to custom lower some of our intrinsics.
1559 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
1560 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
1561 setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
1562 if (!Subtarget.is64Bit()) {
1563 setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
1564 setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
1567 // Only custom-lower 64-bit SADDO and friends on 64-bit because we don't
1568 // handle type legalization for these operations here.
1570 // FIXME: We really should do custom legalization for addition and
1571 // subtraction on x86-32 once PR3203 is fixed. We really can't do much better
1572 // than generic legalization for 64-bit multiplication-with-overflow, though.
1573 for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
1574 if (VT == MVT::i64 && !Subtarget.is64Bit())
1576 // Add/Sub/Mul with overflow operations are custom lowered.
1577 setOperationAction(ISD::SADDO, VT, Custom);
1578 setOperationAction(ISD::UADDO, VT, Custom);
1579 setOperationAction(ISD::SSUBO, VT, Custom);
1580 setOperationAction(ISD::USUBO, VT, Custom);
1581 setOperationAction(ISD::SMULO, VT, Custom);
1582 setOperationAction(ISD::UMULO, VT, Custom);
1584 // Support carry-in as a value rather than as glue.
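// On x86 these map naturally onto ADC/SBB, which consume and produce the
// carry flag; modelling the carry as an explicit value avoids tying the
// nodes together through glue.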
1585 setOperationAction(ISD::ADDCARRY, VT, Custom);
1586 setOperationAction(ISD::SUBCARRY, VT, Custom);
1587 setOperationAction(ISD::SETCCCARRY, VT, Custom);
1590 if (!Subtarget.is64Bit()) {
1591 // These libcalls are not available in 32-bit mode.
1592 setLibcallName(RTLIB::SHL_I128, nullptr);
1593 setLibcallName(RTLIB::SRL_I128, nullptr);
1594 setLibcallName(RTLIB::SRA_I128, nullptr);
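// With the names cleared, the legalizer expands 128-bit shifts inline
// instead of emitting calls to __ashlti3 and friends, which the 32-bit
// runtimes do not provide.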
1597 // Combine sin / cos into one node or libcall if possible.
1598 if (Subtarget.hasSinCos()) {
1599 setLibcallName(RTLIB::SINCOS_F32, "sincosf");
1600 setLibcallName(RTLIB::SINCOS_F64, "sincos");
1601 if (Subtarget.isTargetDarwin()) {
1602 // For MacOSX, we don't want the normal expansion of a libcall to sincos.
1603 // We want to issue a libcall to __sincos_stret to avoid memory traffic.
1604 setOperationAction(ISD::FSINCOS, MVT::f64, Custom);
1605 setOperationAction(ISD::FSINCOS, MVT::f32, Custom);
1609 if (Subtarget.isTargetWin64()) {
1610 setOperationAction(ISD::SDIV, MVT::i128, Custom);
1611 setOperationAction(ISD::UDIV, MVT::i128, Custom);
1612 setOperationAction(ISD::SREM, MVT::i128, Custom);
1613 setOperationAction(ISD::UREM, MVT::i128, Custom);
1614 setOperationAction(ISD::SDIVREM, MVT::i128, Custom);
1615 setOperationAction(ISD::UDIVREM, MVT::i128, Custom);
1618 // On 32 bit MSVC, `fmodf(f32)` is not defined - only `fmod(f64)`
1619 // is. We should promote the value to 64-bits to solve this.
1620 // This is what the CRT headers do - `fmodf` is an inline header
1621 // function casting to f64 and calling `fmod`.
1622 if (Subtarget.is32Bit() && (Subtarget.isTargetKnownWindowsMSVC() ||
1623 Subtarget.isTargetWindowsItanium()))
1624 for (ISD::NodeType Op :
1625 {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
1626 ISD::FLOG10, ISD::FPOW, ISD::FSIN})
1627 if (isOperationExpand(Op, MVT::f32))
1628 setOperationAction(Op, MVT::f32, Promote);
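// For example, with this promotion a 32-bit MSVC target lowers a call to
// fmodf(x, y) roughly as (float)fmod((double)x, (double)y), mirroring the
// inline definition in the CRT headers.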
1630 // We have target-specific DAG combine patterns for the following nodes:
1631 setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
1632 setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
1633 setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
1634 setTargetDAGCombine(ISD::BITCAST);
1635 setTargetDAGCombine(ISD::VSELECT);
1636 setTargetDAGCombine(ISD::SELECT);
1637 setTargetDAGCombine(ISD::SHL);
1638 setTargetDAGCombine(ISD::SRA);
1639 setTargetDAGCombine(ISD::SRL);
1640 setTargetDAGCombine(ISD::OR);
1641 setTargetDAGCombine(ISD::AND);
1642 setTargetDAGCombine(ISD::ADD);
1643 setTargetDAGCombine(ISD::FADD);
1644 setTargetDAGCombine(ISD::FSUB);
1645 setTargetDAGCombine(ISD::FNEG);
1646 setTargetDAGCombine(ISD::FMA);
1647 setTargetDAGCombine(ISD::FMINNUM);
1648 setTargetDAGCombine(ISD::FMAXNUM);
1649 setTargetDAGCombine(ISD::SUB);
1650 setTargetDAGCombine(ISD::LOAD);
1651 setTargetDAGCombine(ISD::MLOAD);
1652 setTargetDAGCombine(ISD::STORE);
1653 setTargetDAGCombine(ISD::MSTORE);
1654 setTargetDAGCombine(ISD::TRUNCATE);
1655 setTargetDAGCombine(ISD::ZERO_EXTEND);
1656 setTargetDAGCombine(ISD::ANY_EXTEND);
1657 setTargetDAGCombine(ISD::SIGN_EXTEND);
1658 setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
1659 setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
1660 setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
1661 setTargetDAGCombine(ISD::SINT_TO_FP);
1662 setTargetDAGCombine(ISD::UINT_TO_FP);
1663 setTargetDAGCombine(ISD::SETCC);
1664 setTargetDAGCombine(ISD::MUL);
1665 setTargetDAGCombine(ISD::XOR);
1666 setTargetDAGCombine(ISD::MSCATTER);
1667 setTargetDAGCombine(ISD::MGATHER);
1669 computeRegisterProperties(Subtarget.getRegisterInfo());
1671 MaxStoresPerMemset = 16; // For @llvm.memset -> sequence of stores
1672 MaxStoresPerMemsetOptSize = 8;
1673 MaxStoresPerMemcpy = 8; // For @llvm.memcpy -> sequence of stores
1674 MaxStoresPerMemcpyOptSize = 4;
1675 MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
1676 MaxStoresPerMemmoveOptSize = 4;
1678 // TODO: These control memcmp expansion in CGP and could be raised higher, but
1679 // that needs to be benchmarked and balanced with the potential use of vector
1680 // load/store types (PR33329, PR33914).
1681 MaxLoadsPerMemcmp = 2;
1682 MaxLoadsPerMemcmpOptSize = 2;
1684 // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
1685 setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
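// With the default value of 4 this aligns loop headers to 16-byte boundaries.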
1687 // An out-of-order CPU can speculatively execute past a predictable branch,
1688 // but a conditional move could be stalled by an expensive earlier operation.
1689 PredictableSelectIsExpensive = Subtarget.getSchedModel().isOutOfOrder();
1690 EnableExtLdPromotion = true;
1691 setPrefFunctionAlignment(4); // 2^4 bytes.
1693 verifyIntrinsicTables();
1696 // This has so far only been implemented for 64-bit MachO.
1697 bool X86TargetLowering::useLoadStackGuardNode() const {
1698 return Subtarget.isTargetMachO() && Subtarget.is64Bit();
1701 TargetLoweringBase::LegalizeTypeAction
1702 X86TargetLowering::getPreferredVectorAction(EVT VT) const {
1703 if (ExperimentalVectorWideningLegalization &&
1704 VT.getVectorNumElements() != 1 &&
1705 VT.getVectorElementType().getSimpleVT() != MVT::i1)
1706 return TypeWidenVector;
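// Widening legalizes e.g. v2i32 to v4i32 (more elements of the same width),
// whereas the default integer promotion would legalize it to v2i64 (wider
// elements).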
1708 return TargetLoweringBase::getPreferredVectorAction(VT);
1711 EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
1712 LLVMContext& Context,
1717 if (VT.isSimple()) {
1718 MVT VVT = VT.getSimpleVT();
1719 const unsigned NumElts = VVT.getVectorNumElements();
1720 MVT EltVT = VVT.getVectorElementType();
1721 if (VVT.is512BitVector()) {
1722 if (Subtarget.hasAVX512())
1723 if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
1724 EltVT == MVT::f32 || EltVT == MVT::f64)
1726 case 8: return MVT::v8i1;
1727 case 16: return MVT::v16i1;
1729 if (Subtarget.hasBWI())
1730 if (EltVT == MVT::i8 || EltVT == MVT::i16)
1732 case 32: return MVT::v32i1;
1733 case 64: return MVT::v64i1;
1737 if (Subtarget.hasBWI() && Subtarget.hasVLX())
1738 return MVT::getVectorVT(MVT::i1, NumElts);
1740 if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
1741 EVT LegalVT = getTypeToTransformTo(Context, VT);
1742 EltVT = LegalVT.getVectorElementType().getSimpleVT();
1745 if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
1747 case 2: return MVT::v2i1;
1748 case 4: return MVT::v4i1;
1749 case 8: return MVT::v8i1;
1753 return VT.changeVectorElementTypeToInteger();
1756 /// Helper for getByValTypeAlignment to determine
1757 /// the desired ByVal argument alignment.
1758 static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
1761 if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
1762 if (VTy->getBitWidth() == 128)
1764 } else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
1765 unsigned EltAlign = 0;
1766 getMaxByValAlign(ATy->getElementType(), EltAlign);
1767 if (EltAlign > MaxAlign)
1768 MaxAlign = EltAlign;
1769 } else if (StructType *STy = dyn_cast<StructType>(Ty)) {
1770 for (auto *EltTy : STy->elements()) {
1771 unsigned EltAlign = 0;
1772 getMaxByValAlign(EltTy, EltAlign);
1773 if (EltAlign > MaxAlign)
1774 MaxAlign = EltAlign;
1781 /// Return the desired alignment for ByVal aggregate
1782 /// function arguments in the caller parameter area. For X86, aggregates
1783 /// that contain SSE vectors are placed at 16-byte boundaries while the rest
1784 /// are at 4-byte boundaries.
1785 unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
1786 const DataLayout &DL) const {
1787 if (Subtarget.is64Bit()) {
1788 // Max of 8 and alignment of type.
1789 unsigned TyAlign = DL.getABITypeAlignment(Ty);
1796 if (Subtarget.hasSSE1())
1797 getMaxByValAlign(Ty, Align);
1801 /// Returns the target specific optimal type for load
1802 /// and store operations as a result of memset, memcpy, and memmove
1803 /// lowering. If DstAlign is zero, the destination alignment can satisfy any
1804 /// constraint. Similarly, if SrcAlign is zero there is no need to check it
1805 /// against an alignment requirement, probably because the source does not
1806 /// need to be loaded. If 'IsMemset' is
1807 /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
1808 /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
1809 /// source is constant so it does not need to be loaded.
1810 /// It returns EVT::Other if the type should be determined using generic
1811 /// target-independent logic.
1813 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
1814 unsigned DstAlign, unsigned SrcAlign,
1815 bool IsMemset, bool ZeroMemset,
1817 MachineFunction &MF) const {
1818 const Function *F = MF.getFunction();
1819 if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
1821 (!Subtarget.isUnalignedMem16Slow() ||
1822 ((DstAlign == 0 || DstAlign >= 16) &&
1823 (SrcAlign == 0 || SrcAlign >= 16)))) {
1824 // FIXME: Check if unaligned 32-byte accesses are slow.
1825 if (Size >= 32 && Subtarget.hasAVX()) {
1826 // Although this isn't a well-supported type for AVX1, we'll let
1827 // legalization and shuffle lowering produce the optimal codegen. If we
1828 // choose an optimal type with a vector element larger than a byte,
1829 // getMemsetStores() may create an intermediate splat (using an integer
1830 // multiply) before we splat as a vector.
1833 if (Subtarget.hasSSE2())
1835 // TODO: Can SSE1 handle a byte vector?
1836 if (Subtarget.hasSSE1())
1838 } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
1839 !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
1840 // Do not use f64 to lower memcpy if source is string constant. It's
1841 // better to use i32 to avoid the loads.
1842 // Also, do not use f64 to lower memset unless this is a memset of zeros.
1843 // The gymnastics of splatting a byte value into an XMM register and then
1844 // only using 8-byte stores (because this is a CPU with slow unaligned
1845 // 16-byte accesses) makes that a loser.
1849 // This is a compromise. If we reach here, unaligned accesses may be slow on
1850 // this target. However, creating smaller, aligned accesses could be even
1851 // slower and would certainly be a lot more code.
1852 if (Subtarget.is64Bit() && Size >= 8)
1857 bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
1859 return X86ScalarSSEf32;
1860 else if (VT == MVT::f64)
1861 return X86ScalarSSEf64;
1866 X86TargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
1871 switch (VT.getSizeInBits()) {
1873 // 8-byte and under are always assumed to be fast.
1877 *Fast = !Subtarget.isUnalignedMem16Slow();
1880 *Fast = !Subtarget.isUnalignedMem32Slow();
1882 // TODO: What about AVX-512 (512-bit) accesses?
1885 // Misaligned accesses of any size are always allowed.
1889 /// Return the entry encoding for a jump table in the
1890 /// current function. The returned value is a member of the
1891 /// MachineJumpTableInfo::JTEntryKind enum.
1892 unsigned X86TargetLowering::getJumpTableEncoding() const {
1893 // In GOT pic mode, each entry in the jump table is emitted as a @GOTOFF
1895 if (isPositionIndependent() && Subtarget.isPICStyleGOT())
1896 return MachineJumpTableInfo::EK_Custom32;
1898 // Otherwise, use the normal jump table encoding heuristics.
1899 return TargetLowering::getJumpTableEncoding();
1902 bool X86TargetLowering::useSoftFloat() const {
1903 return Subtarget.useSoftFloat();
1906 void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
1907 ArgListTy &Args) const {
1909 // Only relabel X86-32 for C / Stdcall CCs.
1910 if (Subtarget.is64Bit())
1912 if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
1914 unsigned ParamRegs = 0;
1915 if (auto *M = MF->getFunction()->getParent())
1916 ParamRegs = M->getNumberRegisterParameters();
1918 // Mark the first N integer arguments as being passed in registers.
1919 for (unsigned Idx = 0; Idx < Args.size(); Idx++) {
1920 Type *T = Args[Idx].Ty;
1921 if (T->isPointerTy() || T->isIntegerTy())
1922 if (MF->getDataLayout().getTypeAllocSize(T) <= 8) {
1923 unsigned numRegs = 1;
1924 if (MF->getDataLayout().getTypeAllocSize(T) > 4)
1926 if (ParamRegs < numRegs)
1928 ParamRegs -= numRegs;
1929 Args[Idx].IsInReg = true;
1935 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
1936 const MachineBasicBlock *MBB,
1937 unsigned uid,MCContext &Ctx) const{
1938 assert(isPositionIndependent() && Subtarget.isPICStyleGOT());
1939 // In 32-bit ELF systems, our jump table entries are formed with @GOTOFF
1941 return MCSymbolRefExpr::create(MBB->getSymbol(),
1942 MCSymbolRefExpr::VK_GOTOFF, Ctx);
1945 /// Returns relocation base for the given PIC jumptable.
1946 SDValue X86TargetLowering::getPICJumpTableRelocBase(SDValue Table,
1947 SelectionDAG &DAG) const {
1948 if (!Subtarget.is64Bit())
1949 // This doesn't have SDLoc associated with it, but is not really the
1950 // same as a Register.
1951 return DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
1952 getPointerTy(DAG.getDataLayout()));
1956 /// This returns the relocation base for the given PIC jumptable,
1957 /// the same as getPICJumpTableRelocBase, but as an MCExpr.
1958 const MCExpr *X86TargetLowering::
1959 getPICJumpTableRelocBaseExpr(const MachineFunction *MF, unsigned JTI,
1960 MCContext &Ctx) const {
1961 // X86-64 uses RIP relative addressing based on the jump table label.
1962 if (Subtarget.isPICStyleRIPRel())
1963 return TargetLowering::getPICJumpTableRelocBaseExpr(MF, JTI, Ctx);
1965 // Otherwise, the reference is relative to the PIC base.
1966 return MCSymbolRefExpr::create(MF->getPICBaseSymbol(), Ctx);
1969 std::pair<const TargetRegisterClass *, uint8_t>
1970 X86TargetLowering::findRepresentativeClass(const TargetRegisterInfo *TRI,
1972 const TargetRegisterClass *RRC = nullptr;
1974 switch (VT.SimpleTy) {
1976 return TargetLowering::findRepresentativeClass(TRI, VT);
1977 case MVT::i8: case MVT::i16: case MVT::i32: case MVT::i64:
1978 RRC = Subtarget.is64Bit() ? &X86::GR64RegClass : &X86::GR32RegClass;
1981 RRC = &X86::VR64RegClass;
1983 case MVT::f32: case MVT::f64:
1984 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1985 case MVT::v4f32: case MVT::v2f64:
1986 case MVT::v32i8: case MVT::v16i16: case MVT::v8i32: case MVT::v4i64:
1987 case MVT::v8f32: case MVT::v4f64:
1988 case MVT::v64i8: case MVT::v32i16: case MVT::v16i32: case MVT::v8i64:
1989 case MVT::v16f32: case MVT::v8f64:
1990 RRC = &X86::VR128XRegClass;
1993 return std::make_pair(RRC, Cost);
1996 unsigned X86TargetLowering::getAddressSpace() const {
1997 if (Subtarget.is64Bit())
1998 return (getTargetMachine().getCodeModel() == CodeModel::Kernel) ? 256 : 257;
2002 static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
2003 return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
2004 (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
2007 static Constant* SegmentOffset(IRBuilder<> &IRB,
2008 unsigned Offset, unsigned AddressSpace) {
2009 return ConstantExpr::getIntToPtr(
2010 ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
2011 Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
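// On x86 the address spaces 256, 257 and 258 refer to the GS, FS and SS
// segments respectively, so e.g. SegmentOffset(IRB, 0x28, 257) yields a
// pointer to %fs:0x28.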
2014 Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
2015 // glibc, bionic, and Fuchsia have a special slot for the stack guard in
2016 // tcbhead_t; use it instead of the usual global variable (see
2017 // sysdeps/{i386,x86_64}/nptl/tls.h)
2018 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
2019 if (Subtarget.isTargetFuchsia()) {
2020 // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
2021 return SegmentOffset(IRB, 0x10, getAddressSpace());
2023 // %fs:0x28, unless we're using a Kernel code model, in which case
2024 // it's %gs:0x28. gs:0x14 on i386.
2025 unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
2026 return SegmentOffset(IRB, Offset, getAddressSpace());
2030 return TargetLowering::getIRStackGuard(IRB);
2033 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
2034 // The MSVC CRT provides functionality for stack protection.
2035 if (Subtarget.getTargetTriple().isOSMSVCRT()) {
2036 // MSVC CRT has a global variable holding security cookie.
2037 M.getOrInsertGlobal("__security_cookie",
2038 Type::getInt8PtrTy(M.getContext()));
2040 // MSVC CRT has a function to validate security cookie.
2041 auto *SecurityCheckCookie = cast<Function>(
2042 M.getOrInsertFunction("__security_check_cookie",
2043 Type::getVoidTy(M.getContext()),
2044 Type::getInt8PtrTy(M.getContext())));
2045 SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
2046 SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
2049 // glibc, bionic, and Fuchsia have a special slot for the stack guard.
2050 if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
2052 TargetLowering::insertSSPDeclarations(M);
2055 Value *X86TargetLowering::getSDagStackGuard(const Module &M) const {
2056 // MSVC CRT has a global variable holding security cookie.
2057 if (Subtarget.getTargetTriple().isOSMSVCRT())
2058 return M.getGlobalVariable("__security_cookie");
2059 return TargetLowering::getSDagStackGuard(M);
2062 Value *X86TargetLowering::getSSPStackGuardCheck(const Module &M) const {
2063 // MSVC CRT has a function to validate security cookie.
2064 if (Subtarget.getTargetTriple().isOSMSVCRT())
2065 return M.getFunction("__security_check_cookie");
2066 return TargetLowering::getSSPStackGuardCheck(M);
2069 Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
2070 if (Subtarget.getTargetTriple().isOSContiki())
2071 return getDefaultSafeStackPointerLocation(IRB, false);
2073 // Android provides a fixed TLS slot for the SafeStack pointer. See the
2074 // definition of TLS_SLOT_SAFESTACK in
2075 // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
2076 if (Subtarget.isTargetAndroid()) {
2077 // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
2079 unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
2080 return SegmentOffset(IRB, Offset, getAddressSpace());
2083 // Fuchsia is similar.
2084 if (Subtarget.isTargetFuchsia()) {
2085 // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
2086 return SegmentOffset(IRB, 0x18, getAddressSpace());
2089 return TargetLowering::getSafeStackPointerLocation(IRB);
2092 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
2093 unsigned DestAS) const {
2094 assert(SrcAS != DestAS && "Expected different address spaces!");
2096 return SrcAS < 256 && DestAS < 256;
2099 //===----------------------------------------------------------------------===//
2100 // Return Value Calling Convention Implementation
2101 //===----------------------------------------------------------------------===//
2103 #include "X86GenCallingConv.inc"
2105 bool X86TargetLowering::CanLowerReturn(
2106 CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
2107 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
2108 SmallVector<CCValAssign, 16> RVLocs;
2109 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2110 return CCInfo.CheckReturn(Outs, RetCC_X86);
2113 const MCPhysReg *X86TargetLowering::getScratchRegisters(CallingConv::ID) const {
2114 static const MCPhysReg ScratchRegs[] = { X86::R11, 0 };
2118 /// Lowers mask values (v*i1) to the corresponding local register values.
2119 /// \returns the DAG node after lowering to the register type.
2120 static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
2121 const SDLoc &Dl, SelectionDAG &DAG) {
2122 EVT ValVT = ValArg.getValueType();
2124 if ((ValVT == MVT::v8i1 && (ValLoc == MVT::i8 || ValLoc == MVT::i32)) ||
2125 (ValVT == MVT::v16i1 && (ValLoc == MVT::i16 || ValLoc == MVT::i32))) {
2126 // Two stage lowering might be required
2127 // bitcast: v8i1 -> i8 / v16i1 -> i16
2128 // anyextend: i8 -> i32 / i16 -> i32
2129 EVT TempValLoc = ValVT == MVT::v8i1 ? MVT::i8 : MVT::i16;
2130 SDValue ValToCopy = DAG.getBitcast(TempValLoc, ValArg);
2131 if (ValLoc == MVT::i32)
2132 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, Dl, ValLoc, ValToCopy);
2134 } else if ((ValVT == MVT::v32i1 && ValLoc == MVT::i32) ||
2135 (ValVT == MVT::v64i1 && ValLoc == MVT::i64)) {
2136 // One stage lowering is required
2137 // bitcast: v32i1 -> i32 / v64i1 -> i64
2138 return DAG.getBitcast(ValLoc, ValArg);
2140 return DAG.getNode(ISD::SIGN_EXTEND, Dl, ValLoc, ValArg);
2143 /// Breaks a v64i1 value into two registers and adds the new node to the DAG.
2144 static void Passv64i1ArgInRegs(
2145 const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
2146 SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
2147 CCValAssign &NextVA, const X86Subtarget &Subtarget) {
2148 assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
2149 "Expected AVX512BW or AVX512BMI target!");
2150 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2151 assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
2152 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2153 "The value should reside in two registers");
2155 // Before splitting the value we cast it to i64
2156 Arg = DAG.getBitcast(MVT::i64, Arg);
2158 // Split the value into two i32 halves.
2160 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2161 DAG.getConstant(0, Dl, MVT::i32));
2162 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i32, Arg,
2163 DAG.getConstant(1, Dl, MVT::i32));
2165 // Attach the two i32 halves to their corresponding registers.
2166 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Lo));
2167 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), Hi));
2171 X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2173 const SmallVectorImpl<ISD::OutputArg> &Outs,
2174 const SmallVectorImpl<SDValue> &OutVals,
2175 const SDLoc &dl, SelectionDAG &DAG) const {
2176 MachineFunction &MF = DAG.getMachineFunction();
2177 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2179 // In some cases we need to disable registers from the default CSR list.
2180 // For example, when they are used for argument passing.
2181 bool ShouldDisableCalleeSavedRegister =
2182 CallConv == CallingConv::X86_RegCall ||
2183 MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
2185 if (CallConv == CallingConv::X86_INTR && !Outs.empty())
2186 report_fatal_error("X86 interrupts may not return any value");
2188 SmallVector<CCValAssign, 16> RVLocs;
2189 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
2190 CCInfo.AnalyzeReturn(Outs, RetCC_X86);
2193 SmallVector<SDValue, 6> RetOps;
2194 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2195 // Operand #1 = Bytes To Pop
2196 RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
2199 // Copy the result values into the output registers.
2200 for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
2202 CCValAssign &VA = RVLocs[I];
2203 assert(VA.isRegLoc() && "Can only return in registers!");
2205 // Add the register to the CalleeSaveDisableRegs list.
2206 if (ShouldDisableCalleeSavedRegister)
2207 MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
2209 SDValue ValToCopy = OutVals[OutsIndex];
2210 EVT ValVT = ValToCopy.getValueType();
2212 // Promote values to the appropriate types.
2213 if (VA.getLocInfo() == CCValAssign::SExt)
2214 ValToCopy = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), ValToCopy);
2215 else if (VA.getLocInfo() == CCValAssign::ZExt)
2216 ValToCopy = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), ValToCopy);
2217 else if (VA.getLocInfo() == CCValAssign::AExt) {
2218 if (ValVT.isVector() && ValVT.getVectorElementType() == MVT::i1)
2219 ValToCopy = lowerMasksToReg(ValToCopy, VA.getLocVT(), dl, DAG);
2221 ValToCopy = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), ValToCopy);
2223 else if (VA.getLocInfo() == CCValAssign::BCvt)
2224 ValToCopy = DAG.getBitcast(VA.getLocVT(), ValToCopy);
2226 assert(VA.getLocInfo() != CCValAssign::FPExt &&
2227 "Unexpected FP-extend for return value.");
2229 // If this is x86-64, and we disabled SSE, we can't return FP values,
2230 // or SSE or MMX vectors.
2231 if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
2232 VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
2233 (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
2234 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2235 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2236 } else if (ValVT == MVT::f64 &&
2237 (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
2238 // Likewise we can't return F64 values with SSE1 only. gcc does so, but
2239 // llvm-gcc has never done it right and no one has noticed, so this
2240 // should be OK for now.
2241 errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
2242 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2245 // Returns in ST0/ST1 are handled specially: these are pushed as operands to
2246 // the RET instruction and handled by the FP Stackifier.
2247 if (VA.getLocReg() == X86::FP0 ||
2248 VA.getLocReg() == X86::FP1) {
2249 // If this is a copy from an xmm register to ST(0), use an FPExtend to
2250 // change the value to the FP stack register class.
2251 if (isScalarFPTypeInSSEReg(VA.getValVT()))
2252 ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
2253 RetOps.push_back(ValToCopy);
2254 // Don't emit a copytoreg.
2258 // 64-bit vector (MMX) values are returned in XMM0 / XMM1 except for v1i64
2259 // which is returned in RAX / RDX.
2260 if (Subtarget.is64Bit()) {
2261 if (ValVT == MVT::x86mmx) {
2262 if (VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) {
2263 ValToCopy = DAG.getBitcast(MVT::i64, ValToCopy);
2264 ValToCopy = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
2266 // If we don't have SSE2 available, convert to v4f32 so the generated
2267 // register is legal.
2268 if (!Subtarget.hasSSE2())
2269 ValToCopy = DAG.getBitcast(MVT::v4f32, ValToCopy);
2274 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
2276 if (VA.needsCustom()) {
2277 assert(VA.getValVT() == MVT::v64i1 &&
2278 "Currently the only custom case is when we split v64i1 to 2 regs");
2280 Passv64i1ArgInRegs(dl, DAG, Chain, ValToCopy, RegsToPass, VA, RVLocs[++I],
2283 assert(2 == RegsToPass.size() &&
2284 "Expecting two registers after Pass64BitArgInRegs");
2286 // Add the second register to the CalleeSaveDisableRegs list.
2287 if (ShouldDisableCalleeSavedRegister)
2288 MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
2290 RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
2293 // Add nodes to the DAG and add the values into the RetOps list
2294 for (auto &Reg : RegsToPass) {
2295 Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
2296 Flag = Chain.getValue(1);
2297 RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
2301 // Swift calling convention does not require we copy the sret argument
2302 // into %rax/%eax for the return, and SRetReturnReg is not set for Swift.
2304 // All x86 ABIs require that for returning structs by value we copy
2305 // the sret argument into %rax/%eax (depending on ABI) for the return.
2306 // We saved the argument into a virtual register in the entry block,
2307 // so now we copy the value out and into %rax/%eax.
2309 // Checking Function.hasStructRetAttr() here is insufficient because the IR
2310 // may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
2311 // false, then an sret argument may be implicitly inserted in the SelDAG. In
2312 // either case FuncInfo->setSRetReturnReg() will have been called.
2313 if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
2314 // When we have both sret and another return value, we should use the
2315 // original Chain stored in RetOps[0], instead of the current Chain updated
2316 // in the above loop. If we only have sret, RetOps[0] equals to Chain.
2318 // For the case of sret and another return value, we have
2319 // Chain_0 at the function entry
2320 // Chain_1 = getCopyToReg(Chain_0) in the above loop
2321 // If we use Chain_1 in getCopyFromReg, we will have
2322 // Val = getCopyFromReg(Chain_1)
2323 // Chain_2 = getCopyToReg(Chain_1, Val) from below
2325 // getCopyToReg(Chain_0) will be glued together with
2326 // getCopyToReg(Chain_1, Val) into Unit A, getCopyFromReg(Chain_1) will be
2327 // in Unit B, and we will have cyclic dependency between Unit A and Unit B:
2328 // Data dependency from Unit B to Unit A due to usage of Val in
2329 // getCopyToReg(Chain_1, Val)
2330 // Chain dependency from Unit A to Unit B
2332 // So here, we use RetOps[0] (i.e Chain_0) for getCopyFromReg.
2333 SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
2334 getPointerTy(MF.getDataLayout()));
2337 = (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
2338 X86::RAX : X86::EAX;
2339 Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
2340 Flag = Chain.getValue(1);
2342 // RAX/EAX now acts like a return value.
2344 DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
2346 // Add the returned register to the CalleeSaveDisableRegs list.
2347 if (ShouldDisableCalleeSavedRegister)
2348 MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
2351 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
2352 const MCPhysReg *I =
2353 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
2356 if (X86::GR64RegClass.contains(*I))
2357 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
2359 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
2363 RetOps[0] = Chain; // Update chain.
2365 // Add the flag if we have it.
2367 RetOps.push_back(Flag);
2369 X86ISD::NodeType opcode = X86ISD::RET_FLAG;
2370 if (CallConv == CallingConv::X86_INTR)
2371 opcode = X86ISD::IRET;
2372 return DAG.getNode(opcode, dl, MVT::Other, RetOps);
2375 bool X86TargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
2376 if (N->getNumValues() != 1 || !N->hasNUsesOfValue(1, 0))
2379 SDValue TCChain = Chain;
2380 SDNode *Copy = *N->use_begin();
2381 if (Copy->getOpcode() == ISD::CopyToReg) {
2382 // If the copy has a glue operand, we conservatively assume it isn't safe to
2383 // perform a tail call.
2384 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
2386 TCChain = Copy->getOperand(0);
2387 } else if (Copy->getOpcode() != ISD::FP_EXTEND)
2390 bool HasRet = false;
2391 for (SDNode::use_iterator UI = Copy->use_begin(), UE = Copy->use_end();
2393 if (UI->getOpcode() != X86ISD::RET_FLAG)
2395 // If we are returning more than one value, we can definitely
2396 // not make a tail call; see PR19530.
2397 if (UI->getNumOperands() > 4)
2399 if (UI->getNumOperands() == 4 &&
2400 UI->getOperand(UI->getNumOperands()-1).getValueType() != MVT::Glue)
2412 EVT X86TargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
2413 ISD::NodeType ExtendKind) const {
2414 MVT ReturnMVT = MVT::i32;
2416 bool Darwin = Subtarget.getTargetTriple().isOSDarwin();
2417 if (VT == MVT::i1 || (!Darwin && (VT == MVT::i8 || VT == MVT::i16))) {
2418 // The ABI does not require i1, i8 or i16 to be extended.
2420 // On Darwin, there is code in the wild relying on Clang's old behaviour of
2421 // always extending i8/i16 return values, so keep doing that for now.
2423 ReturnMVT = MVT::i8;
2426 EVT MinVT = getRegisterType(Context, ReturnMVT);
2427 return VT.bitsLT(MinVT) ? MinVT : VT;
2430 /// Reads two 32-bit registers and creates a 64-bit mask value.
2431 /// \param VA The current 32-bit value that needs to be assigned.
2432 /// \param NextVA The next 32-bit value that needs to be assigned.
2433 /// \param Root The parent DAG node.
2434 /// \param [in,out] InFlag Represents the SDValue in the parent DAG node used
2435 /// for glue purposes. In case the DAG is already using a
2436 /// physical register instead of a virtual one, we should glue
2437 /// our new SDValue to the InFlag SDValue.
2438 /// \return a new 64-bit SDValue.
2439 static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
2440 SDValue &Root, SelectionDAG &DAG,
2441 const SDLoc &Dl, const X86Subtarget &Subtarget,
2442 SDValue *InFlag = nullptr) {
2443 assert((Subtarget.hasBWI()) && "Expected AVX512BW target!");
2444 assert(Subtarget.is32Bit() && "Expecting 32 bit target");
2445 assert(VA.getValVT() == MVT::v64i1 &&
2446 "Expecting first location of 64 bit width type");
2447 assert(NextVA.getValVT() == VA.getValVT() &&
2448 "The locations should have the same type");
2449 assert(VA.isRegLoc() && NextVA.isRegLoc() &&
2450 "The values should reside in two registers");
2454 SDValue ArgValueLo, ArgValueHi;
2456 MachineFunction &MF = DAG.getMachineFunction();
2457 const TargetRegisterClass *RC = &X86::GR32RegClass;
2459 // Read a 32 bit value from the registers
2460 if (nullptr == InFlag) {
2461 // When no physical register is present,
2462 // create an intermediate virtual register
2463 Reg = MF.addLiveIn(VA.getLocReg(), RC);
2464 ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2465 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
2466 ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
2468 // When a physical register is available, read the value from it and glue
2469 // the reads together.
2471 DAG.getCopyFromReg(Root, Dl, VA.getLocReg(), MVT::i32, *InFlag);
2472 *InFlag = ArgValueLo.getValue(2);
2474 DAG.getCopyFromReg(Root, Dl, NextVA.getLocReg(), MVT::i32, *InFlag);
2475 *InFlag = ArgValueHi.getValue(2);
2478 // Convert the i32 type into v32i1 type
2479 Lo = DAG.getBitcast(MVT::v32i1, ArgValueLo);
2481 // Convert the i32 type into v32i1 type
2482 Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
2484 // Concatenate the two values together
2485 return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
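// The low 32 elements of the v64i1 result come from VA's register and the
// high 32 elements from NextVA's register.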
2488 /// Lowers a register of size 8/16/32/64 bits to a mask value of the
2489 /// expected size (v8i1/v16i1/v32i1/v64i1).
2490 /// \returns a DAG node containing the operand after lowering to the mask type.
2491 static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
2492 const EVT &ValLoc, const SDLoc &Dl,
2493 SelectionDAG &DAG) {
2494 SDValue ValReturned = ValArg;
2496 if (ValVT == MVT::v1i1)
2497 return DAG.getNode(ISD::SCALAR_TO_VECTOR, Dl, MVT::v1i1, ValReturned);
2499 if (ValVT == MVT::v64i1) {
2500 // On a 32-bit machine this case is handled by getv64i1Argument.
2501 assert(ValLoc == MVT::i64 && "Expecting only i64 locations");
2502 // On a 64-bit machine there is no need to truncate the value, only bitcast it.
2505 switch (ValVT.getSimpleVT().SimpleTy) {
2516 llvm_unreachable("Expecting a vector of i1 types");
2519 ValReturned = DAG.getNode(ISD::TRUNCATE, Dl, maskLen, ValReturned);
2521 return DAG.getBitcast(ValVT, ValReturned);
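// For example, a v16i1 value returned in an i32 location is first truncated
// to i16 and then bitcast to v16i1.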
2524 /// Lower the result values of a call into the
2525 /// appropriate copies out of appropriate physical registers.
2527 SDValue X86TargetLowering::LowerCallResult(
2528 SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
2529 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2530 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
2531 uint32_t *RegMask) const {
2533 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
2534 // Assign locations to each value returned by this call.
2535 SmallVector<CCValAssign, 16> RVLocs;
2536 bool Is64Bit = Subtarget.is64Bit();
2537 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2539 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
2541 // Copy all of the result registers out of their specified physreg.
2542 for (unsigned I = 0, InsIndex = 0, E = RVLocs.size(); I != E;
2544 CCValAssign &VA = RVLocs[I];
2545 EVT CopyVT = VA.getLocVT();
2547 // In some calling conventions we need to remove the used registers
2548 // from the register mask.
2550 for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
2551 SubRegs.isValid(); ++SubRegs)
2552 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
2555 // If this is x86-64, and we disabled SSE, we can't return FP values
2556 if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
2557 ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
2558 errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
2559 VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
2562 // If we prefer to use the value in xmm registers, copy it out as f80 and
2563 // use a truncate to move it from fp stack reg to xmm reg.
2564 bool RoundAfterCopy = false;
2565 if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
2566 isScalarFPTypeInSSEReg(VA.getValVT())) {
2567 if (!Subtarget.hasX87())
2568 report_fatal_error("X87 register return with X87 disabled");
2570 RoundAfterCopy = (CopyVT != VA.getLocVT());
2574 if (VA.needsCustom()) {
2575 assert(VA.getValVT() == MVT::v64i1 &&
2576 "Currently the only custom case is when we split v64i1 to 2 regs");
2578 getv64i1Argument(VA, RVLocs[++I], Chain, DAG, dl, Subtarget, &InFlag);
2580 Chain = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), CopyVT, InFlag)
2582 Val = Chain.getValue(0);
2583 InFlag = Chain.getValue(2);
2587 Val = DAG.getNode(ISD::FP_ROUND, dl, VA.getValVT(), Val,
2588 // This truncation won't change the value.
2589 DAG.getIntPtrConstant(1, dl));
2591 if (VA.isExtInLoc() && (VA.getValVT().getScalarType() == MVT::i1)) {
2592 if (VA.getValVT().isVector() &&
2593 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
2594 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
2595 // promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
2596 Val = lowerRegToMasks(Val, VA.getValVT(), VA.getLocVT(), dl, DAG);
2598 Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
2601 InVals.push_back(Val);
2607 //===----------------------------------------------------------------------===//
2608 // C & StdCall & Fast Calling Convention implementation
2609 //===----------------------------------------------------------------------===//
2610 // The StdCall calling convention is standard for many Windows API routines.
2611 // It differs from the C calling convention only slightly: the callee cleans
2612 // up the stack rather than the caller, and symbols are also decorated.
2613 // It doesn't support any vector arguments.
2614 // For info on fast calling convention see Fast Calling Convention (tail call)
2615 // implementation LowerX86_32FastCCCallTo.
2617 /// Determines whether a call uses struct return semantics.
2619 enum StructReturnType {
2624 static StructReturnType
2625 callIsStructReturn(const SmallVectorImpl<ISD::OutputArg> &Outs, bool IsMCU) {
2627 return NotStructReturn;
2629 const ISD::ArgFlagsTy &Flags = Outs[0].Flags;
2630 if (!Flags.isSRet())
2631 return NotStructReturn;
2632 if (Flags.isInReg() || IsMCU)
2633 return RegStructReturn;
2634 return StackStructReturn;
2637 /// Determines whether a function uses struct return semantics.
2638 static StructReturnType
2639 argsAreStructReturn(const SmallVectorImpl<ISD::InputArg> &Ins, bool IsMCU) {
2641 return NotStructReturn;
2643 const ISD::ArgFlagsTy &Flags = Ins[0].Flags;
2644 if (!Flags.isSRet())
2645 return NotStructReturn;
2646 if (Flags.isInReg() || IsMCU)
2647 return RegStructReturn;
2648 return StackStructReturn;
2651 /// Make a copy of an aggregate at address specified by "Src" to address
2652 /// "Dst" with size and alignment information specified by the specific
2653 /// parameter attribute. The copy will be passed as a byval function parameter.
2654 static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
2655 SDValue Chain, ISD::ArgFlagsTy Flags,
2656 SelectionDAG &DAG, const SDLoc &dl) {
2657 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2659 return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
2660 /*isVolatile*/false, /*AlwaysInline=*/true,
2661 /*isTailCall*/false,
2662 MachinePointerInfo(), MachinePointerInfo());
2665 /// Return true if the calling convention is one that we can guarantee TCO for.
2666 static bool canGuaranteeTCO(CallingConv::ID CC) {
2667 return (CC == CallingConv::Fast || CC == CallingConv::GHC ||
2668 CC == CallingConv::X86_RegCall || CC == CallingConv::HiPE ||
2669 CC == CallingConv::HHVM);
2672 /// Return true if we might ever do TCO for calls with this calling convention.
2673 static bool mayTailCallThisCC(CallingConv::ID CC) {
2675 // C calling conventions:
2676 case CallingConv::C:
2677 case CallingConv::Win64:
2678 case CallingConv::X86_64_SysV:
2679 // Callee pop conventions:
2680 case CallingConv::X86_ThisCall:
2681 case CallingConv::X86_StdCall:
2682 case CallingConv::X86_VectorCall:
2683 case CallingConv::X86_FastCall:
2686 return canGuaranteeTCO(CC);
2690 /// Return true if the function is being made into a tailcall target by
2691 /// changing its ABI.
2692 static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
2693 return GuaranteedTailCallOpt && canGuaranteeTCO(CC);
2696 bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
2698 CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
2699 if (!CI->isTailCall() || Attr.getValueAsString() == "true")
2702 ImmutableCallSite CS(CI);
2703 CallingConv::ID CalleeCC = CS.getCallingConv();
2704 if (!mayTailCallThisCC(CalleeCC))
2711 X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
2712 const SmallVectorImpl<ISD::InputArg> &Ins,
2713 const SDLoc &dl, SelectionDAG &DAG,
2714 const CCValAssign &VA,
2715 MachineFrameInfo &MFI, unsigned i) const {
2716 // Create the nodes corresponding to a load from this parameter slot.
2717 ISD::ArgFlagsTy Flags = Ins[i].Flags;
2718 bool AlwaysUseMutable = shouldGuaranteeTCO(
2719 CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
2720 bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
2722 MVT PtrVT = getPointerTy(DAG.getDataLayout());
2724 // If the value is passed by pointer, the address is passed instead of the
2725 // value itself. No need to extend if the mask value and its location share the same size.
2727 bool ExtendedInMem =
2728 VA.isExtInLoc() && VA.getValVT().getScalarType() == MVT::i1 &&
2729 VA.getValVT().getSizeInBits() != VA.getLocVT().getSizeInBits();
2731 if (VA.getLocInfo() == CCValAssign::Indirect || ExtendedInMem)
2732 ValVT = VA.getLocVT();
2734 ValVT = VA.getValVT();
2736 // Calculate SP offset of interrupt parameter, re-arrange the slot normally
2737 // taken by a return address.
2739 if (CallConv == CallingConv::X86_INTR) {
2740 // X86 interrupts may take one or two arguments.
2741 // Unlike a regular call, there is no return address on the stack.
2742 // The offset of the last argument needs to be set to -4/-8 bytes.
2743 // The offset of the first of the two arguments should be set to 0 bytes.
2744 Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
2745 if (Subtarget.is64Bit() && Ins.size() == 2) {
2746 // The stack pointer needs to be realigned for 64 bit handlers with error
2747 // code, so the argument offset changes by 8 bytes.
2752 // FIXME: For now, all byval parameter objects are marked mutable. This can be
2753 // changed with more analysis.
2754 // In case of tail call optimization, mark all arguments mutable, since they
2755 // could be overwritten by the lowering of arguments for a tail call.
2756 if (Flags.isByVal()) {
2757 unsigned Bytes = Flags.getByValSize();
2758 if (Bytes == 0) Bytes = 1; // Don't create zero-sized stack objects.
2759 int FI = MFI.CreateFixedObject(Bytes, VA.getLocMemOffset(), isImmutable);
2760 // Adjust SP offset of interrupt parameter.
2761 if (CallConv == CallingConv::X86_INTR) {
2762 MFI.setObjectOffset(FI, Offset);
2764 return DAG.getFrameIndex(FI, PtrVT);
2767 // This is an argument in memory. We might be able to perform copy elision.
2768 if (Flags.isCopyElisionCandidate()) {
2769 EVT ArgVT = Ins[i].ArgVT;
2771 if (Ins[i].PartOffset == 0) {
2772 // If this is a one-part value or the first part of a multi-part value,
2773 // create a stack object for the entire argument value type and return a
2774 // load from our portion of it. This assumes that if the first part of an
2775 // argument is in memory, the rest will also be in memory.
2776 int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
2777 /*Immutable=*/false);
2778 PartAddr = DAG.getFrameIndex(FI, PtrVT);
2780 ValVT, dl, Chain, PartAddr,
2781 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2783 // This is not the first piece of an argument in memory. See if there is
2784 // already a fixed stack object including this offset. If so, assume it
2785 // was created by the PartOffset == 0 branch above and create a load from
2786 // the appropriate offset into it.
2787 int64_t PartBegin = VA.getLocMemOffset();
2788 int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
2789 int FI = MFI.getObjectIndexBegin();
2790 for (; MFI.isFixedObjectIndex(FI); ++FI) {
2791 int64_t ObjBegin = MFI.getObjectOffset(FI);
2792 int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
2793 if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
2796 if (MFI.isFixedObjectIndex(FI)) {
2798 DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
2799 DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
2801 ValVT, dl, Chain, Addr,
2802 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
2803 Ins[i].PartOffset));
2808 int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
2809 VA.getLocMemOffset(), isImmutable);
2811 // Set SExt or ZExt flag.
2812 if (VA.getLocInfo() == CCValAssign::ZExt) {
2813 MFI.setObjectZExt(FI, true);
2814 } else if (VA.getLocInfo() == CCValAssign::SExt) {
2815 MFI.setObjectSExt(FI, true);
2818 // Adjust SP offset of interrupt parameter.
2819 if (CallConv == CallingConv::X86_INTR) {
2820 MFI.setObjectOffset(FI, Offset);
2823 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
2824 SDValue Val = DAG.getLoad(
2825 ValVT, dl, Chain, FIN,
2826 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
2827 return ExtendedInMem
2828 ? (VA.getValVT().isVector()
2829 ? DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VA.getValVT(), Val)
2830 : DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val))
2834 // FIXME: Get this from tablegen.
2835 static ArrayRef<MCPhysReg> get64BitArgumentGPRs(CallingConv::ID CallConv,
2836 const X86Subtarget &Subtarget) {
2837 assert(Subtarget.is64Bit());
2839 if (Subtarget.isCallingConvWin64(CallConv)) {
2840 static const MCPhysReg GPR64ArgRegsWin64[] = {
2841 X86::RCX, X86::RDX, X86::R8, X86::R9
2843 return makeArrayRef(std::begin(GPR64ArgRegsWin64), std::end(GPR64ArgRegsWin64));
2846 static const MCPhysReg GPR64ArgRegs64Bit[] = {
2847 X86::RDI, X86::RSI, X86::RDX, X86::RCX, X86::R8, X86::R9
2849 return makeArrayRef(std::begin(GPR64ArgRegs64Bit), std::end(GPR64ArgRegs64Bit));
2852 // FIXME: Get this from tablegen.
2853 static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
2854 CallingConv::ID CallConv,
2855 const X86Subtarget &Subtarget) {
2856 assert(Subtarget.is64Bit());
2857 if (Subtarget.isCallingConvWin64(CallConv)) {
2858 // The XMM registers which might contain var arg parameters are shadowed
2859     // in their paired GPR, so we only need to save the GPRs to their home slots.
2861 // TODO: __vectorcall will change this.
2865 const Function *Fn = MF.getFunction();
2866 bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
2867 bool isSoftFloat = Subtarget.useSoftFloat();
2868 assert(!(isSoftFloat && NoImplicitFloatOps) &&
2869 "SSE register cannot be used when SSE is disabled!");
2870 if (isSoftFloat || NoImplicitFloatOps || !Subtarget.hasSSE1())
2871     // Kernel mode asks for SSE to be disabled, so there are no XMM argument registers.
2875 static const MCPhysReg XMMArgRegs64Bit[] = {
2876 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
2877 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
2879 return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
2883 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
2884 return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
2885 [](const CCValAssign &A, const CCValAssign &B) -> bool {
2886 return A.getValNo() < B.getValNo();
2891 SDValue X86TargetLowering::LowerFormalArguments(
2892 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
2893 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
2894 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
2895 MachineFunction &MF = DAG.getMachineFunction();
2896 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
2897 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
2899 const Function *Fn = MF.getFunction();
2900 if (Fn->hasExternalLinkage() &&
2901 Subtarget.isTargetCygMing() &&
2902 Fn->getName() == "main")
2903 FuncInfo->setForceFramePointer(true);
2905 MachineFrameInfo &MFI = MF.getFrameInfo();
2906 bool Is64Bit = Subtarget.is64Bit();
2907 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
2910          !(isVarArg && canGuaranteeTCO(CallConv)) &&
2911          "Var args not supported with calling conventions regcall, fastcc, ghc or hipe");
2913 if (CallConv == CallingConv::X86_INTR) {
2914 bool isLegal = Ins.size() == 1 ||
2915 (Ins.size() == 2 && ((Is64Bit && Ins[1].VT == MVT::i64) ||
2916 (!Is64Bit && Ins[1].VT == MVT::i32)));
2918 report_fatal_error("X86 interrupts may take one or two arguments");
2921 // Assign locations to all of the incoming arguments.
2922 SmallVector<CCValAssign, 16> ArgLocs;
2923 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
2925 // Allocate shadow area for Win64.
2927 CCInfo.AllocateStack(32, 8);
2929 CCInfo.AnalyzeArguments(Ins, CC_X86);
2931   // In the vectorcall calling convention a second pass is required for HVA (homogeneous vector aggregate) arguments.
2933 if (CallingConv::X86_VectorCall == CallConv) {
2934 CCInfo.AnalyzeArgumentsSecondPass(Ins, CC_X86);
2937   // The next loop assumes that the locations are in the same order as the given arguments.
2939 assert(isSortedByValueNo(ArgLocs) &&
2940 "Argument Location list must be sorted before lowering");
2943 for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
2945 assert(InsIndex < Ins.size() && "Invalid Ins index");
2946 CCValAssign &VA = ArgLocs[I];
2948 if (VA.isRegLoc()) {
2949 EVT RegVT = VA.getLocVT();
2950 if (VA.needsCustom()) {
2952 VA.getValVT() == MVT::v64i1 &&
2953 "Currently the only custom case is when we split v64i1 to 2 regs");
2955         // In the regcall calling convention, v64i1 values compiled for a 32-bit
2956         // target are split up into two registers.
2958 getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
2960 const TargetRegisterClass *RC;
2961 if (RegVT == MVT::i32)
2962 RC = &X86::GR32RegClass;
2963 else if (Is64Bit && RegVT == MVT::i64)
2964 RC = &X86::GR64RegClass;
2965 else if (RegVT == MVT::f32)
2966 RC = Subtarget.hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
2967 else if (RegVT == MVT::f64)
2968 RC = Subtarget.hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
2969 else if (RegVT == MVT::f80)
2970 RC = &X86::RFP80RegClass;
2971 else if (RegVT == MVT::f128)
2972 RC = &X86::FR128RegClass;
2973 else if (RegVT.is512BitVector())
2974 RC = &X86::VR512RegClass;
2975 else if (RegVT.is256BitVector())
2976 RC = Subtarget.hasVLX() ? &X86::VR256XRegClass : &X86::VR256RegClass;
2977 else if (RegVT.is128BitVector())
2978 RC = Subtarget.hasVLX() ? &X86::VR128XRegClass : &X86::VR128RegClass;
2979 else if (RegVT == MVT::x86mmx)
2980 RC = &X86::VR64RegClass;
2981 else if (RegVT == MVT::v1i1)
2982 RC = &X86::VK1RegClass;
2983 else if (RegVT == MVT::v8i1)
2984 RC = &X86::VK8RegClass;
2985 else if (RegVT == MVT::v16i1)
2986 RC = &X86::VK16RegClass;
2987 else if (RegVT == MVT::v32i1)
2988 RC = &X86::VK32RegClass;
2989 else if (RegVT == MVT::v64i1)
2990 RC = &X86::VK64RegClass;
2992 llvm_unreachable("Unknown argument type!");
2994 unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
2995 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
2998 // If this is an 8 or 16-bit value, it is really passed promoted to 32
2999       // bits. Insert an assert[sz]ext to capture this, then truncate to the right size.
3001 if (VA.getLocInfo() == CCValAssign::SExt)
3002 ArgValue = DAG.getNode(ISD::AssertSext, dl, RegVT, ArgValue,
3003 DAG.getValueType(VA.getValVT()));
3004 else if (VA.getLocInfo() == CCValAssign::ZExt)
3005 ArgValue = DAG.getNode(ISD::AssertZext, dl, RegVT, ArgValue,
3006 DAG.getValueType(VA.getValVT()));
3007 else if (VA.getLocInfo() == CCValAssign::BCvt)
3008 ArgValue = DAG.getBitcast(VA.getValVT(), ArgValue);
3010 if (VA.isExtInLoc()) {
3011 // Handle MMX values passed in XMM regs.
3012 if (RegVT.isVector() && VA.getValVT().getScalarType() != MVT::i1)
3013 ArgValue = DAG.getNode(X86ISD::MOVDQ2Q, dl, VA.getValVT(), ArgValue);
3014 else if (VA.getValVT().isVector() &&
3015 VA.getValVT().getScalarType() == MVT::i1 &&
3016 ((VA.getLocVT() == MVT::i64) || (VA.getLocVT() == MVT::i32) ||
3017 (VA.getLocVT() == MVT::i16) || (VA.getLocVT() == MVT::i8))) {
3018 // Promoting a mask type (v*i1) into a register of type i64/i32/i16/i8
3019 ArgValue = lowerRegToMasks(ArgValue, VA.getValVT(), RegVT, dl, DAG);
3021 ArgValue = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), ArgValue);
3024 assert(VA.isMemLoc());
3026 LowerMemArgument(Chain, CallConv, Ins, dl, DAG, VA, MFI, InsIndex);
3029     // If the value is passed via a pointer, do a load.
3030 if (VA.getLocInfo() == CCValAssign::Indirect)
3032 DAG.getLoad(VA.getValVT(), dl, Chain, ArgValue, MachinePointerInfo());
3034 InVals.push_back(ArgValue);
3037 for (unsigned I = 0, E = Ins.size(); I != E; ++I) {
3038 // Swift calling convention does not require we copy the sret argument
3039 // into %rax/%eax for the return. We don't set SRetReturnReg for Swift.
3040 if (CallConv == CallingConv::Swift)
3043 // All x86 ABIs require that for returning structs by value we copy the
3044 // sret argument into %rax/%eax (depending on ABI) for the return. Save
3045     // the argument into a virtual register so that we can access it from the return points.
3047 if (Ins[I].Flags.isSRet()) {
3048 unsigned Reg = FuncInfo->getSRetReturnReg();
3050 MVT PtrTy = getPointerTy(DAG.getDataLayout());
3051 Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
3052 FuncInfo->setSRetReturnReg(Reg);
3054 SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[I]);
3055 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
3060 unsigned StackSize = CCInfo.getNextStackOffset();
3061 // Align stack specially for tail calls.
3062 if (shouldGuaranteeTCO(CallConv,
3063 MF.getTarget().Options.GuaranteedTailCallOpt))
3064 StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
3066 // If the function takes variable number of arguments, make a frame index for
3067 // the start of the first vararg value... for expansion of llvm.va_start. We
3068 // can skip this if there are no va_start calls.
3069 if (MFI.hasVAStart() &&
3070 (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
3071 CallConv != CallingConv::X86_ThisCall))) {
3072 FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
3075 // Figure out if XMM registers are in use.
3076 assert(!(Subtarget.useSoftFloat() &&
3077 Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
3078 "SSE register cannot be used when SSE is disabled!");
3080 // 64-bit calling conventions support varargs and register parameters, so we
3081 // have to do extra work to spill them in the prologue.
3082 if (Is64Bit && isVarArg && MFI.hasVAStart()) {
3083 // Find the first unallocated argument registers.
3084 ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
3085 ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
3086 unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
3087 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
3088 assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
3089 "SSE register cannot be used when SSE is disabled!");
3091 // Gather all the live in physical registers.
3092 SmallVector<SDValue, 6> LiveGPRs;
3093 SmallVector<SDValue, 8> LiveXMMRegs;
3095 for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
3096 unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
3098 DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
3100 if (!ArgXMMs.empty()) {
3101 unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3102 ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
3103 for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
3104 unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
3105 LiveXMMRegs.push_back(
3106 DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
3111 // Get to the caller-allocated home save location. Add 8 to account
3112 // for the return address.
3113 int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
3114 FuncInfo->setRegSaveFrameIndex(
3115 MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
3116 // Fixup to set vararg frame on shadow area (4 x i64).
3118 FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
3120 // For X86-64, if there are vararg parameters that are passed via
3121 // registers, then we must store them to their spots on the stack so
3122 // they may be loaded by dereferencing the result of va_next.
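      // As a rough sketch (per the SysV AMD64 ABI), the register save area created
      // below holds the 6 integer argument registers (6 * 8 = 48 bytes) followed by
      // the 8 XMM argument registers (8 * 16 = 128 bytes), and va_arg walks it via:
      //   struct __va_list_tag { unsigned gp_offset; unsigned fp_offset;
      //                          void *overflow_arg_area; void *reg_save_area; };
      // E.g. with two named integer arguments and one named XMM argument already
      // consumed, gp_offset starts at 16 and fp_offset at 48 + 16 = 64.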
3123 FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
3124 FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
3125 FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
3126 ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
3129 // Store the integer parameter registers.
3130 SmallVector<SDValue, 8> MemOps;
3131 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
3132 getPointerTy(DAG.getDataLayout()));
3133 unsigned Offset = FuncInfo->getVarArgsGPOffset();
3134 for (SDValue Val : LiveGPRs) {
3135 SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3136 RSFIN, DAG.getIntPtrConstant(Offset, dl));
3138 DAG.getStore(Val.getValue(1), dl, Val, FIN,
3139 MachinePointerInfo::getFixedStack(
3140 DAG.getMachineFunction(),
3141 FuncInfo->getRegSaveFrameIndex(), Offset));
3142 MemOps.push_back(Store);
3146 if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
3147 // Now store the XMM (fp + vector) parameter registers.
3148 SmallVector<SDValue, 12> SaveXMMOps;
3149 SaveXMMOps.push_back(Chain);
3150 SaveXMMOps.push_back(ALVal);
3151 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3152 FuncInfo->getRegSaveFrameIndex(), dl));
3153 SaveXMMOps.push_back(DAG.getIntPtrConstant(
3154 FuncInfo->getVarArgsFPOffset(), dl));
3155 SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
3157 MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
3158 MVT::Other, SaveXMMOps));
3161 if (!MemOps.empty())
3162 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
3165 if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
3166 // Find the largest legal vector type.
3167 MVT VecVT = MVT::Other;
3168 // FIXME: Only some x86_32 calling conventions support AVX512.
3169 if (Subtarget.hasAVX512() &&
3170 (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
3171 CallConv == CallingConv::Intel_OCL_BI)))
3172 VecVT = MVT::v16f32;
3173 else if (Subtarget.hasAVX())
3175 else if (Subtarget.hasSSE2())
3178 // We forward some GPRs and some vector types.
3179 SmallVector<MVT, 2> RegParmTypes;
3180 MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
3181 RegParmTypes.push_back(IntVT);
3182 if (VecVT != MVT::Other)
3183 RegParmTypes.push_back(VecVT);
3185 // Compute the set of forwarded registers. The rest are scratch.
3186 SmallVectorImpl<ForwardedRegister> &Forwards =
3187 FuncInfo->getForwardedMustTailRegParms();
3188 CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
3190 // Conservatively forward AL on x86_64, since it might be used for varargs.
3191 if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
3192 unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
3193 Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
3196 // Copy all forwards from physical to virtual registers.
3197 for (ForwardedRegister &F : Forwards) {
3198 // FIXME: Can we use a less constrained schedule?
3199 SDValue RegVal = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3200 F.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(F.VT));
3201 Chain = DAG.getCopyToReg(Chain, dl, F.VReg, RegVal);
3205 // Some CCs need callee pop.
3206 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3207 MF.getTarget().Options.GuaranteedTailCallOpt)) {
3208 FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
3209 } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
3210     // X86 interrupts must pop the error code (and the alignment padding) if present.
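    // Concretely: on 64-bit targets that is the 8-byte error code plus 8 bytes of
    // realignment padding (16 total); on 32-bit targets it is just the 4-byte error code.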
3212 FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
3214 FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
3215 // If this is an sret function, the return should pop the hidden pointer.
3216 if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3217 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3218 argsAreStructReturn(Ins, Subtarget.isTargetMCU()) == StackStructReturn)
3219 FuncInfo->setBytesToPopOnReturn(4);
3223 // RegSaveFrameIndex is X86-64 only.
3224 FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
3225 if (CallConv == CallingConv::X86_FastCall ||
3226 CallConv == CallingConv::X86_ThisCall)
3227 // fastcc functions can't have varargs.
3228 FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
3231 FuncInfo->setArgumentStackSize(StackSize);
3233 if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
3234 EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
3235 if (Personality == EHPersonality::CoreCLR) {
3237 // TODO: Add a mechanism to frame lowering that will allow us to indicate
3238 // that we'd prefer this slot be allocated towards the bottom of the frame
3239 // (i.e. near the stack pointer after allocating the frame). Every
3240 // funclet needs a copy of this slot in its (mostly empty) frame, and the
3241 // offset from the bottom of this and each funclet's frame must be the
3242 // same, so the size of funclets' (mostly empty) frames is dictated by
3243 // how far this slot is from the bottom (since they allocate just enough
3244 // space to accommodate holding this slot at the correct offset).
3245 int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
3246 EHInfo->PSPSymFrameIdx = PSPSymFI;
3250 if (CallConv == CallingConv::X86_RegCall ||
3251 Fn->hasFnAttribute("no_caller_saved_registers")) {
3252 const MachineRegisterInfo &MRI = MF.getRegInfo();
3253 for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
3254 MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
3260 SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
3261 SDValue Arg, const SDLoc &dl,
3263 const CCValAssign &VA,
3264 ISD::ArgFlagsTy Flags) const {
3265 unsigned LocMemOffset = VA.getLocMemOffset();
3266 SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
3267 PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3269 if (Flags.isByVal())
3270 return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
3272 return DAG.getStore(
3273 Chain, dl, Arg, PtrOff,
3274 MachinePointerInfo::getStack(DAG.getMachineFunction(), LocMemOffset));
3277 /// Emit a load of the return address if tail call
3278 /// optimization is performed and it is required.
3279 SDValue X86TargetLowering::EmitTailCallLoadRetAddr(
3280 SelectionDAG &DAG, SDValue &OutRetAddr, SDValue Chain, bool IsTailCall,
3281 bool Is64Bit, int FPDiff, const SDLoc &dl) const {
3282 // Adjust the Return address stack slot.
3283 EVT VT = getPointerTy(DAG.getDataLayout());
3284 OutRetAddr = getReturnAddressFrameIndex(DAG);
3286 // Load the "old" Return address.
3287 OutRetAddr = DAG.getLoad(VT, dl, Chain, OutRetAddr, MachinePointerInfo());
3288 return SDValue(OutRetAddr.getNode(), 1);
3291 /// Emit a store of the return address if tail call
3292 /// optimization is performed and it is required (FPDiff!=0).
3293 static SDValue EmitTailCallStoreRetAddr(SelectionDAG &DAG, MachineFunction &MF,
3294 SDValue Chain, SDValue RetAddrFrIdx,
3295 EVT PtrVT, unsigned SlotSize,
3296 int FPDiff, const SDLoc &dl) {
3297 // Store the return address to the appropriate stack slot.
3298 if (!FPDiff) return Chain;
3299 // Calculate the new stack slot for the return address.
3300 int NewReturnAddrFI =
3301 MF.getFrameInfo().CreateFixedObject(SlotSize, (int64_t)FPDiff - SlotSize,
3303 SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT);
3304 Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx,
3305 MachinePointerInfo::getFixedStack(
3306 DAG.getMachineFunction(), NewReturnAddrFI));
3310 /// Returns a vector_shuffle mask for a movs{s|d}, movd
3311 /// operation of specified width.
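/// For example, for a 4-element type the intended mask is <4, 1, 2, 3>:
/// element 0 is taken from V2 and the remaining elements from V1, which matches
/// the movss/movsd semantics of merging the low element into V1.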
3312 static SDValue getMOVL(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1,
3314 unsigned NumElems = VT.getVectorNumElements();
3315 SmallVector<int, 8> Mask;
3316 Mask.push_back(NumElems);
3317 for (unsigned i = 1; i != NumElems; ++i)
3319 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
3323 X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
3324 SmallVectorImpl<SDValue> &InVals) const {
3325 SelectionDAG &DAG = CLI.DAG;
3327 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
3328 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
3329 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
3330 SDValue Chain = CLI.Chain;
3331 SDValue Callee = CLI.Callee;
3332 CallingConv::ID CallConv = CLI.CallConv;
3333 bool &isTailCall = CLI.IsTailCall;
3334 bool isVarArg = CLI.IsVarArg;
3336 MachineFunction &MF = DAG.getMachineFunction();
3337 bool Is64Bit = Subtarget.is64Bit();
3338 bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
3339 StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
3340 bool IsSibcall = false;
3341 X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
3342 auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
3343 const CallInst *CI =
3344 CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
3345 const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
3346 bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
3347 (Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
3349 if (CallConv == CallingConv::X86_INTR)
3350 report_fatal_error("X86 interrupts may not be called directly");
3352 if (Attr.getValueAsString() == "true")
3355 if (Subtarget.isPICStyleGOT() &&
3356 !MF.getTarget().Options.GuaranteedTailCallOpt) {
3357 // If we are using a GOT, disable tail calls to external symbols with
3358 // default visibility. Tail calling such a symbol requires using a GOT
3359 // relocation, which forces early binding of the symbol. This breaks code
3360     // that requires lazy function symbol resolution. Using musttail or
3361 // GuaranteedTailCallOpt will override this.
3362 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3363 if (!G || (!G->getGlobal()->hasLocalLinkage() &&
3364 G->getGlobal()->hasDefaultVisibility()))
3368 bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
3370 // Force this to be a tail call. The verifier rules are enough to ensure
3371     // that we can lower this successfully without moving the return address around.
3374 } else if (isTailCall) {
3375 // Check if it's really possible to do a tail call.
3376 isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
3377 isVarArg, SR != NotStructReturn,
3378 MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
3379 Outs, OutVals, Ins, DAG);
3381     // Sibcalls are automatically detected tailcalls which do not require ABI changes.
3383 if (!MF.getTarget().Options.GuaranteedTailCallOpt && isTailCall)
3390   assert(!(isVarArg && canGuaranteeTCO(CallConv)) &&
3391          "Var args not supported with calling conventions fastcc, ghc or hipe");
3393 // Analyze operands of the call, assigning locations to each operand.
3394 SmallVector<CCValAssign, 16> ArgLocs;
3395 CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
3397 // Allocate shadow area for Win64.
3399 CCInfo.AllocateStack(32, 8);
3401 CCInfo.AnalyzeArguments(Outs, CC_X86);
3403   // In the vectorcall calling convention a second pass is required for HVA (homogeneous vector aggregate) arguments.
3405 if (CallingConv::X86_VectorCall == CallConv) {
3406 CCInfo.AnalyzeArgumentsSecondPass(Outs, CC_X86);
3409 // Get a count of how many bytes are to be pushed on the stack.
3410 unsigned NumBytes = CCInfo.getAlignedCallFrameSize();
3412     // This is a sibcall. The memory operands are already available at their
3413     // slots in the caller's incoming argument area (its caller's stack).
3415 else if (MF.getTarget().Options.GuaranteedTailCallOpt &&
3416 canGuaranteeTCO(CallConv))
3417 NumBytes = GetAlignedArgumentStackSize(NumBytes, DAG);
3420 if (isTailCall && !IsSibcall && !IsMustTail) {
3421 // Lower arguments at fp - stackoffset + fpdiff.
3422 unsigned NumBytesCallerPushed = X86Info->getBytesToPopOnReturn();
3424 FPDiff = NumBytesCallerPushed - NumBytes;
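    // For illustration: if the caller pops 16 bytes of its own stack arguments on
    // return but the callee needs 32 bytes of argument space, FPDiff = 16 - 32 = -16,
    // so the return address slot has to move 16 bytes further down the stack.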
3426     // Record the delta of movement of the return-address stack slot, but only
3427     // when the slot has to move further than previously recorded.
3428 if (FPDiff < X86Info->getTCReturnAddrDelta())
3429 X86Info->setTCReturnAddrDelta(FPDiff);
3432 unsigned NumBytesToPush = NumBytes;
3433 unsigned NumBytesToPop = NumBytes;
3435 // If we have an inalloca argument, all stack space has already been allocated
3436   // for us and will be right at the top of the stack. We don't support multiple
3437 // arguments passed in memory when using inalloca.
3438 if (!Outs.empty() && Outs.back().Flags.isInAlloca()) {
3440 if (!ArgLocs.back().isMemLoc())
3441 report_fatal_error("cannot use inalloca attribute on a register "
3443 if (ArgLocs.back().getLocMemOffset() != 0)
3444 report_fatal_error("any parameter with the inalloca attribute must be "
3445 "the only memory argument");
3449 Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
3450 NumBytes - NumBytesToPush, dl);
3452 SDValue RetAddrFrIdx;
3453 // Load return address for tail calls.
3454 if (isTailCall && FPDiff)
3455 Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
3456 Is64Bit, FPDiff, dl);
3458 SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
3459 SmallVector<SDValue, 8> MemOpChains;
3462   // The next loop assumes that the locations are in the same order as the given arguments.
3464 assert(isSortedByValueNo(ArgLocs) &&
3465 "Argument Location list must be sorted before lowering");
3467 // Walk the register/memloc assignments, inserting copies/loads. In the case
3468   // of tail call optimization arguments are handled later.
3469 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3470 for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
3472 assert(OutIndex < Outs.size() && "Invalid Out index");
3473 // Skip inalloca arguments, they have already been written.
3474 ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
3475 if (Flags.isInAlloca())
3478 CCValAssign &VA = ArgLocs[I];
3479 EVT RegVT = VA.getLocVT();
3480 SDValue Arg = OutVals[OutIndex];
3481 bool isByVal = Flags.isByVal();
3483 // Promote the value if needed.
3484 switch (VA.getLocInfo()) {
3485 default: llvm_unreachable("Unknown loc info!");
3486 case CCValAssign::Full: break;
3487 case CCValAssign::SExt:
3488 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, RegVT, Arg);
3490 case CCValAssign::ZExt:
3491 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, RegVT, Arg);
3493 case CCValAssign::AExt:
3494 if (Arg.getValueType().isVector() &&
3495 Arg.getValueType().getVectorElementType() == MVT::i1)
3496 Arg = lowerMasksToReg(Arg, RegVT, dl, DAG);
3497 else if (RegVT.is128BitVector()) {
3498 // Special case: passing MMX values in XMM registers.
3499 Arg = DAG.getBitcast(MVT::i64, Arg);
3500 Arg = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Arg);
3501 Arg = getMOVL(DAG, dl, MVT::v2i64, DAG.getUNDEF(MVT::v2i64), Arg);
3503 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, RegVT, Arg);
3505 case CCValAssign::BCvt:
3506 Arg = DAG.getBitcast(RegVT, Arg);
3508 case CCValAssign::Indirect: {
3509 // Store the argument.
3510 SDValue SpillSlot = DAG.CreateStackTemporary(VA.getValVT());
3511 int FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
3512 Chain = DAG.getStore(
3513 Chain, dl, Arg, SpillSlot,
3514 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
3520 if (VA.needsCustom()) {
3521 assert(VA.getValVT() == MVT::v64i1 &&
3522 "Currently the only custom case is when we split v64i1 to 2 regs");
3523 // Split v64i1 value into two registers
3524 Passv64i1ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++I],
3526 } else if (VA.isRegLoc()) {
3527 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
3528 if (isVarArg && IsWin64) {
3529         // The Win64 ABI requires an argument passed in an XMM reg to be copied to the
3530         // corresponding shadow GPR if the callee is a varargs function.
3531 unsigned ShadowReg = 0;
3532 switch (VA.getLocReg()) {
3533 case X86::XMM0: ShadowReg = X86::RCX; break;
3534 case X86::XMM1: ShadowReg = X86::RDX; break;
3535 case X86::XMM2: ShadowReg = X86::R8; break;
3536 case X86::XMM3: ShadowReg = X86::R9; break;
3539 RegsToPass.push_back(std::make_pair(ShadowReg, Arg));
3541 } else if (!IsSibcall && (!isTailCall || isByVal)) {
3542 assert(VA.isMemLoc());
3543 if (!StackPtr.getNode())
3544 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3545 getPointerTy(DAG.getDataLayout()));
3546 MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
3547 dl, DAG, VA, Flags));
3551 if (!MemOpChains.empty())
3552 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
3554 if (Subtarget.isPICStyleGOT()) {
3555     // ELF / PIC requires the GOT pointer in the EBX register before function calls via the PLT.
3558 RegsToPass.push_back(std::make_pair(
3559 unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
3560 getPointerTy(DAG.getDataLayout()))));
3562     // If we are tail calling and generating PIC/GOT style code, load the
3563     // address of the callee into ECX. The value in ECX is used as the target of
3564     // the tail jump. This is done to circumvent the ebx/callee-saved problem
3565     // for tail calls on PIC/GOT architectures. Normally we would just put the
3566     // address of the GOT into ebx and then call target@PLT. But for tail calls
3567     // ebx would be restored (since ebx is callee-saved) before jumping to the target.
3570 // Note: The actual moving to ECX is done further down.
3571 GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
3572 if (G && !G->getGlobal()->hasLocalLinkage() &&
3573 G->getGlobal()->hasDefaultVisibility())
3574 Callee = LowerGlobalAddress(Callee, DAG);
3575 else if (isa<ExternalSymbolSDNode>(Callee))
3576 Callee = LowerExternalSymbol(Callee, DAG);
3580 if (Is64Bit && isVarArg && !IsWin64 && !IsMustTail) {
3581 // From AMD64 ABI document:
3582 // For calls that may call functions that use varargs or stdargs
3583 // (prototype-less calls or calls to functions containing ellipsis (...) in
3584 // the declaration) %al is used as hidden argument to specify the number
3585 // of SSE registers used. The contents of %al do not need to match exactly
3586     // the number of registers, but must be an upper bound on the number of SSE
3587 // registers used and is in the range 0 - 8 inclusive.
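    // For example, a variadic call that passes two doubles in XMM0/XMM1 may set
    // %al to 2; any value from 2 up to 8 would also satisfy the ABI, since %al
    // only needs to be an upper bound.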
3589 // Count the number of XMM registers allocated.
3590 static const MCPhysReg XMMArgRegs[] = {
3591 X86::XMM0, X86::XMM1, X86::XMM2, X86::XMM3,
3592 X86::XMM4, X86::XMM5, X86::XMM6, X86::XMM7
3594 unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
3595 assert((Subtarget.hasSSE1() || !NumXMMRegs)
3596 && "SSE registers cannot be used when SSE is disabled");
3598 RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
3599 DAG.getConstant(NumXMMRegs, dl,
3603 if (isVarArg && IsMustTail) {
3604 const auto &Forwards = X86Info->getForwardedMustTailRegParms();
3605 for (const auto &F : Forwards) {
3606 SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
3607 RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
3611 // For tail calls lower the arguments to the 'real' stack slots. Sibcalls
3612 // don't need this because the eligibility check rejects calls that require
3613 // shuffling arguments passed in memory.
3614 if (!IsSibcall && isTailCall) {
3615 // Force all the incoming stack arguments to be loaded from the stack
3616 // before any new outgoing arguments are stored to the stack, because the
3617 // outgoing stack slots may alias the incoming argument stack slots, and
3618 // the alias isn't otherwise explicit. This is slightly more conservative
3619 // than necessary, because it means that each store effectively depends
3620 // on every argument instead of just those arguments it would clobber.
3621 SDValue ArgChain = DAG.getStackArgumentTokenFactor(Chain);
3623 SmallVector<SDValue, 8> MemOpChains2;
3626 for (unsigned I = 0, OutsIndex = 0, E = ArgLocs.size(); I != E;
3628 CCValAssign &VA = ArgLocs[I];
3630 if (VA.isRegLoc()) {
3631 if (VA.needsCustom()) {
3632 assert((CallConv == CallingConv::X86_RegCall) &&
3633 "Expecting custom case only in regcall calling convention");
3634           // This means that we are in the special case where one argument was
3635           // passed through two register locations - skip the next location
3642 assert(VA.isMemLoc());
3643 SDValue Arg = OutVals[OutsIndex];
3644 ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
3645 // Skip inalloca arguments. They don't require any work.
3646 if (Flags.isInAlloca())
3648 // Create frame index.
3649 int32_t Offset = VA.getLocMemOffset()+FPDiff;
3650 uint32_t OpSize = (VA.getLocVT().getSizeInBits()+7)/8;
3651 FI = MF.getFrameInfo().CreateFixedObject(OpSize, Offset, true);
3652 FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
3654 if (Flags.isByVal()) {
3655 // Copy relative to framepointer.
3656 SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset(), dl);
3657 if (!StackPtr.getNode())
3658 StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
3659 getPointerTy(DAG.getDataLayout()));
3660 Source = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
3663 MemOpChains2.push_back(CreateCopyOfByValArgument(Source, FIN,
3667 // Store relative to framepointer.
3668 MemOpChains2.push_back(DAG.getStore(
3669 ArgChain, dl, Arg, FIN,
3670 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI)));
3674 if (!MemOpChains2.empty())
3675 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains2);
3677 // Store the return address to the appropriate stack slot.
3678 Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx,
3679 getPointerTy(DAG.getDataLayout()),
3680 RegInfo->getSlotSize(), FPDiff, dl);
3683 // Build a sequence of copy-to-reg nodes chained together with token chain
3684 // and flag operands which copy the outgoing args into registers.
3686 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
3687 Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
3688 RegsToPass[i].second, InFlag);
3689 InFlag = Chain.getValue(1);
3692 if (DAG.getTarget().getCodeModel() == CodeModel::Large) {
3693 assert(Is64Bit && "Large code model is only legal in 64-bit mode.");
3694 // In the 64-bit large code model, we have to make all calls
3695 // through a register, since the call instruction's 32-bit
3696     // pc-relative offset may not be large enough to hold the whole address.
3698 } else if (Callee->getOpcode() == ISD::GlobalAddress) {
3699 // If the callee is a GlobalAddress node (quite common, every direct call
3700     // is), turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
3702 GlobalAddressSDNode* G = cast<GlobalAddressSDNode>(Callee);
3704 // We should use extra load for direct calls to dllimported functions in
3706 const GlobalValue *GV = G->getGlobal();
3707 if (!GV->hasDLLImportStorageClass()) {
3708 unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(GV);
3710 Callee = DAG.getTargetGlobalAddress(
3711 GV, dl, getPointerTy(DAG.getDataLayout()), G->getOffset(), OpFlags);
3713 if (OpFlags == X86II::MO_GOTPCREL) {
3715 Callee = DAG.getNode(X86ISD::WrapperRIP, dl,
3716 getPointerTy(DAG.getDataLayout()), Callee);
3717 // Add extra indirection
3718 Callee = DAG.getLoad(
3719 getPointerTy(DAG.getDataLayout()), dl, DAG.getEntryNode(), Callee,
3720 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
3723 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
3724 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
3725 unsigned char OpFlags =
3726 Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
3728 Callee = DAG.getTargetExternalSymbol(
3729 S->getSymbol(), getPointerTy(DAG.getDataLayout()), OpFlags);
3730 } else if (Subtarget.isTarget64BitILP32() &&
3731 Callee->getValueType(0) == MVT::i32) {
3732     // Zero-extend the 32-bit Callee address into a 64-bit one, per the x32 ABI.
3733 Callee = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Callee);
3736 // Returns a chain & a flag for retval copy to use.
3737 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
3738 SmallVector<SDValue, 8> Ops;
3740 if (!IsSibcall && isTailCall) {
3741 Chain = DAG.getCALLSEQ_END(Chain,
3742 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3743 DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
3744 InFlag = Chain.getValue(1);
3747 Ops.push_back(Chain);
3748 Ops.push_back(Callee);
3751 Ops.push_back(DAG.getConstant(FPDiff, dl, MVT::i32));
3753   // Add argument registers to the end of the list so that they are known live into the call.
3755 for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
3756 Ops.push_back(DAG.getRegister(RegsToPass[i].first,
3757 RegsToPass[i].second.getValueType()));
3759 // Add a register mask operand representing the call-preserved registers.
3760   // If HasNCSR is asserted (the NoCallerSavedRegisters attribute is present), we
3761   // use the X86_INTR calling convention because it has the same CSR mask
3762 // (same preserved registers).
3763 const uint32_t *Mask = RegInfo->getCallPreservedMask(
3764 MF, HasNCSR ? (CallingConv::ID)CallingConv::X86_INTR : CallConv);
3765 assert(Mask && "Missing call preserved mask for calling convention");
3767 // If this is an invoke in a 32-bit function using a funclet-based
3768 // personality, assume the function clobbers all registers. If an exception
3769 // is thrown, the runtime will not restore CSRs.
3770 // FIXME: Model this more precisely so that we can register allocate across
3771 // the normal edge and spill and fill across the exceptional edge.
3772 if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
3773 const Function *CallerFn = MF.getFunction();
3774 EHPersonality Pers =
3775 CallerFn->hasPersonalityFn()
3776 ? classifyEHPersonality(CallerFn->getPersonalityFn())
3777 : EHPersonality::Unknown;
3778 if (isFuncletEHPersonality(Pers))
3779 Mask = RegInfo->getNoPreservedMask();
3782 // Define a new register mask from the existing mask.
3783 uint32_t *RegMask = nullptr;
3785 // In some calling conventions we need to remove the used physical registers
3786 // from the reg mask.
3787 if (CallConv == CallingConv::X86_RegCall || HasNCSR) {
3788 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
3790 // Allocate a new Reg Mask and copy Mask.
3791 RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
3792 unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
3793 memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
3795     // Make sure all sub-registers of the argument registers are reset in the RegMask.
3797 for (auto const &RegPair : RegsToPass)
3798 for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
3799 SubRegs.isValid(); ++SubRegs)
3800 RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
3802 // Create the RegMask Operand according to our updated mask.
3803 Ops.push_back(DAG.getRegisterMask(RegMask));
3805 // Create the RegMask Operand according to the static mask.
3806 Ops.push_back(DAG.getRegisterMask(Mask));
3809 if (InFlag.getNode())
3810 Ops.push_back(InFlag);
3814 //// If this is the first return lowered for this function, add the regs
3815 //// to the liveout set for the function.
3816 // This isn't right, although it's probably harmless on x86; liveouts
3817 // should be computed from returns not tail calls. Consider a void
3818 // function making a tail call to a function returning int.
3819 MF.getFrameInfo().setHasTailCall();
3820 return DAG.getNode(X86ISD::TC_RETURN, dl, NodeTys, Ops);
3823 Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
3824 InFlag = Chain.getValue(1);
3826 // Create the CALLSEQ_END node.
3827 unsigned NumBytesForCalleeToPop;
3828 if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
3829 DAG.getTarget().Options.GuaranteedTailCallOpt))
3830 NumBytesForCalleeToPop = NumBytes; // Callee pops everything
3831 else if (!Is64Bit && !canGuaranteeTCO(CallConv) &&
3832 !Subtarget.getTargetTriple().isOSMSVCRT() &&
3833 SR == StackStructReturn)
3834 // If this is a call to a struct-return function, the callee
3835 // pops the hidden struct pointer, so we have to push it back.
3836 // This is common for Darwin/X86, Linux & Mingw32 targets.
3837 // For MSVC Win32 targets, the caller pops the hidden struct pointer.
3838 NumBytesForCalleeToPop = 4;
3840 NumBytesForCalleeToPop = 0; // Callee pops nothing.
3842 if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
3843 // No need to reset the stack after the call if the call doesn't return. To
3844     // make the MI verifier happy, we'll pretend the callee does it for us.
3845 NumBytesForCalleeToPop = NumBytes;
3848 // Returns a flag for retval copy to use.
3850 Chain = DAG.getCALLSEQ_END(Chain,
3851 DAG.getIntPtrConstant(NumBytesToPop, dl, true),
3852 DAG.getIntPtrConstant(NumBytesForCalleeToPop, dl,
3855 InFlag = Chain.getValue(1);
3858   // Handle result values, copying them out of physregs into vregs that we return.
3860 return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
3864 //===----------------------------------------------------------------------===//
3865 // Fast Calling Convention (tail call) implementation
3866 //===----------------------------------------------------------------------===//
3868 //  Like stdcall, the callee cleans up the arguments, except that ECX is
3869 //  reserved for storing the address of the tail-called function. Only 2 registers
3870 //  are free for argument passing (inreg). Tail call optimization is performed provided:
3872 // * tailcallopt is enabled
3873 // * caller/callee are fastcc
3874 // On X86_64 architecture with GOT-style position independent code only local
3875 // (within module) calls are supported at the moment.
3876 //  To keep the stack aligned according to the platform ABI, the function
3877 //  GetAlignedArgumentStackSize ensures that the argument delta is always a
3878 //  multiple of the stack alignment. (Dynamic linkers need this - darwin's dyld for example)
3879 //  If a tail-called function (the callee) has more arguments than the caller, the
3880 //  caller needs to make sure that there is room to move the RETADDR to. This is
3881 //  achieved by reserving an area the size of the argument delta right after the
3882 //  original RETADDR, but before the saved frame pointer or the spilled registers,
3883 //  e.g. caller(arg1, arg2) calls callee(arg1, arg2, arg3, arg4):
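//  A rough sketch of the resulting stack (growing downward), as an illustration:
//    arg1
//    arg2
//    RETADDR
//    [ area the size of the argument delta, so RETADDR can be moved here ]
//    (saved frame pointer, if any)
//    spilled callee-saved registers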
3895 /// Make the stack size aligned, e.g. 16n + 12 bytes for a 16-byte alignment requirement.
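/// For illustration, on a 32-bit target (SlotSize = 4, StackAlignment = 16):
/// StackSize = 20 has (20 & 15) = 4 <= 12, so the result is 20 + (12 - 4) = 28 = 16 + 12;
/// StackSize = 30 has (30 & 15) = 14 > 12, so the result is (30 & ~15) + 16 + 12 = 44 = 32 + 12.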
3898 X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
3899 SelectionDAG& DAG) const {
3900 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
3901 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
3902 unsigned StackAlignment = TFI.getStackAlignment();
3903 uint64_t AlignMask = StackAlignment - 1;
3904 int64_t Offset = StackSize;
3905 unsigned SlotSize = RegInfo->getSlotSize();
3906 if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
3907     // The remainder already fits below (StackAlignment - SlotSize); just add the difference.
3908 Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
3910     // Mask out the lower bits and add one full StackAlignment plus (StackAlignment - SlotSize).
3911 Offset = ((~AlignMask) & Offset) + StackAlignment +
3912 (StackAlignment-SlotSize);
3917 /// Return true if the given stack call argument is already available in the
3918 /// same position (relatively) in the caller's incoming argument stack.
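/// For example, when 'int f(int x) { return g(x); }' passes x on the stack, the
/// outgoing argument for g already lives at the matching incoming offset, so a
/// sibcall can reuse that slot instead of storing a fresh copy.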
3920 bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
3921 MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
3922 const X86InstrInfo *TII, const CCValAssign &VA) {
3923 unsigned Bytes = Arg.getValueSizeInBits() / 8;
3926 // Look through nodes that don't alter the bits of the incoming value.
3927 unsigned Op = Arg.getOpcode();
3928 if (Op == ISD::ZERO_EXTEND || Op == ISD::ANY_EXTEND || Op == ISD::BITCAST) {
3929 Arg = Arg.getOperand(0);
3932 if (Op == ISD::TRUNCATE) {
3933 const SDValue &TruncInput = Arg.getOperand(0);
3934 if (TruncInput.getOpcode() == ISD::AssertZext &&
3935 cast<VTSDNode>(TruncInput.getOperand(1))->getVT() ==
3936 Arg.getValueType()) {
3937 Arg = TruncInput.getOperand(0);
3945 if (Arg.getOpcode() == ISD::CopyFromReg) {
3946 unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
3947 if (!TargetRegisterInfo::isVirtualRegister(VR))
3949 MachineInstr *Def = MRI->getVRegDef(VR);
3952 if (!Flags.isByVal()) {
3953 if (!TII->isLoadFromStackSlot(*Def, FI))
3956 unsigned Opcode = Def->getOpcode();
3957 if ((Opcode == X86::LEA32r || Opcode == X86::LEA64r ||
3958 Opcode == X86::LEA64_32r) &&
3959 Def->getOperand(1).isFI()) {
3960 FI = Def->getOperand(1).getIndex();
3961 Bytes = Flags.getByValSize();
3965 } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
3966 if (Flags.isByVal())
3967 // ByVal argument is passed in as a pointer but it's now being
3968 // dereferenced. e.g.
3969 // define @foo(%struct.X* %A) {
3970 // tail call @bar(%struct.X* byval %A)
3973 SDValue Ptr = Ld->getBasePtr();
3974 FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
3977 FI = FINode->getIndex();
3978 } else if (Arg.getOpcode() == ISD::FrameIndex && Flags.isByVal()) {
3979 FrameIndexSDNode *FINode = cast<FrameIndexSDNode>(Arg);
3980 FI = FINode->getIndex();
3981 Bytes = Flags.getByValSize();
3985 assert(FI != INT_MAX);
3986 if (!MFI.isFixedObjectIndex(FI))
3989 if (Offset != MFI.getObjectOffset(FI))
3992 // If this is not byval, check that the argument stack object is immutable.
3993 // inalloca and argument copy elision can create mutable argument stack
3994 // objects. Byval objects can be mutated, but a byval call intends to pass the
3996 if (!Flags.isByVal() && !MFI.isImmutableObjectIndex(FI))
3999 if (VA.getLocVT().getSizeInBits() > Arg.getValueSizeInBits()) {
4000 // If the argument location is wider than the argument type, check that any
4001 // extension flags match.
4002 if (Flags.isZExt() != MFI.isObjectZExt(FI) ||
4003 Flags.isSExt() != MFI.isObjectSExt(FI)) {
4008 return Bytes == MFI.getObjectSize(FI);
4011 /// Check whether the call is eligible for tail call optimization. Targets
4012 /// that want to do tail call optimization should implement this function.
4013 bool X86TargetLowering::IsEligibleForTailCallOptimization(
4014 SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg,
4015 bool isCalleeStructRet, bool isCallerStructRet, Type *RetTy,
4016 const SmallVectorImpl<ISD::OutputArg> &Outs,
4017 const SmallVectorImpl<SDValue> &OutVals,
4018 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4019 if (!mayTailCallThisCC(CalleeCC))
4022 // If -tailcallopt is specified, make fastcc functions tail-callable.
4023 MachineFunction &MF = DAG.getMachineFunction();
4024 const Function *CallerF = MF.getFunction();
4026 // If the function return type is x86_fp80 and the callee return type is not,
4027 // then the FP_EXTEND of the call result is not a nop. It's not safe to
4028 // perform a tailcall optimization here.
4029 if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
4032 CallingConv::ID CallerCC = CallerF->getCallingConv();
4033 bool CCMatch = CallerCC == CalleeCC;
4034 bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
4035 bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
4037 // Win64 functions have extra shadow space for argument homing. Don't do the
4038   // sibcall if the caller and callee have mismatched expectations for this space.
4040 if (IsCalleeWin64 != IsCallerWin64)
4043 if (DAG.getTarget().Options.GuaranteedTailCallOpt) {
4044 if (canGuaranteeTCO(CalleeCC) && CCMatch)
4049 // Look for obvious safe cases to perform tail call optimization that do not
4050 // require ABI changes. This is what gcc calls sibcall.
4052 // Can't do sibcall if stack needs to be dynamically re-aligned. PEI needs to
4053 // emit a special epilogue.
4054 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4055 if (RegInfo->needsStackRealignment(MF))
4058 // Also avoid sibcall optimization if either caller or callee uses struct
4059 // return semantics.
4060 if (isCalleeStructRet || isCallerStructRet)
4063   // Do not sibcall optimize vararg calls unless all arguments are passed via registers.
4065 LLVMContext &C = *DAG.getContext();
4066 if (isVarArg && !Outs.empty()) {
4067 // Optimizing for varargs on Win64 is unlikely to be safe without
4068 // additional testing.
4069 if (IsCalleeWin64 || IsCallerWin64)
4072 SmallVector<CCValAssign, 16> ArgLocs;
4073 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4075 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4076 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i)
4077 if (!ArgLocs[i].isRegLoc())
4081 // If the call result is in ST0 / ST1, it needs to be popped off the x87
4082 // stack. Therefore, if it's not used by the call it is not safe to optimize
4083 // this into a sibcall.
4084 bool Unused = false;
4085 for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
4092 SmallVector<CCValAssign, 16> RVLocs;
4093 CCState CCInfo(CalleeCC, false, MF, RVLocs, C);
4094 CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
4095 for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
4096 CCValAssign &VA = RVLocs[i];
4097 if (VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1)
4102 // Check that the call results are passed in the same way.
4103 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
4104 RetCC_X86, RetCC_X86))
4106 // The callee has to preserve all registers the caller needs to preserve.
4107 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
4108 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4110 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4111 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4115 unsigned StackArgsSize = 0;
4117   // If the callee takes no arguments then go on to check the results of the call.
4119 if (!Outs.empty()) {
4120 // Check if stack adjustment is needed. For now, do not do this if any
4121 // argument is passed on the stack.
4122 SmallVector<CCValAssign, 16> ArgLocs;
4123 CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
4125 // Allocate shadow area for Win64
4127 CCInfo.AllocateStack(32, 8);
4129 CCInfo.AnalyzeCallOperands(Outs, CC_X86);
4130 StackArgsSize = CCInfo.getNextStackOffset();
4132 if (CCInfo.getNextStackOffset()) {
4133 // Check if the arguments are already laid out in the right way as
4134 // the caller's fixed stack objects.
4135 MachineFrameInfo &MFI = MF.getFrameInfo();
4136 const MachineRegisterInfo *MRI = &MF.getRegInfo();
4137 const X86InstrInfo *TII = Subtarget.getInstrInfo();
4138 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4139 CCValAssign &VA = ArgLocs[i];
4140 SDValue Arg = OutVals[i];
4141 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4142 if (VA.getLocInfo() == CCValAssign::Indirect)
4144 if (!VA.isRegLoc()) {
4145 if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
4152 bool PositionIndependent = isPositionIndependent();
4153 // If the tailcall address may be in a register, then make sure it's
4154 // possible to register allocate for it. In 32-bit, the call address can
4155 // only target EAX, EDX, or ECX since the tail call must be scheduled after
4156 // callee-saved registers are restored. These happen to be the same
4157 // registers used to pass 'inreg' arguments so watch out for those.
4158 if (!Subtarget.is64Bit() && ((!isa<GlobalAddressSDNode>(Callee) &&
4159 !isa<ExternalSymbolSDNode>(Callee)) ||
4160 PositionIndependent)) {
4161 unsigned NumInRegs = 0;
4162     // In PIC we need an extra register to formulate the address computation for the callee.
4164 unsigned MaxInRegs = PositionIndependent ? 2 : 3;
4166 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4167 CCValAssign &VA = ArgLocs[i];
4170 unsigned Reg = VA.getLocReg();
4173 case X86::EAX: case X86::EDX: case X86::ECX:
4174 if (++NumInRegs == MaxInRegs)
4181 const MachineRegisterInfo &MRI = MF.getRegInfo();
4182 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
4186 bool CalleeWillPop =
4187 X86::isCalleePop(CalleeCC, Subtarget.is64Bit(), isVarArg,
4188 MF.getTarget().Options.GuaranteedTailCallOpt);
4190 if (unsigned BytesToPop =
4191 MF.getInfo<X86MachineFunctionInfo>()->getBytesToPopOnReturn()) {
4192 // If we have bytes to pop, the callee must pop them.
4193 bool CalleePopMatches = CalleeWillPop && BytesToPop == StackArgsSize;
4194 if (!CalleePopMatches)
4196 } else if (CalleeWillPop && StackArgsSize > 0) {
4197 // If we don't have bytes to pop, make sure the callee doesn't pop any.
4205 X86TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
4206 const TargetLibraryInfo *libInfo) const {
4207 return X86::createFastISel(funcInfo, libInfo);
4210 //===----------------------------------------------------------------------===//
4211 // Other Lowering Hooks
4212 //===----------------------------------------------------------------------===//
4214 static bool MayFoldLoad(SDValue Op) {
4215 return Op.hasOneUse() && ISD::isNormalLoad(Op.getNode());
4218 static bool MayFoldIntoStore(SDValue Op) {
4219 return Op.hasOneUse() && ISD::isNormalStore(*Op.getNode()->use_begin());
4222 static bool MayFoldIntoZeroExtend(SDValue Op) {
4223 if (Op.hasOneUse()) {
4224 unsigned Opcode = Op.getNode()->use_begin()->getOpcode();
4225 return (ISD::ZERO_EXTEND == Opcode);
4230 static bool isTargetShuffle(unsigned Opcode) {
4232 default: return false;
4233 case X86ISD::BLENDI:
4234 case X86ISD::PSHUFB:
4235 case X86ISD::PSHUFD:
4236 case X86ISD::PSHUFHW:
4237 case X86ISD::PSHUFLW:
4239 case X86ISD::INSERTPS:
4240 case X86ISD::EXTRQI:
4241 case X86ISD::INSERTQI:
4242 case X86ISD::PALIGNR:
4243 case X86ISD::VSHLDQ:
4244 case X86ISD::VSRLDQ:
4245 case X86ISD::MOVLHPS:
4246 case X86ISD::MOVLHPD:
4247 case X86ISD::MOVHLPS:
4248 case X86ISD::MOVLPS:
4249 case X86ISD::MOVLPD:
4250 case X86ISD::MOVSHDUP:
4251 case X86ISD::MOVSLDUP:
4252 case X86ISD::MOVDDUP:
4255 case X86ISD::UNPCKL:
4256 case X86ISD::UNPCKH:
4257 case X86ISD::VBROADCAST:
4258 case X86ISD::VPERMILPI:
4259 case X86ISD::VPERMILPV:
4260 case X86ISD::VPERM2X128:
4261 case X86ISD::VPERMIL2:
4262 case X86ISD::VPERMI:
4263 case X86ISD::VPPERM:
4264 case X86ISD::VPERMV:
4265 case X86ISD::VPERMV3:
4266 case X86ISD::VPERMIV3:
4267 case X86ISD::VZEXT_MOVL:
4272 static bool isTargetShuffleVariableMask(unsigned Opcode) {
4274 default: return false;
4276 case X86ISD::PSHUFB:
4277 case X86ISD::VPERMILPV:
4278 case X86ISD::VPERMIL2:
4279 case X86ISD::VPPERM:
4280 case X86ISD::VPERMV:
4281 case X86ISD::VPERMV3:
4282 case X86ISD::VPERMIV3:
4284 // 'Faux' Target Shuffles.
4291 SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
4292 MachineFunction &MF = DAG.getMachineFunction();
4293 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
4294 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
4295 int ReturnAddrIndex = FuncInfo->getRAIndex();
4297 if (ReturnAddrIndex == 0) {
4298 // Set up a frame object for the return address.
4299 unsigned SlotSize = RegInfo->getSlotSize();
4300 ReturnAddrIndex = MF.getFrameInfo().CreateFixedObject(SlotSize,
4303 FuncInfo->setRAIndex(ReturnAddrIndex);
4306 return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy(DAG.getDataLayout()));
4309 bool X86::isOffsetSuitableForCodeModel(int64_t Offset, CodeModel::Model M,
4310 bool hasSymbolicDisplacement) {
4311   // The offset should fit into a 32-bit immediate field.
4312 if (!isInt<32>(Offset))
4315   // If we don't have a symbolic displacement, we don't have any extra restrictions.
4317 if (!hasSymbolicDisplacement)
4320 // FIXME: Some tweaks might be needed for medium code model.
4321 if (M != CodeModel::Small && M != CodeModel::Kernel)
4324   // For the small code model we assume that the latest object is 16MB before the end
4325   // of the 31-bit boundary. We may also accept pretty large negative constants knowing
4326   // that all objects are in the positive half of the address space.
4327 if (M == CodeModel::Small && Offset < 16*1024*1024)
4330   // For the kernel code model we know that all objects reside in the negative half
4331   // of the 32-bit address space. We must not accept negative offsets, since they may
4332   // push an address just out of that range, but we may accept pretty large positive ones.
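  // For example, with a symbolic displacement the kernel model accepts Offset =
  // 0x100000 but rejects Offset = -8, while the small model accepts offsets up to
  // just under 16MB.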
4333 if (M == CodeModel::Kernel && Offset >= 0)
4339 /// Determines whether the callee is required to pop its own arguments.
4340 /// Callee pop is necessary to support tail calls.
4341 bool X86::isCalleePop(CallingConv::ID CallingConv,
4342 bool is64Bit, bool IsVarArg, bool GuaranteeTCO) {
4343 // If GuaranteeTCO is true, we force some calls to be callee pop so that we
4344 // can guarantee TCO.
4345 if (!IsVarArg && shouldGuaranteeTCO(CallingConv, GuaranteeTCO))
4348 switch (CallingConv) {
4351 case CallingConv::X86_StdCall:
4352 case CallingConv::X86_FastCall:
4353 case CallingConv::X86_ThisCall:
4354 case CallingConv::X86_VectorCall:
4359 /// \brief Return true if the condition is an unsigned comparison operation.
4360 static bool isX86CCUnsigned(unsigned X86CC) {
4363 llvm_unreachable("Invalid integer condition!");
4379 static X86::CondCode TranslateIntegerX86CC(ISD::CondCode SetCCOpcode) {
4380 switch (SetCCOpcode) {
4381 default: llvm_unreachable("Invalid integer condition!");
4382 case ISD::SETEQ: return X86::COND_E;
4383 case ISD::SETGT: return X86::COND_G;
4384 case ISD::SETGE: return X86::COND_GE;
4385 case ISD::SETLT: return X86::COND_L;
4386 case ISD::SETLE: return X86::COND_LE;
4387 case ISD::SETNE: return X86::COND_NE;
4388 case ISD::SETULT: return X86::COND_B;
4389 case ISD::SETUGT: return X86::COND_A;
4390 case ISD::SETULE: return X86::COND_BE;
4391 case ISD::SETUGE: return X86::COND_AE;
4395 /// Do a one-to-one translation of an ISD::CondCode to the X86-specific
4396 /// condition code, returning the condition code and the LHS/RHS of the
4397 /// comparison to make.
4398 static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
4399 bool isFP, SDValue &LHS, SDValue &RHS,
4400 SelectionDAG &DAG) {
4402 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
4403 if (SetCCOpcode == ISD::SETGT && RHSC->isAllOnesValue()) {
4404 // X > -1 -> X == 0, jump !sign.
4405 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4406 return X86::COND_NS;
4408 if (SetCCOpcode == ISD::SETLT && RHSC->isNullValue()) {
4409 // X < 0 -> X == 0, jump on sign.
4412 if (SetCCOpcode == ISD::SETLT && RHSC->getZExtValue() == 1) {
4413 // X < 1 -> X <= 0
4414 RHS = DAG.getConstant(0, DL, RHS.getValueType());
4415 return X86::COND_LE;
4419 return TranslateIntegerX86CC(SetCCOpcode);
4422 // First determine if it is required or is profitable to flip the operands.
4424 // If LHS is a foldable load, but RHS is not, flip the condition.
4425 if (ISD::isNON_EXTLoad(LHS.getNode()) &&
4426 !ISD::isNON_EXTLoad(RHS.getNode())) {
4427 SetCCOpcode = getSetCCSwappedOperands(SetCCOpcode);
4428 std::swap(LHS, RHS);
4431 switch (SetCCOpcode) {
4437 std::swap(LHS, RHS);
4441 // On a floating point condition, the flags are set as follows:
4442 //  ZF | PF | CF | op
4443 // 0 | 0 | 0 | X > Y
4444 // 0 | 0 | 1 | X < Y
4445 // 1 | 0 | 0 | X == Y
4446 // 1 | 1 | 1 | unordered
4447 switch (SetCCOpcode) {
4448 default: llvm_unreachable("Condcode should be pre-legalized away");
4450 case ISD::SETEQ: return X86::COND_E;
4451 case ISD::SETOLT: // flipped
4453 case ISD::SETGT: return X86::COND_A;
4454 case ISD::SETOLE: // flipped
4456 case ISD::SETGE: return X86::COND_AE;
4457 case ISD::SETUGT: // flipped
4459 case ISD::SETLT: return X86::COND_B;
4460 case ISD::SETUGE: // flipped
4462 case ISD::SETLE: return X86::COND_BE;
4464 case ISD::SETNE: return X86::COND_NE;
4465 case ISD::SETUO: return X86::COND_P;
4466 case ISD::SETO: return X86::COND_NP;
4468 case ISD::SETUNE: return X86::COND_INVALID;
4472 /// Is there a floating point cmov for the specific X86 condition code?
4473 /// The current x86 ISA includes the following FP cmov instructions:
4474 /// fcmovb, fcmovbe, fcmove, fcmovu, fcmovae, fcmova, fcmovne, fcmovnu.
4475 static bool hasFPCMov(unsigned X86CC) {
4492 bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
4494 unsigned Intrinsic) const {
4496 const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
4500 Info.opc = ISD::INTRINSIC_W_CHAIN;
4501 Info.readMem = false;
4502 Info.writeMem = false;
4506 switch (IntrData->Type) {
4507 case EXPAND_FROM_MEM: {
4508 Info.ptrVal = I.getArgOperand(0);
4509 Info.memVT = MVT::getVT(I.getType());
4511 Info.readMem = true;
4514 case COMPRESS_TO_MEM: {
4515 Info.ptrVal = I.getArgOperand(0);
4516 Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
4518 Info.writeMem = true;
4521 case TRUNCATE_TO_MEM_VI8:
4522 case TRUNCATE_TO_MEM_VI16:
4523 case TRUNCATE_TO_MEM_VI32: {
4524 Info.ptrVal = I.getArgOperand(0);
4525 MVT VT = MVT::getVT(I.getArgOperand(1)->getType());
4526 MVT ScalarVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
4527 if (IntrData->Type == TRUNCATE_TO_MEM_VI8)
4529 else if (IntrData->Type == TRUNCATE_TO_MEM_VI16)
4530 ScalarVT = MVT::i16;
4531 else if (IntrData->Type == TRUNCATE_TO_MEM_VI32)
4532 ScalarVT = MVT::i32;
4534 Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
4536 Info.writeMem = true;
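// Worked example (illustrative): for a truncating store intrinsic classified
// as TRUNCATE_TO_MEM_VI8 whose value operand is a v8i64, ScalarVT becomes i8
// and the reported memVT is v8i8, so only 8 bytes of memory are considered
// written even though the source register is 512 bits wide.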
4546 /// Returns true if the target can instruction select the
4547 /// specified FP immediate natively. If false, the legalizer will
4548 /// materialize the FP immediate as a load from a constant pool.
4549 bool X86TargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
4550 for (unsigned i = 0, e = LegalFPImmediates.size(); i != e; ++i) {
4551 if (Imm.bitwiseIsEqual(LegalFPImmediates[i]))
4557 bool X86TargetLowering::shouldReduceLoadWidth(SDNode *Load,
4558 ISD::LoadExtType ExtTy,
4560 // "ELF Handling for Thread-Local Storage" specifies that R_X86_64_GOTTPOFF
4561 // relocation target a movq or addq instruction: don't let the load shrink.
4562 SDValue BasePtr = cast<LoadSDNode>(Load)->getBasePtr();
4563 if (BasePtr.getOpcode() == X86ISD::WrapperRIP)
4564 if (const auto *GA = dyn_cast<GlobalAddressSDNode>(BasePtr.getOperand(0)))
4565 return GA->getTargetFlags() != X86II::MO_GOTTPOFF;
4569 /// \brief Returns true if it is beneficial to convert a load of a constant
4570 /// to just the constant itself.
4571 bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
4573 assert(Ty->isIntegerTy());
4575 unsigned BitSize = Ty->getPrimitiveSizeInBits();
4576 if (BitSize == 0 || BitSize > 64)
4581 bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
4582 unsigned Index) const {
4583 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
4586 return (Index == 0 || Index == ResVT.getVectorNumElements());
4589 bool X86TargetLowering::isCheapToSpeculateCttz() const {
4590 // Speculate cttz only if we can directly use TZCNT.
4591 return Subtarget.hasBMI();
4594 bool X86TargetLowering::isCheapToSpeculateCtlz() const {
4595 // Speculate ctlz only if we can directly use LZCNT.
4596 return Subtarget.hasLZCNT();
4599 bool X86TargetLowering::isCtlzFast() const {
4600 return Subtarget.hasFastLZCNT();
4603 bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
4604 const Instruction &AndI) const {
4608 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
4609 if (!Subtarget.hasBMI())
4612 // There are only 32-bit and 64-bit forms for 'andn'.
4613 EVT VT = Y.getValueType();
4614 if (VT != MVT::i32 && VT != MVT::i64)
4620 MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
4621 MVT VT = MVT::getIntegerVT(NumBits);
4622 if (isTypeLegal(VT))
4625 // PMOVMSKB can handle this.
4626 if (NumBits == 128 && isTypeLegal(MVT::v16i8))
4629 // VPMOVMSKB can handle this.
4630 if (NumBits == 256 && isTypeLegal(MVT::v32i8))
4633 // TODO: Allow 64-bit type for 32-bit target.
4634 // TODO: 512-bit types should be allowed, but make sure that those
4635 // cases are handled in combineVectorSizedSetCCEquality().
4637 return MVT::INVALID_SIMPLE_VALUE_TYPE;
4640 /// Val is the undef sentinel value or equal to the specified value.
4641 static bool isUndefOrEqual(int Val, int CmpVal) {
4642 return ((Val == SM_SentinelUndef) || (Val == CmpVal));
4645 /// Val is either the undef or zero sentinel value.
4646 static bool isUndefOrZero(int Val) {
4647 return ((Val == SM_SentinelUndef) || (Val == SM_SentinelZero));
4650 /// Return true if every element in Mask, beginning
4651 /// from position Pos and ending in Pos+Size is the undef sentinel value.
4652 static bool isUndefInRange(ArrayRef<int> Mask, unsigned Pos, unsigned Size) {
4653 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4654 if (Mask[i] != SM_SentinelUndef)
4659 /// Return true if Val is undef or if its value falls within the
4660 /// specified range [Low, Hi).
4661 static bool isUndefOrInRange(int Val, int Low, int Hi) {
4662 return (Val == SM_SentinelUndef) || (Val >= Low && Val < Hi);
4665 /// Return true if every element in Mask is undef or if its value
4666 /// falls within the specified range [Low, Hi).
4667 static bool isUndefOrInRange(ArrayRef<int> Mask,
4670 if (!isUndefOrInRange(M, Low, Hi))
4675 /// Return true if Val is undef, zero or if its value falls within the
4676 /// specified range [Low, Hi).
4677 static bool isUndefOrZeroOrInRange(int Val, int Low, int Hi) {
4678 return isUndefOrZero(Val) || (Val >= Low && Val < Hi);
4681 /// Return true if every element in Mask is undef, zero or if its value
4682 /// falls within the specified range [Low, Hi).
4683 static bool isUndefOrZeroOrInRange(ArrayRef<int> Mask, int Low, int Hi) {
4685 if (!isUndefOrZeroOrInRange(M, Low, Hi))
4690 /// Return true if every element in Mask, beginning
4691 /// from position Pos and ending in Pos+Size, falls within the specified
4692 /// sequential range [Low, Low+Size), or is undef.
4693 static bool isSequentialOrUndefInRange(ArrayRef<int> Mask,
4694 unsigned Pos, unsigned Size, int Low) {
4695 for (unsigned i = Pos, e = Pos+Size; i != e; ++i, ++Low)
4696 if (!isUndefOrEqual(Mask[i], Low))
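// For example (illustrative): Mask = <4, 5, -1, 7> with Pos = 0, Size = 4 and
// Low = 4 returns true, since every defined element matches the sequence
// 4, 5, 6, 7; Mask = <4, 6, -1, 7> returns false because element 1 is 6 rather
// than 5.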
4701 /// Return true if every element in Mask, beginning
4702 /// from position Pos and ending in Pos+Size, falls within the specified
4703 /// sequential range [Low, Low+Size), or is undef or zero.
4704 static bool isSequentialOrUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4705 unsigned Size, int Low) {
4706 for (unsigned i = Pos, e = Pos + Size; i != e; ++i, ++Low)
4707 if (!isUndefOrZero(Mask[i]) && Mask[i] != Low)
4712 /// Return true if every element in Mask, beginning
4713 /// from position Pos and ending in Pos+Size is undef or is zero.
4714 static bool isUndefOrZeroInRange(ArrayRef<int> Mask, unsigned Pos,
4716 for (unsigned i = Pos, e = Pos + Size; i != e; ++i)
4717 if (!isUndefOrZero(Mask[i]))
4722 /// \brief Helper function to test whether a shuffle mask could be
4723 /// simplified by widening the elements being shuffled.
4725 /// Appends the mask for wider elements in WidenedMask if valid. Otherwise
4726 /// leaves it in an unspecified state.
4728 /// NOTE: This must handle normal vector shuffle masks and *target* vector
4729 /// shuffle masks. The latter have the special property of a '-2' representing
4730 /// a zero-ed lane of a vector.
4731 static bool canWidenShuffleElements(ArrayRef<int> Mask,
4732 SmallVectorImpl<int> &WidenedMask) {
4733 WidenedMask.assign(Mask.size() / 2, 0);
4734 for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
4736 int M1 = Mask[i + 1];
4738 // If both elements are undef, it's trivial.
4739 if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
4740 WidenedMask[i / 2] = SM_SentinelUndef;
4744 // Check for an undef mask and a mask value properly aligned to fit with
4745 // a pair of values. If we find such a case, use the non-undef mask's value.
4746 if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
4747 WidenedMask[i / 2] = M1 / 2;
4750 if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
4751 WidenedMask[i / 2] = M0 / 2;
4755 // When zeroing, we need to spread the zeroing across both lanes to widen.
4756 if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
4757 if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
4758 (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
4759 WidenedMask[i / 2] = SM_SentinelZero;
4765 // Finally check if the two mask values are adjacent and aligned with
4767 if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
4768 WidenedMask[i / 2] = M0 / 2;
4772 // Otherwise we can't safely widen the elements used in this shuffle.
4775 assert(WidenedMask.size() == Mask.size() / 2 &&
4776 "Incorrect size of mask after widening the elements!");
4781 /// Helper function to scale a shuffle or target shuffle mask, replacing each
4782 /// mask index with the scaled sequential indices for an equivalent narrowed
4783 /// mask. This is the reverse process to canWidenShuffleElements, but can always
4784 /// succeed.
4785 static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
4786 SmallVectorImpl<int> &ScaledMask) {
4787 assert(0 < Scale && "Unexpected scaling factor");
4788 int NumElts = Mask.size();
4789 ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
4791 for (int i = 0; i != NumElts; ++i) {
4794 // Repeat sentinel values in every mask element.
4796 for (int s = 0; s != Scale; ++s)
4797 ScaledMask[(Scale * i) + s] = M;
4801 // Scale mask element and increment across each mask element.
4802 for (int s = 0; s != Scale; ++s)
4803 ScaledMask[(Scale * i) + s] = (Scale * M) + s;
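// Worked example (illustrative): scaling the mask <1, -1, 3> by Scale = 2
// produces <2, 3, -1, -1, 6, 7>; each defined index M becomes the pair
// (2*M, 2*M+1) and sentinel values are simply repeated.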
4807 /// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
4808 /// extract that is suitable for instructions that extract 128- or 256-bit vectors.
4809 static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
4810 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4811 if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
4814 // The index should be aligned on a vecWidth-bit boundary.
4815 uint64_t Index = N->getConstantOperandVal(1);
4816 MVT VT = N->getSimpleValueType(0);
4817 unsigned ElSize = VT.getScalarSizeInBits();
4818 return (Index * ElSize) % vecWidth == 0;
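// For example (illustrative): an EXTRACT_SUBVECTOR producing a v4f32 with
// constant index 4 satisfies isVEXTRACTIndex(N, 128) since 4 * 32 == 128 is a
// multiple of 128, whereas index 2 (2 * 32 == 64) does not.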
4821 /// Return true if the specified INSERT_SUBVECTOR
4822 /// operand specifies a subvector insert that is suitable for inserting
4823 /// 128- or 256-bit subvectors.
4824 static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
4825 assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
4826 if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
4829 // The index should be aligned on a vecWidth-bit boundary.
4830 uint64_t Index = N->getConstantOperandVal(2);
4831 MVT VT = N->getSimpleValueType(0);
4832 unsigned ElSize = VT.getScalarSizeInBits();
4833 return (Index * ElSize) % vecWidth == 0;
4836 bool X86::isVINSERT128Index(SDNode *N) {
4837 return isVINSERTIndex(N, 128);
4840 bool X86::isVINSERT256Index(SDNode *N) {
4841 return isVINSERTIndex(N, 256);
4844 bool X86::isVEXTRACT128Index(SDNode *N) {
4845 return isVEXTRACTIndex(N, 128);
4848 bool X86::isVEXTRACT256Index(SDNode *N) {
4849 return isVEXTRACTIndex(N, 256);
4852 static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
4853 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4854 assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
4855 "Illegal extract subvector for VEXTRACT");
4857 uint64_t Index = N->getConstantOperandVal(1);
4858 MVT VecVT = N->getOperand(0).getSimpleValueType();
4859 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4860 return Index / NumElemsPerChunk;
4863 static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
4864 assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
4865 assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
4866 "Illegal insert subvector for VINSERT");
4868 uint64_t Index = N->getConstantOperandVal(2);
4869 MVT VecVT = N->getSimpleValueType(0);
4870 unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
4871 return Index / NumElemsPerChunk;
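// Worked example (illustrative): extracting or inserting the upper v4f32 half
// of a v8f32 (subvector index 4, 128-bit chunks) gives
// NumElemsPerChunk = 128 / 32 = 4, so the VEXTRACTF128/VINSERTF128 immediate
// is 4 / 4 = 1; the lower half (index 0) maps to immediate 0.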
4874 /// Return the appropriate immediate to extract the specified
4875 /// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VEXTRACTI128 instructions.
4876 unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
4877 return getExtractVEXTRACTImmediate(N, 128);
4880 /// Return the appropriate immediate to extract the specified
4881 /// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VEXTRACTI64x4 instructions.
4882 unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
4883 return getExtractVEXTRACTImmediate(N, 256);
4886 /// Return the appropriate immediate to insert at the specified
4887 /// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
4888 unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
4889 return getInsertVINSERTImmediate(N, 128);
4892 /// Return the appropriate immediate to insert at the specified
4893 /// INSERT_SUBVECTOR index with VINSERTF64x4 and VINSERTI64x4 instructions.
4894 unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
4895 return getInsertVINSERTImmediate(N, 256);
4898 /// Returns true if Elt is a constant zero or a floating point constant +0.0.
4899 bool X86::isZeroNode(SDValue Elt) {
4900 return isNullConstant(Elt) || isNullFPConstant(Elt);
4903 // Build a vector of constants.
4904 // Use an UNDEF node if MaskElt == -1.
4905 // Split 64-bit constants in 32-bit mode.
4906 static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
4907 const SDLoc &dl, bool IsMask = false) {
4909 SmallVector<SDValue, 32> Ops;
4912 MVT ConstVecVT = VT;
4913 unsigned NumElts = VT.getVectorNumElements();
4914 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4915 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4916 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4920 MVT EltVT = ConstVecVT.getVectorElementType();
4921 for (unsigned i = 0; i < NumElts; ++i) {
4922 bool IsUndef = Values[i] < 0 && IsMask;
4923 SDValue OpNode = IsUndef ? DAG.getUNDEF(EltVT) :
4924 DAG.getConstant(Values[i], dl, EltVT);
4925 Ops.push_back(OpNode);
4927 Ops.push_back(IsUndef ? DAG.getUNDEF(EltVT) :
4928 DAG.getConstant(0, dl, EltVT));
4930 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4932 ConstsNode = DAG.getBitcast(VT, ConstsNode);
4936 static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
4937 MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
4938 assert(Bits.size() == Undefs.getBitWidth() &&
4939 "Unequal constant and undef arrays");
4940 SmallVector<SDValue, 32> Ops;
4943 MVT ConstVecVT = VT;
4944 unsigned NumElts = VT.getVectorNumElements();
4945 bool In64BitMode = DAG.getTargetLoweringInfo().isTypeLegal(MVT::i64);
4946 if (!In64BitMode && VT.getVectorElementType() == MVT::i64) {
4947 ConstVecVT = MVT::getVectorVT(MVT::i32, NumElts * 2);
4951 MVT EltVT = ConstVecVT.getVectorElementType();
4952 for (unsigned i = 0, e = Bits.size(); i != e; ++i) {
4954 Ops.append(Split ? 2 : 1, DAG.getUNDEF(EltVT));
4957 const APInt &V = Bits[i];
4958 assert(V.getBitWidth() == VT.getScalarSizeInBits() && "Unexpected sizes");
4960 Ops.push_back(DAG.getConstant(V.trunc(32), dl, EltVT));
4961 Ops.push_back(DAG.getConstant(V.lshr(32).trunc(32), dl, EltVT));
4962 } else if (EltVT == MVT::f32) {
4963 APFloat FV(APFloat::IEEEsingle(), V);
4964 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4965 } else if (EltVT == MVT::f64) {
4966 APFloat FV(APFloat::IEEEdouble(), V);
4967 Ops.push_back(DAG.getConstantFP(FV, dl, EltVT));
4969 Ops.push_back(DAG.getConstant(V, dl, EltVT));
4973 SDValue ConstsNode = DAG.getBuildVector(ConstVecVT, dl, Ops);
4974 return DAG.getBitcast(VT, ConstsNode);
4977 /// Returns a vector of specified type with all zero elements.
4978 static SDValue getZeroVector(MVT VT, const X86Subtarget &Subtarget,
4979 SelectionDAG &DAG, const SDLoc &dl) {
4980 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector() ||
4981 VT.getVectorElementType() == MVT::i1) &&
4982 "Unexpected vector type");
4984 // Try to build SSE/AVX zero vectors as <N x i32> bitcasted to their dest
4985 // type. This ensures they get CSE'd. But if the integer type is not
4986 // available, use a floating-point +0.0 instead.
4988 if (!Subtarget.hasSSE2() && VT.is128BitVector()) {
4989 Vec = DAG.getConstantFP(+0.0, dl, MVT::v4f32);
4990 } else if (VT.getVectorElementType() == MVT::i1) {
4991 assert((Subtarget.hasBWI() || VT.getVectorNumElements() <= 16) &&
4992 "Unexpected vector type");
4993 assert((Subtarget.hasVLX() || VT.getVectorNumElements() >= 8) &&
4994 "Unexpected vector type");
4995 Vec = DAG.getConstant(0, dl, VT);
4997 unsigned Num32BitElts = VT.getSizeInBits() / 32;
4998 Vec = DAG.getConstant(0, dl, MVT::getVectorVT(MVT::i32, Num32BitElts));
5000 return DAG.getBitcast(VT, Vec);
5003 static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
5004 const SDLoc &dl, unsigned vectorWidth) {
5005 EVT VT = Vec.getValueType();
5006 EVT ElVT = VT.getVectorElementType();
5007 unsigned Factor = VT.getSizeInBits()/vectorWidth;
5008 EVT ResultVT = EVT::getVectorVT(*DAG.getContext(), ElVT,
5009 VT.getVectorNumElements()/Factor);
5011 // Extract the relevant vectorWidth bits. Generate an EXTRACT_SUBVECTOR
5012 unsigned ElemsPerChunk = vectorWidth / ElVT.getSizeInBits();
5013 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5015 // This is the index of the first element of the vectorWidth-bit chunk
5016 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5017 IdxVal &= ~(ElemsPerChunk - 1);
5019 // If the input is a buildvector just emit a smaller one.
5020 if (Vec.getOpcode() == ISD::BUILD_VECTOR)
5021 return DAG.getBuildVector(
5022 ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
5024 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5025 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
5028 /// Generate a DAG to grab 128-bits from a vector > 128 bits. This
5029 /// sets things up to match to an AVX VEXTRACTF128 / VEXTRACTI128
5030 /// or AVX-512 VEXTRACTF32x4 / VEXTRACTI32x4
5031 /// instructions or a simple subregister reference. Idx is an index in the
5032 /// 128 bits we want. It need not be aligned to a 128-bit boundary. That makes
5033 /// lowering EXTRACT_VECTOR_ELT operations easier.
5034 static SDValue extract128BitVector(SDValue Vec, unsigned IdxVal,
5035 SelectionDAG &DAG, const SDLoc &dl) {
5036 assert((Vec.getValueType().is256BitVector() ||
5037 Vec.getValueType().is512BitVector()) && "Unexpected vector size!");
5038 return extractSubVector(Vec, IdxVal, DAG, dl, 128);
5041 /// Generate a DAG to grab 256-bits from a 512-bit vector.
5042 static SDValue extract256BitVector(SDValue Vec, unsigned IdxVal,
5043 SelectionDAG &DAG, const SDLoc &dl) {
5044 assert(Vec.getValueType().is512BitVector() && "Unexpected vector size!");
5045 return extractSubVector(Vec, IdxVal, DAG, dl, 256);
5048 static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5049 SelectionDAG &DAG, const SDLoc &dl,
5050 unsigned vectorWidth) {
5051 assert((vectorWidth == 128 || vectorWidth == 256) &&
5052 "Unsupported vector width");
5053 // Inserting an UNDEF subvector leaves Result unchanged.
5056 EVT VT = Vec.getValueType();
5057 EVT ElVT = VT.getVectorElementType();
5058 EVT ResultVT = Result.getValueType();
5060 // Insert the relevant vectorWidth bits.
5061 unsigned ElemsPerChunk = vectorWidth/ElVT.getSizeInBits();
5062 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
5064 // This is the index of the first element of the vectorWidth-bit chunk
5065 // we want. Since ElemsPerChunk is a power of 2, we just need to clear bits.
5066 IdxVal &= ~(ElemsPerChunk - 1);
5068 SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
5069 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Result, Vec, VecIdx);
5072 /// Generate a DAG to put 128-bits into a vector > 128 bits. This
5073 /// sets things up to match to an AVX VINSERTF128/VINSERTI128 or
5074 /// AVX-512 VINSERTF32x4/VINSERTI32x4 instructions or a
5075 /// simple superregister reference. Idx is an index in the 128 bits
5076 /// we want. It need not be aligned to a 128-bit boundary. That makes
5077 /// lowering INSERT_VECTOR_ELT operations easier.
5078 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5079 SelectionDAG &DAG, const SDLoc &dl) {
5080 assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
5081 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
5084 static SDValue insert256BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
5085 SelectionDAG &DAG, const SDLoc &dl) {
5086 assert(Vec.getValueType().is256BitVector() && "Unexpected vector size!");
5087 return insertSubVector(Result, Vec, IdxVal, DAG, dl, 256);
5090 // Return true if the instruction zeroes the unused upper part of the
5091 // destination and accepts a mask.
5092 static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
5096 case X86ISD::PCMPEQM:
5097 case X86ISD::PCMPGTM:
5104 /// Insert an i1-subvector into an i1-vector.
5105 static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
5106 const X86Subtarget &Subtarget) {
5109 SDValue Vec = Op.getOperand(0);
5110 SDValue SubVec = Op.getOperand(1);
5111 SDValue Idx = Op.getOperand(2);
5113 if (!isa<ConstantSDNode>(Idx))
5116 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
5117 if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
5120 MVT OpVT = Op.getSimpleValueType();
5121 MVT SubVecVT = SubVec.getSimpleValueType();
5122 unsigned NumElems = OpVT.getVectorNumElements();
5123 unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
5125 assert(IdxVal + SubVecNumElems <= NumElems &&
5126 IdxVal % SubVecVT.getSizeInBits() == 0 &&
5127 "Unexpected index value in INSERT_SUBVECTOR");
5129 // There are 3 possible cases:
5130 // 1. Subvector should be inserted in the lower part (IdxVal == 0)
5131 // 2. Subvector should be inserted in the upper part
5132 // (IdxVal + SubVecNumElems == NumElems)
5133 // 3. Subvector should be inserted in the middle (for example v2i1
5134 // to v16i1, index 2)
5136 // If this node widens - by concatenating zeroes - the type of the result
5137 // of a node with an instruction that zeroes all upper (irrelevant) bits of
5138 // the output register, mark this node as legal to enable replacing them with
5139 // the v8i1 version of the previous instruction during instruction selection.
5140 // For example, the VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
5141 // while zeroing all the upper remaining 60 bits of the register. If the
5142 // result of such an instruction is inserted into an all-zero vector, then we can
5143 // safely remove the insert_vector (in instruction selection) as the cmp
5144 // instruction already zeroed the rest of the register.
5145 if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
5146 (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
5147 (SubVec.getOpcode() == ISD::AND &&
5148 (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
5149 isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
5152 // Extend to a natively supported kshift vector type.
5153 MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
5154 MVT WideOpVT = OpVT;
5155 if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
5158 SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
5159 SDValue Undef = DAG.getUNDEF(WideOpVT);
5160 SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5161 Undef, SubVec, ZeroIdx);
5163 // Extract the sub-vector if required.
5164 auto ExtractSubVec = [&](SDValue V) {
5165 return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
5169 if (Vec.isUndef()) {
5171 SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
5172 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5175 return ExtractSubVec(WideSubVec);
5178 if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
5179 NumElems = WideOpVT.getVectorNumElements();
5180 unsigned ShiftLeft = NumElems - SubVecNumElems;
5181 unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
5182 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5183 DAG.getConstant(ShiftLeft, dl, MVT::i8));
5184 Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
5185 DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
5186 return ExtractSubVec(Vec);
5190 // Zero lower bits of the Vec
5191 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5192 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5193 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5194 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5195 // Merge them together, SubVec should be zero extended.
5196 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
5197 getZeroVector(WideOpVT, Subtarget, DAG, dl),
5199 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5200 return ExtractSubVec(Vec);
5203 // Simple case: the subvector is inserted into the upper part.
5204 if (IdxVal + SubVecNumElems == NumElems) {
5205 // Zero upper bits of the Vec
5206 WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
5207 DAG.getConstant(IdxVal, dl, MVT::i8));
5208 SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
5209 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
5210 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
5211 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
5212 Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
5213 return ExtractSubVec(Vec);
5215 // Subvector should be inserted in the middle - use shuffle
5216 WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
5218 SmallVector<int, 64> Mask;
5219 for (unsigned i = 0; i < NumElems; ++i)
5220 Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
5222 return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
5225 /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
5226 /// instructions. This is used because creating CONCAT_VECTORS nodes of
5227 /// BUILD_VECTORS returns a larger BUILD_VECTOR while we're trying to lower
5228 /// large BUILD_VECTORS.
5229 static SDValue concat128BitVectors(SDValue V1, SDValue V2, EVT VT,
5230 unsigned NumElems, SelectionDAG &DAG,
5232 SDValue V = insert128BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5233 return insert128BitVector(V, V2, NumElems / 2, DAG, dl);
5236 static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
5237 unsigned NumElems, SelectionDAG &DAG,
5239 SDValue V = insert256BitVector(DAG.getUNDEF(VT), V1, 0, DAG, dl);
5240 return insert256BitVector(V, V2, NumElems / 2, DAG, dl);
5243 /// Returns a vector of specified type with all bits set.
5244 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
5245 /// Then bitcast to their original type, ensuring they get CSE'd.
5246 static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
5247 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
5248 "Expected a 128/256/512-bit vector type");
5250 APInt Ones = APInt::getAllOnesValue(32);
5251 unsigned NumElts = VT.getSizeInBits() / 32;
5252 SDValue Vec = DAG.getConstant(Ones, dl, MVT::getVectorVT(MVT::i32, NumElts));
5253 return DAG.getBitcast(VT, Vec);
5256 static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
5257 SelectionDAG &DAG) {
5258 EVT InVT = In.getValueType();
5259 assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
5261 if (VT.is128BitVector() && InVT.is128BitVector())
5262 return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
5263 : DAG.getZeroExtendVectorInReg(In, DL, VT);
5265 // For 256-bit vectors, we only need the lower (128-bit) input half.
5266 // For 512-bit vectors, we only need the lower input half or quarter.
5267 if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
5268 int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
5269 In = extractSubVector(In, 0, DAG, DL,
5270 std::max(128, (int)VT.getSizeInBits() / Scale));
5273 return DAG.getNode(Opc, DL, VT, In);
5276 /// Generate unpacklo/unpackhi shuffle mask.
5277 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
5279 assert(Mask.empty() && "Expected an empty shuffle mask vector");
5280 int NumElts = VT.getVectorNumElements();
5281 int NumEltsInLane = 128 / VT.getScalarSizeInBits();
5283 for (int i = 0; i < NumElts; ++i) {
5284 unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
5285 int Pos = (i % NumEltsInLane) / 2 + LaneStart;
5286 Pos += (Unary ? 0 : NumElts * (i % 2));
5287 Pos += (Lo ? 0 : NumEltsInLane / 2);
5288 Mask.push_back(Pos);
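// Worked example (illustrative): for MVT::v8i16, Lo && !Unary produces the
// unpacklo mask <0, 8, 1, 9, 2, 10, 3, 11> and !Lo produces the unpackhi mask
// <4, 12, 5, 13, 6, 14, 7, 15>; with Unary == true the second-operand indices
// (8..15) are replaced by the matching first-operand indices.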
5292 /// Returns a vector_shuffle node for an unpackl operation.
5293 static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5294 SDValue V1, SDValue V2) {
5295 SmallVector<int, 8> Mask;
5296 createUnpackShuffleMask(VT, Mask, /* Lo = */ true, /* Unary = */ false);
5297 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5300 /// Returns a vector_shuffle node for an unpackh operation.
5301 static SDValue getUnpackh(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
5302 SDValue V1, SDValue V2) {
5303 SmallVector<int, 8> Mask;
5304 createUnpackShuffleMask(VT, Mask, /* Lo = */ false, /* Unary = */ false);
5305 return DAG.getVectorShuffle(VT, dl, V1, V2, Mask);
5308 /// Return a vector_shuffle of the specified vector of zero or undef vector.
5309 /// This produces a shuffle where the low element of V2 is swizzled into the
5310 /// zero/undef vector, landing at element Idx.
5311 /// This produces a shuffle mask like 4,1,2,3 (idx=0) or 0,1,2,4 (idx=3).
5312 static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
5314 const X86Subtarget &Subtarget,
5315 SelectionDAG &DAG) {
5316 MVT VT = V2.getSimpleValueType();
5318 ? getZeroVector(VT, Subtarget, DAG, SDLoc(V2)) : DAG.getUNDEF(VT);
5319 int NumElems = VT.getVectorNumElements();
5320 SmallVector<int, 16> MaskVec(NumElems);
5321 for (int i = 0; i != NumElems; ++i)
5322 // If this is the insertion idx, put the low elt of V2 here.
5323 MaskVec[i] = (i == Idx) ? NumElems : i;
5324 return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
5327 static SDValue peekThroughBitcasts(SDValue V) {
5328 while (V.getNode() && V.getOpcode() == ISD::BITCAST)
5329 V = V.getOperand(0);
5333 static SDValue peekThroughOneUseBitcasts(SDValue V) {
5334 while (V.getNode() && V.getOpcode() == ISD::BITCAST &&
5335 V.getOperand(0).hasOneUse())
5336 V = V.getOperand(0);
5340 static const Constant *getTargetConstantFromNode(SDValue Op) {
5341 Op = peekThroughBitcasts(Op);
5343 auto *Load = dyn_cast<LoadSDNode>(Op);
5347 SDValue Ptr = Load->getBasePtr();
5348 if (Ptr->getOpcode() == X86ISD::Wrapper ||
5349 Ptr->getOpcode() == X86ISD::WrapperRIP)
5350 Ptr = Ptr->getOperand(0);
5352 auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
5353 if (!CNode || CNode->isMachineConstantPoolEntry())
5356 return dyn_cast<Constant>(CNode->getConstVal());
5359 // Extract raw constant bits from constant pools.
5360 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
5362 SmallVectorImpl<APInt> &EltBits,
5363 bool AllowWholeUndefs = true,
5364 bool AllowPartialUndefs = true) {
5365 assert(EltBits.empty() && "Expected an empty EltBits vector");
5367 Op = peekThroughBitcasts(Op);
5369 EVT VT = Op.getValueType();
5370 unsigned SizeInBits = VT.getSizeInBits();
5371 assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
5372 unsigned NumElts = SizeInBits / EltSizeInBits;
5374 // Bitcast a source array of element bits to the target size.
5375 auto CastBitData = [&](APInt &UndefSrcElts, ArrayRef<APInt> SrcEltBits) {
5376 unsigned NumSrcElts = UndefSrcElts.getBitWidth();
5377 unsigned SrcEltSizeInBits = SrcEltBits[0].getBitWidth();
5378 assert((NumSrcElts * SrcEltSizeInBits) == SizeInBits &&
5379 "Constant bit sizes don't match");
5381 // Don't split if we don't allow undef bits.
5382 bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
5383 if (UndefSrcElts.getBoolValue() && !AllowUndefs)
5386 // If we're already the right size, don't bother bitcasting.
5387 if (NumSrcElts == NumElts) {
5388 UndefElts = UndefSrcElts;
5389 EltBits.assign(SrcEltBits.begin(), SrcEltBits.end());
5393 // Extract all the undef/constant element data and pack into single bitsets.
5394 APInt UndefBits(SizeInBits, 0);
5395 APInt MaskBits(SizeInBits, 0);
5397 for (unsigned i = 0; i != NumSrcElts; ++i) {
5398 unsigned BitOffset = i * SrcEltSizeInBits;
5399 if (UndefSrcElts[i])
5400 UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
5401 MaskBits.insertBits(SrcEltBits[i], BitOffset);
5404 // Split the undef/constant single bitset data into the target elements.
5405 UndefElts = APInt(NumElts, 0);
5406 EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
5408 for (unsigned i = 0; i != NumElts; ++i) {
5409 unsigned BitOffset = i * EltSizeInBits;
5410 APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
5412 // Only treat an element as UNDEF if all bits are UNDEF.
5413 if (UndefEltBits.isAllOnesValue()) {
5414 if (!AllowWholeUndefs)
5416 UndefElts.setBit(i);
5420 // If only some bits are UNDEF then treat them as zero (or bail if not
5422 if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
5425 APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
5426 EltBits[i] = Bits.getZExtValue();
5431 // Collect constant bits and insert into mask/undef bit masks.
5432 auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
5433 unsigned UndefBitIndex) {
5436 if (isa<UndefValue>(Cst)) {
5437 Undefs.setBit(UndefBitIndex);
5440 if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
5441 Mask = CInt->getValue();
5444 if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
5445 Mask = CFP->getValueAPF().bitcastToAPInt();
5451 // Extract constant bits from build vector.
5452 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
5453 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5454 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5456 APInt UndefSrcElts(NumSrcElts, 0);
5457 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5458 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
5459 const SDValue &Src = Op.getOperand(i);
5460 if (Src.isUndef()) {
5461 UndefSrcElts.setBit(i);
5464 auto *Cst = cast<ConstantSDNode>(Src);
5465 SrcEltBits[i] = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
5467 return CastBitData(UndefSrcElts, SrcEltBits);
5470 // Extract constant bits from constant pool vector.
5471 if (auto *Cst = getTargetConstantFromNode(Op)) {
5472 Type *CstTy = Cst->getType();
5473 if (!CstTy->isVectorTy() || (SizeInBits != CstTy->getPrimitiveSizeInBits()))
5476 unsigned SrcEltSizeInBits = CstTy->getScalarSizeInBits();
5477 unsigned NumSrcElts = CstTy->getVectorNumElements();
5479 APInt UndefSrcElts(NumSrcElts, 0);
5480 SmallVector<APInt, 64> SrcEltBits(NumSrcElts, APInt(SrcEltSizeInBits, 0));
5481 for (unsigned i = 0; i != NumSrcElts; ++i)
5482 if (!CollectConstantBits(Cst->getAggregateElement(i), SrcEltBits[i],
5486 return CastBitData(UndefSrcElts, SrcEltBits);
5489 // Extract constant bits from a broadcasted constant pool scalar.
5490 if (Op.getOpcode() == X86ISD::VBROADCAST &&
5491 EltSizeInBits <= VT.getScalarSizeInBits()) {
5492 if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
5493 unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
5494 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5496 APInt UndefSrcElts(NumSrcElts, 0);
5497 SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
5498 if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
5499 if (UndefSrcElts[0])
5500 UndefSrcElts.setBits(0, NumSrcElts);
5501 SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
5502 return CastBitData(UndefSrcElts, SrcEltBits);
5507 // Extract a rematerialized scalar constant insertion.
5508 if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
5509 Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
5510 isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
5511 unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
5512 unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
5514 APInt UndefSrcElts(NumSrcElts, 0);
5515 SmallVector<APInt, 64> SrcEltBits;
5516 auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
5517 SrcEltBits.push_back(CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits));
5518 SrcEltBits.append(NumSrcElts - 1, APInt(SrcEltSizeInBits, 0));
5519 return CastBitData(UndefSrcElts, SrcEltBits);
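// Worked example (illustrative): querying a v2i64 build vector of
// <0x0000000100000002, 0xFFFFFFFF00000000> with EltSizeInBits == 32 collects
// the two 64-bit constants and re-splits them, returning
// EltBits == {0x00000002, 0x00000001, 0x00000000, 0xFFFFFFFF} with no undef
// elements.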
5525 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
5526 unsigned MaskEltSizeInBits,
5527 SmallVectorImpl<uint64_t> &RawMask) {
5529 SmallVector<APInt, 64> EltBits;
5531 // Extract the raw target constant bits.
5532 // FIXME: We currently don't support UNDEF bits or mask entries.
5533 if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
5534 EltBits, /* AllowWholeUndefs */ false,
5535 /* AllowPartialUndefs */ false))
5538 // Insert the extracted elements into the mask.
5539 for (APInt Elt : EltBits)
5540 RawMask.push_back(Elt.getZExtValue());
5545 /// Calculates the shuffle mask corresponding to the target-specific opcode.
5546 /// If the mask could be calculated, returns it in \p Mask, returns the shuffle
5547 /// operands in \p Ops, and returns true.
5548 /// Sets \p IsUnary to true if only one source is used. Note that this will set
5549 /// IsUnary for shuffles which use a single input multiple times, and in those
5550 /// cases it will adjust the mask to only have indices within that single input.
5551 /// It is an error to call this with non-empty Mask/Ops vectors.
5552 static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
5553 SmallVectorImpl<SDValue> &Ops,
5554 SmallVectorImpl<int> &Mask, bool &IsUnary) {
5555 unsigned NumElems = VT.getVectorNumElements();
5558 assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
5559 assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
5562 bool IsFakeUnary = false;
5563 switch(N->getOpcode()) {
5564 case X86ISD::BLENDI:
5565 ImmN = N->getOperand(N->getNumOperands()-1);
5566 DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5567 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5570 ImmN = N->getOperand(N->getNumOperands()-1);
5571 DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5572 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5574 case X86ISD::INSERTPS:
5575 ImmN = N->getOperand(N->getNumOperands()-1);
5576 DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5577 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5579 case X86ISD::EXTRQI:
5580 if (isa<ConstantSDNode>(N->getOperand(1)) &&
5581 isa<ConstantSDNode>(N->getOperand(2))) {
5582 int BitLen = N->getConstantOperandVal(1);
5583 int BitIdx = N->getConstantOperandVal(2);
5584 DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask);
5588 case X86ISD::INSERTQI:
5589 if (isa<ConstantSDNode>(N->getOperand(2)) &&
5590 isa<ConstantSDNode>(N->getOperand(3))) {
5591 int BitLen = N->getConstantOperandVal(2);
5592 int BitIdx = N->getConstantOperandVal(3);
5593 DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask);
5594 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5597 case X86ISD::UNPCKH:
5598 DecodeUNPCKHMask(VT, Mask);
5599 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5601 case X86ISD::UNPCKL:
5602 DecodeUNPCKLMask(VT, Mask);
5603 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5605 case X86ISD::MOVHLPS:
5606 DecodeMOVHLPSMask(NumElems, Mask);
5607 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5609 case X86ISD::MOVLHPS:
5610 DecodeMOVLHPSMask(NumElems, Mask);
5611 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5613 case X86ISD::PALIGNR:
5614 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5615 ImmN = N->getOperand(N->getNumOperands()-1);
5616 DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5617 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5618 Ops.push_back(N->getOperand(1));
5619 Ops.push_back(N->getOperand(0));
5621 case X86ISD::VSHLDQ:
5622 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5623 ImmN = N->getOperand(N->getNumOperands() - 1);
5624 DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5627 case X86ISD::VSRLDQ:
5628 assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
5629 ImmN = N->getOperand(N->getNumOperands() - 1);
5630 DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5633 case X86ISD::PSHUFD:
5634 case X86ISD::VPERMILPI:
5635 ImmN = N->getOperand(N->getNumOperands()-1);
5636 DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5639 case X86ISD::PSHUFHW:
5640 ImmN = N->getOperand(N->getNumOperands()-1);
5641 DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5644 case X86ISD::PSHUFLW:
5645 ImmN = N->getOperand(N->getNumOperands()-1);
5646 DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5649 case X86ISD::VZEXT_MOVL:
5650 DecodeZeroMoveLowMask(VT, Mask);
5653 case X86ISD::VBROADCAST: {
5654 SDValue N0 = N->getOperand(0);
5655 // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
5656 // add the pre-extracted value to the Ops vector.
5657 if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
5658 N0.getOperand(0).getValueType() == VT &&
5659 N0.getConstantOperandVal(1) == 0)
5660 Ops.push_back(N0.getOperand(0));
5662 // We only decode broadcasts of same-sized vectors, unless the broadcast
5663 // came from an extract from the original width. If we found one, we
5664 // pushed it onto the Ops vector above.
5665 if (N0.getValueType() == VT || !Ops.empty()) {
5666 DecodeVectorBroadcast(VT, Mask);
5672 case X86ISD::VPERMILPV: {
5674 SDValue MaskNode = N->getOperand(1);
5675 unsigned MaskEltSize = VT.getScalarSizeInBits();
5676 SmallVector<uint64_t, 32> RawMask;
5677 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5678 DecodeVPERMILPMask(VT, RawMask, Mask);
5681 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5682 DecodeVPERMILPMask(C, MaskEltSize, Mask);
5687 case X86ISD::PSHUFB: {
5689 SDValue MaskNode = N->getOperand(1);
5690 SmallVector<uint64_t, 32> RawMask;
5691 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5692 DecodePSHUFBMask(RawMask, Mask);
5695 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5696 DecodePSHUFBMask(C, Mask);
5701 case X86ISD::VPERMI:
5702 ImmN = N->getOperand(N->getNumOperands()-1);
5703 DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5708 DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
5710 case X86ISD::VPERM2X128:
5711 ImmN = N->getOperand(N->getNumOperands()-1);
5712 DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
5713 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5715 case X86ISD::MOVSLDUP:
5716 DecodeMOVSLDUPMask(VT, Mask);
5719 case X86ISD::MOVSHDUP:
5720 DecodeMOVSHDUPMask(VT, Mask);
5723 case X86ISD::MOVDDUP:
5724 DecodeMOVDDUPMask(VT, Mask);
5727 case X86ISD::MOVLHPD:
5728 case X86ISD::MOVLPD:
5729 case X86ISD::MOVLPS:
5730 // Not yet implemented
5732 case X86ISD::VPERMIL2: {
5733 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5734 unsigned MaskEltSize = VT.getScalarSizeInBits();
5735 SDValue MaskNode = N->getOperand(2);
5736 SDValue CtrlNode = N->getOperand(3);
5737 if (ConstantSDNode *CtrlOp = dyn_cast<ConstantSDNode>(CtrlNode)) {
5738 unsigned CtrlImm = CtrlOp->getZExtValue();
5739 SmallVector<uint64_t, 32> RawMask;
5740 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5741 DecodeVPERMIL2PMask(VT, CtrlImm, RawMask, Mask);
5744 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5745 DecodeVPERMIL2PMask(C, CtrlImm, MaskEltSize, Mask);
5751 case X86ISD::VPPERM: {
5752 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
5753 SDValue MaskNode = N->getOperand(2);
5754 SmallVector<uint64_t, 32> RawMask;
5755 if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) {
5756 DecodeVPPERMMask(RawMask, Mask);
5759 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5760 DecodeVPPERMMask(C, Mask);
5765 case X86ISD::VPERMV: {
5767 // Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
5768 Ops.push_back(N->getOperand(1));
5769 SDValue MaskNode = N->getOperand(0);
5770 SmallVector<uint64_t, 32> RawMask;
5771 unsigned MaskEltSize = VT.getScalarSizeInBits();
5772 if (getTargetShuffleMaskIndices(MaskNode, MaskEltSize, RawMask)) {
5773 DecodeVPERMVMask(RawMask, Mask);
5776 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5777 DecodeVPERMVMask(C, MaskEltSize, Mask);
5782 case X86ISD::VPERMV3: {
5783 IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
5784 // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
5785 Ops.push_back(N->getOperand(0));
5786 Ops.push_back(N->getOperand(2));
5787 SDValue MaskNode = N->getOperand(1);
5788 unsigned MaskEltSize = VT.getScalarSizeInBits();
5789 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5790 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5795 case X86ISD::VPERMIV3: {
5796 IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
5797 // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
5798 Ops.push_back(N->getOperand(1));
5799 Ops.push_back(N->getOperand(2));
5800 SDValue MaskNode = N->getOperand(0);
5801 unsigned MaskEltSize = VT.getScalarSizeInBits();
5802 if (auto *C = getTargetConstantFromNode(MaskNode)) {
5803 DecodeVPERMV3Mask(C, MaskEltSize, Mask);
5808 default: llvm_unreachable("unknown target shuffle node");
5811 // Empty mask indicates the decode failed.
5815 // Check if we're getting a shuffle mask with zero'd elements.
5816 if (!AllowSentinelZero)
5817 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
5820 // If we have a fake unary shuffle, the shuffle mask is spread across two
5821 // inputs that are actually the same node. Re-map the mask to always point
5822 // into the first input.
5825 if (M >= (int)Mask.size())
5828 // If we didn't already add operands in the opcode-specific code, default to
5829 // adding 1 or 2 operands starting at 0.
5831 Ops.push_back(N->getOperand(0));
5832 if (!IsUnary || IsFakeUnary)
5833 Ops.push_back(N->getOperand(1));
5839 /// Check a target shuffle mask's inputs to see if we can set any values to
5840 /// SM_SentinelZero - this is for elements that are known to be zero
5841 /// (not just zeroable) from their inputs.
5842 /// Returns true if the target shuffle mask was decoded.
5843 static bool setTargetShuffleZeroElements(SDValue N,
5844 SmallVectorImpl<int> &Mask,
5845 SmallVectorImpl<SDValue> &Ops) {
5847 if (!isTargetShuffle(N.getOpcode()))
5850 MVT VT = N.getSimpleValueType();
5851 if (!getTargetShuffleMask(N.getNode(), VT, true, Ops, Mask, IsUnary))
5854 SDValue V1 = Ops[0];
5855 SDValue V2 = IsUnary ? V1 : Ops[1];
5857 V1 = peekThroughBitcasts(V1);
5858 V2 = peekThroughBitcasts(V2);
5860 assert((VT.getSizeInBits() % Mask.size()) == 0 &&
5861 "Illegal split of shuffle value type");
5862 unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
5864 // Extract known constant input data.
5865 APInt UndefSrcElts[2];
5866 SmallVector<APInt, 32> SrcEltBits[2];
5867 bool IsSrcConstant[2] = {
5868 getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
5869 SrcEltBits[0], true, false),
5870 getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
5871 SrcEltBits[1], true, false)};
5873 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
5876 // Already decoded as SM_SentinelZero / SM_SentinelUndef.
5880 // Determine shuffle input and normalize the mask.
5881 unsigned SrcIdx = M / Size;
5882 SDValue V = M < Size ? V1 : V2;
5885 // We are referencing an UNDEF input.
5887 Mask[i] = SM_SentinelUndef;
5891 // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
5892 // TODO: We currently only set UNDEF for integer types - floats use the same
5893 // registers as vectors and many of the scalar folded loads rely on the
5894 // SCALAR_TO_VECTOR pattern.
5895 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
5896 (Size % V.getValueType().getVectorNumElements()) == 0) {
5897 int Scale = Size / V.getValueType().getVectorNumElements();
5898 int Idx = M / Scale;
5899 if (Idx != 0 && !VT.isFloatingPoint())
5900 Mask[i] = SM_SentinelUndef;
5901 else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
5902 Mask[i] = SM_SentinelZero;
5906 // Attempt to extract from the source's constant bits.
5907 if (IsSrcConstant[SrcIdx]) {
5908 if (UndefSrcElts[SrcIdx][M])
5909 Mask[i] = SM_SentinelUndef;
5910 else if (SrcEltBits[SrcIdx][M] == 0)
5911 Mask[i] = SM_SentinelZero;
5915 assert(VT.getVectorNumElements() == Mask.size() &&
5916 "Different mask size from vector size!");
5920 // Attempt to decode ops that could be represented as a shuffle mask.
5921 // The decoded shuffle mask may contain a different number of elements than the
5922 // destination value type.
5923 static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
5924 SmallVectorImpl<SDValue> &Ops,
5925 SelectionDAG &DAG) {
5929 MVT VT = N.getSimpleValueType();
5930 unsigned NumElts = VT.getVectorNumElements();
5931 unsigned NumSizeInBits = VT.getSizeInBits();
5932 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
5933 assert((NumBitsPerElt % 8) == 0 && (NumSizeInBits % 8) == 0 &&
5934 "Expected byte aligned value types");
5936 unsigned Opcode = N.getOpcode();
5939 case X86ISD::ANDNP: {
5940 // Attempt to decode as a per-byte mask.
5942 SmallVector<APInt, 32> EltBits;
5943 SDValue N0 = N.getOperand(0);
5944 SDValue N1 = N.getOperand(1);
5945 bool IsAndN = (X86ISD::ANDNP == Opcode);
5946 uint64_t ZeroMask = IsAndN ? 255 : 0;
5947 if (!getTargetConstantBitsFromNode(IsAndN ? N0 : N1, 8, UndefElts, EltBits))
5949 for (int i = 0, e = (int)EltBits.size(); i != e; ++i) {
5951 Mask.push_back(SM_SentinelUndef);
5954 uint64_t ByteBits = EltBits[i].getZExtValue();
5955 if (ByteBits != 0 && ByteBits != 255)
5957 Mask.push_back(ByteBits == ZeroMask ? SM_SentinelZero : i);
5959 Ops.push_back(IsAndN ? N1 : N0);
5962 case ISD::SCALAR_TO_VECTOR: {
5963 // Match against a scalar_to_vector of an extract from a vector;
5964 // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
5965 SDValue N0 = N.getOperand(0);
5968 if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
5969 N0.getOperand(0).getValueType() == VT) {
5971 } else if (N0.getOpcode() == ISD::AssertZext &&
5972 N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
5973 cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
5974 SrcExtract = N0.getOperand(0);
5975 assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
5976 } else if (N0.getOpcode() == ISD::AssertZext &&
5977 N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
5978 cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
5979 SrcExtract = N0.getOperand(0);
5980 assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
5983 if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
5986 SDValue SrcVec = SrcExtract.getOperand(0);
5987 EVT SrcVT = SrcVec.getValueType();
5988 unsigned NumSrcElts = SrcVT.getVectorNumElements();
5989 unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
5991 unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
5992 if (NumSrcElts <= SrcIdx)
5995 Ops.push_back(SrcVec);
5996 Mask.push_back(SrcIdx);
5997 Mask.append(NumZeros, SM_SentinelZero);
5998 Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
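// For example (illustrative): a v4i32 SCALAR_TO_VECTOR whose operand is
// EXTRACT_VECTOR_ELT(V, 2) from another v4i32 V decodes to Ops == {V} and
// Mask == <2, SM_SentinelUndef, SM_SentinelUndef, SM_SentinelUndef>; no
// explicit zeros are needed because the element sizes already match.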
6001 case X86ISD::PINSRB:
6002 case X86ISD::PINSRW: {
6003 SDValue InVec = N.getOperand(0);
6004 SDValue InScl = N.getOperand(1);
6005 uint64_t InIdx = N.getConstantOperandVal(2);
6006 assert(InIdx < NumElts && "Illegal insertion index");
6008 // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
6009 if (X86::isZeroNode(InScl)) {
6010 Ops.push_back(InVec);
6011 for (unsigned i = 0; i != NumElts; ++i)
6012 Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
6016 // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
6017 // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
6019 (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
6020 if (InScl.getOpcode() != ISD::AssertZext ||
6021 InScl.getOperand(0).getOpcode() != ExOp)
6024 SDValue ExVec = InScl.getOperand(0).getOperand(0);
6025 uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
6026 assert(ExIdx < NumElts && "Illegal extraction index");
6027 Ops.push_back(InVec);
6028 Ops.push_back(ExVec);
6029 for (unsigned i = 0; i != NumElts; ++i)
6030 Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
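// For example (illustrative): PINSRW(V, 0, 3) on v8i16 matches the zeroing
// pattern above and decodes to Ops == {V} with
// Mask == <0, 1, 2, SM_SentinelZero, 4, 5, 6, 7>; when the inserted scalar is
// instead an assert-zext'd PEXTRW from vector W at index ExIdx, the mask entry
// at the insertion index becomes NumElts + ExIdx, selecting from W.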
6033 case X86ISD::PACKSS: {
6034 // If we know input saturation won't happen we can treat this
6035 // as a truncation shuffle.
6036 if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
6037 DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
6040 Ops.push_back(N.getOperand(0));
6041 Ops.push_back(N.getOperand(1));
6042 for (unsigned i = 0; i != NumElts; ++i)
6043 Mask.push_back(i * 2);
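// Worked example (illustrative): a v16i8 PACKSS of two v8i16 inputs that are
// known not to saturate decodes to Ops == {Op0, Op1} and
// Mask == <0, 2, 4, ..., 30>, i.e. the low byte of each 16-bit element of the
// concatenated inputs - effectively a truncating shuffle.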
6047 case X86ISD::VSRLI: {
6048 uint64_t ShiftVal = N.getConstantOperandVal(1);
6049 // Out of range bit shifts are guaranteed to be zero.
6050 if (NumBitsPerElt <= ShiftVal) {
6051 Mask.append(NumElts, SM_SentinelZero);
6055 // We can only decode 'whole byte' bit shifts as shuffles.
6056 if ((ShiftVal % 8) != 0)
6059 uint64_t ByteShift = ShiftVal / 8;
6060 unsigned NumBytes = NumSizeInBits / 8;
6061 unsigned NumBytesPerElt = NumBitsPerElt / 8;
6062 Ops.push_back(N.getOperand(0));
6064 // Clear mask to all zeros and insert the shifted byte indices.
6065 Mask.append(NumBytes, SM_SentinelZero);
6067 if (X86ISD::VSHLI == Opcode) {
6068 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6069 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6070 Mask[i + j] = i + j - ByteShift;
6072 for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
6073 for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
6074 Mask[i + j - ByteShift] = i + j;
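// Worked example (illustrative): VSRLI of a v2i64 by 32 bits is a four-byte
// shift, so with 16 bytes total and 8 bytes per element the decoded byte mask
// is <4, 5, 6, 7, Z, Z, Z, Z, 12, 13, 14, 15, Z, Z, Z, Z> with Z standing for
// SM_SentinelZero: the high four bytes of each element move down and the
// vacated bytes are known zero.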
6078 case ISD::ZERO_EXTEND_VECTOR_INREG:
6079 case X86ISD::VZEXT: {
6080 // TODO - add support for VPMOVZX with smaller input vector types.
6081 SDValue Src = N.getOperand(0);
6082 MVT SrcVT = Src.getSimpleValueType();
6083 if (NumSizeInBits != SrcVT.getSizeInBits())
6085 DecodeZeroExtendMask(SrcVT.getScalarType(), VT, Mask);
6094 /// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
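/// For example (illustrative): with inputs { A, B } and a mask width of 4, a
/// mask of { 0, 1, 2, 3 } only references A, so B is dropped; a mask of
/// { 4, 5, 6, 7 } only references B, so A is dropped and the mask is rebased
/// to { 0, 1, 2, 3 }.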
6095 static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
6096 SmallVectorImpl<int> &Mask) {
6097 int MaskWidth = Mask.size();
6098 SmallVector<SDValue, 16> UsedInputs;
6099 for (int i = 0, e = Inputs.size(); i < e; ++i) {
6100 int lo = UsedInputs.size() * MaskWidth;
6101 int hi = lo + MaskWidth;
6102 if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
6103 UsedInputs.push_back(Inputs[i]);
6110 Inputs = UsedInputs;
6113 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
6114 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then checks the
6115 /// remaining input indices in case we now have a unary shuffle and adjusts the
6116 /// inputs accordingly.
6117 /// Returns true if the target shuffle mask was decoded.
6118 static bool resolveTargetShuffleInputs(SDValue Op,
6119 SmallVectorImpl<SDValue> &Inputs,
6120 SmallVectorImpl<int> &Mask,
6121 SelectionDAG &DAG) {
6122 if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
6123 if (!getFauxShuffleMask(Op, Mask, Inputs, DAG))
6126 resolveTargetShuffleInputsAndMask(Inputs, Mask);
6130 /// Returns the scalar element that will make up the ith
6131 /// element of the result of the vector shuffle.
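/// For example (illustrative): for (vector_shuffle<4,1,2,3> A, B) and
/// Index == 0, the mask element is 4, so the search recurses into operand B at
/// element 0; an undef mask element yields an UNDEF scalar instead.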
6132 static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
6135 return SDValue(); // Limit search depth.
6137 SDValue V = SDValue(N, 0);
6138 EVT VT = V.getValueType();
6139 unsigned Opcode = V.getOpcode();
6141 // Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
6142 if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
6143 int Elt = SV->getMaskElt(Index);
6146 return DAG.getUNDEF(VT.getVectorElementType());
6148 unsigned NumElems = VT.getVectorNumElements();
6149 SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
6150 : SV->getOperand(1);
6151 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
6154 // Recurse into target specific vector shuffles to find scalars.
6155 if (isTargetShuffle(Opcode)) {
6156 MVT ShufVT = V.getSimpleValueType();
6157 MVT ShufSVT = ShufVT.getVectorElementType();
6158 int NumElems = (int)ShufVT.getVectorNumElements();
6159 SmallVector<int, 16> ShuffleMask;
6160 SmallVector<SDValue, 16> ShuffleOps;
6163 if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
6166 int Elt = ShuffleMask[Index];
6167 if (Elt == SM_SentinelZero)
6168 return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
6169 : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
6170 if (Elt == SM_SentinelUndef)
6171 return DAG.getUNDEF(ShufSVT);
6173 assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
6174 SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
6175 return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
6179 // Actual nodes that may contain scalar elements
6180 if (Opcode == ISD::BITCAST) {
6181 V = V.getOperand(0);
6182 EVT SrcVT = V.getValueType();
6183 unsigned NumElems = VT.getVectorNumElements();
6185 if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
6189 if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
6190 return (Index == 0) ? V.getOperand(0)
6191 : DAG.getUNDEF(VT.getVectorElementType());
6193 if (V.getOpcode() == ISD::BUILD_VECTOR)
6194 return V.getOperand(Index);
6199 /// Custom lower build_vector of v16i8.
6200 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
6201 unsigned NumNonZero, unsigned NumZero,
6203 const X86Subtarget &Subtarget) {
6204 if (NumNonZero > 8 && !Subtarget.hasSSE41())
6211 // SSE4.1 - use PINSRB to insert each byte directly.
6212 if (Subtarget.hasSSE41()) {
6213 for (unsigned i = 0; i < 16; ++i) {
6214 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6216 // If the build vector contains zeros or our first insertion is not the
6217 // first index, then insert into a zero vector to break any register
6218 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6221 if (NumZero || 0 != i)
6222 V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
6224 assert(0 == i && "Expected insertion into zero-index");
6225 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6226 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6227 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6228 V = DAG.getBitcast(MVT::v16i8, V);
6232 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
6233 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6240 // Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
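// Illustrative sketch: adjacent bytes <b0, b1> are merged into the i16 value
// (b0 | (b1 << 8)) and inserted at word index 0, <b2, b3> at word index 1,
// and so on, so the final bitcast back to v16i8 preserves the little-endian
// byte order.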
6241 for (unsigned i = 0; i < 16; ++i) {
6242 bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
6243 if (ThisIsNonZero && First) {
6245 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6247 V = DAG.getUNDEF(MVT::v8i16);
6252 // FIXME: Investigate extending to i32 instead of just i16.
6253 // FIXME: Investigate combining the first 4 bytes as an i32 instead.
6254 SDValue ThisElt, LastElt;
6255 bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
6256 if (LastIsNonZero) {
6258 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
6260 if (ThisIsNonZero) {
6261 ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
6262 ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
6263 DAG.getConstant(8, dl, MVT::i8));
6265 ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
6271 V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
6272 : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
6273 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6274 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6275 V = DAG.getBitcast(MVT::v8i16, V);
6277 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
6278 DAG.getIntPtrConstant(i / 2, dl));
6284 return DAG.getBitcast(MVT::v16i8, V);
6287 /// Custom lower build_vector of v8i16.
6288 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
6289 unsigned NumNonZero, unsigned NumZero,
6291 const X86Subtarget &Subtarget) {
6292 if (NumNonZero > 4 && !Subtarget.hasSSE41())
6298 for (unsigned i = 0; i < 8; ++i) {
6299 bool IsNonZero = (NonZeros & (1 << i)) != 0;
6301 // If the build vector contains zeros or our first insertion is not the
6302 // first index, then insert into a zero vector to break any register
6303 // dependency; otherwise use SCALAR_TO_VECTOR/VZEXT_MOVL.
6306 if (NumZero || 0 != i)
6307 V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
6309 assert(0 == i && "Expected insertion into zero-index");
6310 V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
6311 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
6312 V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
6313 V = DAG.getBitcast(MVT::v8i16, V);
6317 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
6318 Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
6325 /// Custom lower build_vector of v4i32 or v4f32.
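/// For example (illustrative): the build_vector
///   <extractelt(V, 0), extractelt(V, 1), 0, 0>
/// can be lowered as a shuffle of V against a zero vector with mask
/// <0, 1, 6, 7>, while a single element taken from a second vector can be
/// placed with INSERTPS on SSE4.1 targets.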
6326 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
6327 const X86Subtarget &Subtarget) {
6328 // Find all zeroable elements.
6329 std::bitset<4> Zeroable;
6330 for (int i=0; i < 4; ++i) {
6331 SDValue Elt = Op->getOperand(i);
6332 Zeroable[i] = (Elt.isUndef() || X86::isZeroNode(Elt));
6334 assert(Zeroable.size() - Zeroable.count() > 1 &&
6335 "We expect at least two non-zero elements!");
6337 // We only know how to deal with build_vector nodes where elements are either
6338 // zeroable or extract_vector_elt with constant index.
6339 SDValue FirstNonZero;
6340 unsigned FirstNonZeroIdx;
6341 for (unsigned i=0; i < 4; ++i) {
6344 SDValue Elt = Op->getOperand(i);
6345 if (Elt.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
6346 !isa<ConstantSDNode>(Elt.getOperand(1)))
6348 // Make sure that this node is extracting from a 128-bit vector.
6349 MVT VT = Elt.getOperand(0).getSimpleValueType();
6350 if (!VT.is128BitVector())
6352 if (!FirstNonZero.getNode()) {
6354 FirstNonZeroIdx = i;
6358 assert(FirstNonZero.getNode() && "Unexpected build vector of all zeros!");
6359 SDValue V1 = FirstNonZero.getOperand(0);
6360 MVT VT = V1.getSimpleValueType();
6362 // See if this build_vector can be lowered as a blend with zero.
6364 unsigned EltMaskIdx, EltIdx;
6366 for (EltIdx = 0; EltIdx < 4; ++EltIdx) {
6367 if (Zeroable[EltIdx]) {
6368 // The zero vector will be on the right hand side.
6369 Mask[EltIdx] = EltIdx+4;
6373 Elt = Op->getOperand(EltIdx);
6374 // By construction, Elt is a EXTRACT_VECTOR_ELT with constant index.
6375 EltMaskIdx = Elt.getConstantOperandVal(1);
6376 if (Elt.getOperand(0) != V1 || EltMaskIdx != EltIdx)
6378 Mask[EltIdx] = EltIdx;
6382 // Let the shuffle legalizer deal with blend operations.
6383 SDValue VZero = getZeroVector(VT, Subtarget, DAG, SDLoc(Op));
6384 if (V1.getSimpleValueType() != VT)
6385 V1 = DAG.getBitcast(VT, V1);
6386 return DAG.getVectorShuffle(VT, SDLoc(V1), V1, VZero, Mask);
6390 // See if we can lower this build_vector to an INSERTPS.
6390 if (!Subtarget.hasSSE41())
6393 SDValue V2 = Elt.getOperand(0);
6394 if (Elt == FirstNonZero && EltIdx == FirstNonZeroIdx)
6397 bool CanFold = true;
6398 for (unsigned i = EltIdx + 1; i < 4 && CanFold; ++i) {
6402 SDValue Current = Op->getOperand(i);
6403 SDValue SrcVector = Current->getOperand(0);
6406 CanFold = (SrcVector == V1) && (Current.getConstantOperandVal(1) == i);
6412 assert(V1.getNode() && "Expected at least two non-zero elements!");
6413 if (V1.getSimpleValueType() != MVT::v4f32)
6414 V1 = DAG.getBitcast(MVT::v4f32, V1);
6415 if (V2.getSimpleValueType() != MVT::v4f32)
6416 V2 = DAG.getBitcast(MVT::v4f32, V2);
6418 // Ok, we can emit an INSERTPS instruction.
6419 unsigned ZMask = Zeroable.to_ulong();
6421 unsigned InsertPSMask = EltMaskIdx << 6 | EltIdx << 4 | ZMask;
6422 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
6424 SDValue Result = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
6425 DAG.getIntPtrConstant(InsertPSMask, DL));
6426 return DAG.getBitcast(VT, Result);
6429 /// Return a vector logical shift node.
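/// For example (illustrative): getVShift(/*isLeft=*/true, MVT::v2i64, X, 64,
/// ...) bitcasts X to v16i8, emits an X86ISD::VSHLDQ (PSLLDQ) by 64 / 8 == 8
/// bytes, and bitcasts the result back to v2i64.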
6430 static SDValue getVShift(bool isLeft, EVT VT, SDValue SrcOp, unsigned NumBits,
6431 SelectionDAG &DAG, const TargetLowering &TLI,
6433 assert(VT.is128BitVector() && "Unknown type for VShift");
6434 MVT ShVT = MVT::v16i8;
6435 unsigned Opc = isLeft ? X86ISD::VSHLDQ : X86ISD::VSRLDQ;
6436 SrcOp = DAG.getBitcast(ShVT, SrcOp);
6437 MVT ScalarShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), VT);
6438 assert(NumBits % 8 == 0 && "Only support byte sized shifts");
6439 SDValue ShiftVal = DAG.getConstant(NumBits/8, dl, ScalarShiftTy);
6440 return DAG.getBitcast(VT, DAG.getNode(Opc, dl, ShVT, SrcOp, ShiftVal));
6443 static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
6444 SelectionDAG &DAG) {
6446 // Check if the scalar load can be widened into a vector load, and if
6447 // the address is "base + cst", see if the cst can be "absorbed" into
6448 // the shuffle mask.
6449 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(SrcOp)) {
6450 SDValue Ptr = LD->getBasePtr();
6451 if (!ISD::isNormalLoad(LD) || LD->isVolatile())
6453 EVT PVT = LD->getValueType(0);
6454 if (PVT != MVT::i32 && PVT != MVT::f32)
6459 if (FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr)) {
6460 FI = FINode->getIndex();
6462 } else if (DAG.isBaseWithConstantOffset(Ptr) &&
6463 isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
6464 FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
6465 Offset = Ptr.getConstantOperandVal(1);
6466 Ptr = Ptr.getOperand(0);
6471 // FIXME: 256-bit vector instructions don't require a strict alignment,
6472 // improve this code to support it better.
6473 unsigned RequiredAlign = VT.getSizeInBits()/8;
6474 SDValue Chain = LD->getChain();
6475 // Make sure the stack object alignment is at least 16 or 32.
6476 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
6477 if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
6478 if (MFI.isFixedObjectIndex(FI)) {
6479 // Can't change the alignment. FIXME: It's possible to compute
6480 // the exact stack offset and reference FI + adjust offset instead.
6481 // If someone *really* cares about this. That's the way to implement it.
6484 MFI.setObjectAlignment(FI, RequiredAlign);
6488 // (Offset % 16 or 32) must be a multiple of 4. The address is then
6489 // Ptr + (Offset & ~15).
6492 if ((Offset % RequiredAlign) & 3)
6494 int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
6497 Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
6498 DAG.getConstant(StartOffset, DL, Ptr.getValueType()));
6501 int EltNo = (Offset - StartOffset) >> 2;
6502 unsigned NumElems = VT.getVectorNumElements();
6504 EVT NVT = EVT::getVectorVT(*DAG.getContext(), PVT, NumElems);
6505 SDValue V1 = DAG.getLoad(NVT, dl, Chain, Ptr,
6506 LD->getPointerInfo().getWithOffset(StartOffset));
6508 SmallVector<int, 8> Mask(NumElems, EltNo);
6510 return DAG.getVectorShuffle(NVT, dl, V1, DAG.getUNDEF(NVT), Mask);
6516 /// Given the initializing elements 'Elts' of a vector of type 'VT', see if the
6517 /// elements can be replaced by a single large load which has the same value as
6518 /// a build_vector or insert_subvector whose loaded operands are 'Elts'.
6520 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
6521 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
6522 const SDLoc &DL, SelectionDAG &DAG,
6523 const X86Subtarget &Subtarget,
6524 bool isAfterLegalize) {
6525 unsigned NumElems = Elts.size();
6527 int LastLoadedElt = -1;
6528 SmallBitVector LoadMask(NumElems, false);
6529 SmallBitVector ZeroMask(NumElems, false);
6530 SmallBitVector UndefMask(NumElems, false);
6532 // For each element in the initializer, see if we've found a load, zero or an undef.
6534 for (unsigned i = 0; i < NumElems; ++i) {
6535 SDValue Elt = peekThroughBitcasts(Elts[i]);
6540 UndefMask[i] = true;
6541 else if (X86::isZeroNode(Elt) || ISD::isBuildVectorAllZeros(Elt.getNode()))
6543 else if (ISD::isNON_EXTLoad(Elt.getNode())) {
6546 // Each loaded element must be the correct fractional portion of the
6547 // requested vector load.
6548 if ((NumElems * Elt.getValueSizeInBits()) != VT.getSizeInBits())
6553 assert((ZeroMask | UndefMask | LoadMask).count() == NumElems &&
6554 "Incomplete element masks");
6556 // Handle Special Cases - all undef or undef/zero.
6557 if (UndefMask.count() == NumElems)
6558 return DAG.getUNDEF(VT);
6560 // FIXME: Should we return this as a BUILD_VECTOR instead?
6561 if ((ZeroMask | UndefMask).count() == NumElems)
6562 return VT.isInteger() ? DAG.getConstant(0, DL, VT)
6563 : DAG.getConstantFP(0.0, DL, VT);
6565 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6566 int FirstLoadedElt = LoadMask.find_first();
6567 SDValue EltBase = peekThroughBitcasts(Elts[FirstLoadedElt]);
6568 LoadSDNode *LDBase = cast<LoadSDNode>(EltBase);
6569 EVT LDBaseVT = EltBase.getValueType();
6571 // Consecutive loads can contain UNDEFs but not ZERO elements.
6572 // Consecutive loads with UNDEFs and ZERO elements require an
6573 // additional shuffle stage to clear the ZERO elements.
6574 bool IsConsecutiveLoad = true;
6575 bool IsConsecutiveLoadWithZeros = true;
6576 for (int i = FirstLoadedElt + 1; i <= LastLoadedElt; ++i) {
6578 SDValue Elt = peekThroughBitcasts(Elts[i]);
6579 LoadSDNode *LD = cast<LoadSDNode>(Elt);
6580 if (!DAG.areNonVolatileConsecutiveLoads(
6581 LD, LDBase, Elt.getValueType().getStoreSizeInBits() / 8,
6582 i - FirstLoadedElt)) {
6583 IsConsecutiveLoad = false;
6584 IsConsecutiveLoadWithZeros = false;
6587 } else if (ZeroMask[i]) {
6588 IsConsecutiveLoad = false;
6592 auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
6593 auto MMOFlags = LDBase->getMemOperand()->getFlags();
6594 assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
6595 "Cannot merge volatile loads.");
6597 DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
6598 LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
6599 DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
6603 // LOAD - all consecutive load/undefs (must start/end with a load).
6604 // If we have found an entire vector of loads and undefs, then return a large
6605 // load of the entire vector width starting at the base pointer.
6606 // If the vector contains zeros, then attempt to shuffle those elements.
6607 if (FirstLoadedElt == 0 && LastLoadedElt == (int)(NumElems - 1) &&
6608 (IsConsecutiveLoad || IsConsecutiveLoadWithZeros)) {
6609 assert(LDBase && "Did not find base load for merging consecutive loads");
6610 EVT EltVT = LDBase->getValueType(0);
6611 // Ensure that the input vector size for the merged loads matches the
6612 // cumulative size of the input elements.
6613 if (VT.getSizeInBits() != EltVT.getSizeInBits() * NumElems)
6616 if (isAfterLegalize && !TLI.isOperationLegal(ISD::LOAD, VT))
6619 // Don't create 256-bit non-temporal aligned loads without AVX2 as these
6620 // will lower to regular temporal loads and use the cache.
6621 if (LDBase->isNonTemporal() && LDBase->getAlignment() >= 32 &&
6622 VT.is256BitVector() && !Subtarget.hasInt256())
6625 if (IsConsecutiveLoad)
6626 return CreateLoad(VT, LDBase);
6628 // IsConsecutiveLoadWithZeros - we need to create a shuffle of the loaded
6629 // vector and a zero vector to clear out the zero elements.
6630 if (!isAfterLegalize && NumElems == VT.getVectorNumElements()) {
6631 SmallVector<int, 4> ClearMask(NumElems, -1);
6632 for (unsigned i = 0; i < NumElems; ++i) {
6634 ClearMask[i] = i + NumElems;
6635 else if (LoadMask[i])
6638 SDValue V = CreateLoad(VT, LDBase);
6639 SDValue Z = VT.isInteger() ? DAG.getConstant(0, DL, VT)
6640 : DAG.getConstantFP(0.0, DL, VT);
6641 return DAG.getVectorShuffle(VT, DL, V, Z, ClearMask);
6646 (1 + LastLoadedElt - FirstLoadedElt) * LDBaseVT.getStoreSizeInBits();
6648 // VZEXT_LOAD - consecutive 32/64-bit load/undefs followed by zeros/undefs.
6649 if (IsConsecutiveLoad && FirstLoadedElt == 0 &&
6650 (LoadSize == 32 || LoadSize == 64) &&
6651 ((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()))) {
6652 MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSize)
6653 : MVT::getIntegerVT(LoadSize);
6654 MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSize);
6655 if (TLI.isTypeLegal(VecVT)) {
6656 SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
6657 SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
6659 DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
6660 LDBase->getPointerInfo(),
6661 LDBase->getAlignment(),
6662 false/*isVolatile*/, true/*ReadMem*/,
6664 DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
6665 return DAG.getBitcast(VT, ResNode);
6672 static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
6673 unsigned SplatBitSize, LLVMContext &C) {
6674 unsigned ScalarSize = VT.getScalarSizeInBits();
6675 unsigned NumElm = SplatBitSize / ScalarSize;
6677 SmallVector<Constant *, 32> ConstantVec;
6678 for (unsigned i = 0; i < NumElm; i++) {
6679 APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
6681 if (VT.isFloatingPoint()) {
6682 if (ScalarSize == 32) {
6683 Const = ConstantFP::get(C, APFloat(APFloat::IEEEsingle(), Val));
6685 assert(ScalarSize == 64 && "Unsupported floating point scalar size");
6686 Const = ConstantFP::get(C, APFloat(APFloat::IEEEdouble(), Val));
6689 Const = Constant::getIntegerValue(Type::getIntNTy(C, ScalarSize), Val);
6690 ConstantVec.push_back(Const);
6692 return ConstantVector::get(ArrayRef<Constant *>(ConstantVec));
6695 static bool isUseOfShuffle(SDNode *N) {
6696 for (auto *U : N->uses()) {
6697 if (isTargetShuffle(U->getOpcode()))
6699 if (U->getOpcode() == ISD::BITCAST) // Ignore bitcasts
6700 return isUseOfShuffle(U);
6705 /// Attempt to use the vbroadcast instruction to generate a splat value
6706 /// from a splat BUILD_VECTOR which uses:
6707 /// a. A single scalar load, or a constant.
6708 /// b. Repeated pattern of constants (e.g. <0,1,0,1> or <0,1,2,3,0,1,2,3>).
6710 /// The VBROADCAST node is returned when a pattern is found,
6711 /// or SDValue() otherwise.
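/// For example (illustrative): the v4i32 pattern <0, 1, 0, 1> repeats every
/// 64 bits, so on AVX2 the 64-bit constant can be loaded once from the
/// constant pool and broadcast (e.g. with VPBROADCASTQ) instead of
/// materializing the whole vector.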
6712 static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
6713 const X86Subtarget &Subtarget,
6714 SelectionDAG &DAG) {
6715 // VBROADCAST requires AVX.
6716 // TODO: Splats could be generated for non-AVX CPUs using SSE
6717 // instructions, but there's less potential gain for only 128-bit vectors.
6718 if (!Subtarget.hasAVX())
6721 MVT VT = BVOp->getSimpleValueType(0);
6724 assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
6725 "Unsupported vector type for broadcast.");
6727 BitVector UndefElements;
6728 SDValue Ld = BVOp->getSplatValue(&UndefElements);
6730 // We need a splat of a single value to use broadcast, and it doesn't
6731 // make any sense if the value is only in one element of the vector.
6732 if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
6733 APInt SplatValue, Undef;
6734 unsigned SplatBitSize;
6736 // Check if this is a repeated constant pattern suitable for broadcasting.
6737 if (BVOp->isConstantSplat(SplatValue, Undef, SplatBitSize, HasUndef) &&
6738 SplatBitSize > VT.getScalarSizeInBits() &&
6739 SplatBitSize < VT.getSizeInBits()) {
6740 // Avoid replacing with broadcast when it's a use of a shuffle
6741 // instruction to preserve the present custom lowering of shuffles.
6742 if (isUseOfShuffle(BVOp) || BVOp->hasOneUse())
6744 // replace BUILD_VECTOR with broadcast of the repeated constants.
6745 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6746 LLVMContext *Ctx = DAG.getContext();
6747 MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
6748 if (Subtarget.hasAVX()) {
6749 if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
6750 !(SplatBitSize == 64 && Subtarget.is32Bit())) {
6751 // Splatted value can fit in one INTEGER constant in constant pool.
6752 // Load the constant and broadcast it.
6753 MVT CVT = MVT::getIntegerVT(SplatBitSize);
6754 Type *ScalarTy = Type::getIntNTy(*Ctx, SplatBitSize);
6755 Constant *C = Constant::getIntegerValue(ScalarTy, SplatValue);
6756 SDValue CP = DAG.getConstantPool(C, PVT);
6757 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6759 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6761 CVT, dl, DAG.getEntryNode(), CP,
6762 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6764 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6765 MVT::getVectorVT(CVT, Repeat), Ld);
6766 return DAG.getBitcast(VT, Brdcst);
6767 } else if (SplatBitSize == 32 || SplatBitSize == 64) {
6768 // Splatted value can fit in one FLOAT constant in constant pool.
6769 // Load the constant and broadcast it.
6770 // AVX has support for 32- and 64-bit broadcasts of floats only.
6771 // There is no 64-bit integer broadcast on a 32-bit subtarget.
6772 MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
6773 // Lower the splat via APFloat directly, to avoid any conversion.
6776 ? ConstantFP::get(*Ctx,
6777 APFloat(APFloat::IEEEsingle(), SplatValue))
6778 : ConstantFP::get(*Ctx,
6779 APFloat(APFloat::IEEEdouble(), SplatValue));
6780 SDValue CP = DAG.getConstantPool(C, PVT);
6781 unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
6783 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6785 CVT, dl, DAG.getEntryNode(), CP,
6786 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6788 SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
6789 MVT::getVectorVT(CVT, Repeat), Ld);
6790 return DAG.getBitcast(VT, Brdcst);
6791 } else if (SplatBitSize > 64) {
6792 // Load the vector of constants and broadcast it.
6793 MVT CVT = VT.getScalarType();
6794 Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
6796 SDValue VCP = DAG.getConstantPool(VecC, PVT);
6797 unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
6798 unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
6800 MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
6801 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6803 SDValue Brdcst = DAG.getNode(X86ISD::SUBV_BROADCAST, dl, VT, Ld);
6804 return DAG.getBitcast(VT, Brdcst);
6811 bool ConstSplatVal =
6812 (Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
6814 // Make sure that all of the users of a non-constant load are from the
6815 // BUILD_VECTOR node.
6816 if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
6819 unsigned ScalarSize = Ld.getValueSizeInBits();
6820 bool IsGE256 = (VT.getSizeInBits() >= 256);
6822 // When optimizing for size, generate up to 5 extra bytes for a broadcast
6823 // instruction to save 8 or more bytes of constant pool data.
6824 // TODO: If multiple splats are generated to load the same constant,
6825 // it may be detrimental to overall size. There needs to be a way to detect
6826 // that condition to know if this is truly a size win.
6827 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
6829 // Handle broadcasting a single constant scalar from the constant pool into a vector.
6831 // On Sandybridge (no AVX2), it is still better to load a constant vector
6832 // from the constant pool and not to broadcast it from a scalar.
6833 // But override that restriction when optimizing for size.
6834 // TODO: Check if splatting is recommended for other AVX-capable CPUs.
6835 if (ConstSplatVal && (Subtarget.hasAVX2() || OptForSize)) {
6836 EVT CVT = Ld.getValueType();
6837 assert(!CVT.isVector() && "Must not broadcast a vector type");
6839 // Splat f32, i32, v4f64, v4i64 in all cases with AVX2.
6840 // For size optimization, also splat v2f64 and v2i64, and for size opt
6841 // with AVX2, also splat i8 and i16.
6842 // With pattern matching, the VBROADCAST node may become a VMOVDDUP.
6843 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6844 (OptForSize && (ScalarSize == 64 || Subtarget.hasAVX2()))) {
6845 const Constant *C = nullptr;
6846 if (ConstantSDNode *CI = dyn_cast<ConstantSDNode>(Ld))
6847 C = CI->getConstantIntValue();
6848 else if (ConstantFPSDNode *CF = dyn_cast<ConstantFPSDNode>(Ld))
6849 C = CF->getConstantFPValue();
6851 assert(C && "Invalid constant type");
6853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6855 DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
6856 unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
6858 CVT, dl, DAG.getEntryNode(), CP,
6859 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
6862 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6866 bool IsLoad = ISD::isNormalLoad(Ld.getNode());
6868 // Handle AVX2 in-register broadcasts.
6869 if (!IsLoad && Subtarget.hasInt256() &&
6870 (ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
6871 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6873 // The scalar source must be a normal load.
6877 if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
6878 (Subtarget.hasVLX() && ScalarSize == 64))
6879 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6881 // The integer check is needed for the 64-bit into 128-bit case, so that it
6882 // doesn't match double, since there is no vbroadcastsd xmm.
6883 if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
6884 if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
6885 return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
6888 // Unsupported broadcast.
6892 /// \brief For an EXTRACT_VECTOR_ELT with a constant index return the real
6893 /// underlying vector and index.
6895 /// Modifies \p ExtractedFromVec to the real vector and returns the real index.
6897 static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
6899 int Idx = cast<ConstantSDNode>(ExtIdx)->getZExtValue();
6900 if (!isa<ShuffleVectorSDNode>(ExtractedFromVec))
6903 // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already lowered
6905 // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
6907 // to (extract_vector_elt (vector_shuffle<2,u,u,u>
6908 //       (extract_subvector (v8f32 %vreg0), Constant<4>), undef), Constant<0>).
6911 // In this case the vector is the extract_subvector expression and the index
6912 // is 2, as specified by the shuffle.
6913 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(ExtractedFromVec);
6914 SDValue ShuffleVec = SVOp->getOperand(0);
6915 MVT ShuffleVecVT = ShuffleVec.getSimpleValueType();
6916 assert(ShuffleVecVT.getVectorElementType() ==
6917 ExtractedFromVec.getSimpleValueType().getVectorElementType());
6919 int ShuffleIdx = SVOp->getMaskElt(Idx);
6920 if (isUndefOrInRange(ShuffleIdx, 0, ShuffleVecVT.getVectorNumElements())) {
6921 ExtractedFromVec = ShuffleVec;
6927 static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
6928 MVT VT = Op.getSimpleValueType();
6930 // Skip if insert_vec_elt is not supported.
6931 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6932 if (!TLI.isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT))
6936 unsigned NumElems = Op.getNumOperands();
6940 SmallVector<unsigned, 4> InsertIndices;
6941 SmallVector<int, 8> Mask(NumElems, -1);
6943 for (unsigned i = 0; i != NumElems; ++i) {
6944 unsigned Opc = Op.getOperand(i).getOpcode();
6946 if (Opc == ISD::UNDEF)
6949 if (Opc != ISD::EXTRACT_VECTOR_ELT) {
6950 // Quit if more than 1 element needs inserting.
6951 if (InsertIndices.size() > 1)
6954 InsertIndices.push_back(i);
6958 SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
6959 SDValue ExtIdx = Op.getOperand(i).getOperand(1);
6961 // Quit if non-constant index.
6962 if (!isa<ConstantSDNode>(ExtIdx))
6964 int Idx = getUnderlyingExtractedFromVec(ExtractedFromVec, ExtIdx);
6966 // Quit if extracted from vector of different type.
6967 if (ExtractedFromVec.getValueType() != VT)
6970 if (!VecIn1.getNode())
6971 VecIn1 = ExtractedFromVec;
6972 else if (VecIn1 != ExtractedFromVec) {
6973 if (!VecIn2.getNode())
6974 VecIn2 = ExtractedFromVec;
6975 else if (VecIn2 != ExtractedFromVec)
6976 // Quit if more than 2 vectors to shuffle
6980 if (ExtractedFromVec == VecIn1)
6982 else if (ExtractedFromVec == VecIn2)
6983 Mask[i] = Idx + NumElems;
6986 if (!VecIn1.getNode())
6989 VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
6990 SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
6992 for (unsigned Idx : InsertIndices)
6993 NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
6994 DAG.getIntPtrConstant(Idx, DL));
6999 static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
7000 assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
7001 Op.getScalarValueSizeInBits() == 1 &&
7002 "Can not convert non-constant vector");
7003 uint64_t Immediate = 0;
7004 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7005 SDValue In = Op.getOperand(idx);
7007 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7010 MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
7011 return DAG.getConstant(Immediate, dl, VT);
7013 // Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
7015 X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
7017 MVT VT = Op.getSimpleValueType();
7018 assert((VT.getVectorElementType() == MVT::i1) &&
7019 "Unexpected type in LowerBUILD_VECTORvXi1!");
7022 if (ISD::isBuildVectorAllZeros(Op.getNode()))
7023 return DAG.getTargetConstant(0, dl, VT);
7025 if (ISD::isBuildVectorAllOnes(Op.getNode()))
7026 return DAG.getTargetConstant(1, dl, VT);
7028 if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
7029 if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
7030 // Split the pieces.
7032 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(0, 32));
7034 DAG.getBuildVector(MVT::v32i1, dl, Op.getNode()->ops().slice(32, 32));
7035 // We have to manually lower both halves so getNode doesn't try to
7036 // reassemble the build_vector.
7037 Lower = LowerBUILD_VECTORvXi1(Lower, DAG);
7038 Upper = LowerBUILD_VECTORvXi1(Upper, DAG);
7039 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lower, Upper);
7041 SDValue Imm = ConvertI1VectorToInteger(Op, DAG);
7042 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7043 return DAG.getBitcast(VT, Imm);
7044 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7045 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7046 DAG.getIntPtrConstant(0, dl));
7049 // Vector has one or more non-const elements
7050 uint64_t Immediate = 0;
7051 SmallVector<unsigned, 16> NonConstIdx;
7052 bool IsSplat = true;
7053 bool HasConstElts = false;
7055 for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
7056 SDValue In = Op.getOperand(idx);
7059 if (!isa<ConstantSDNode>(In))
7060 NonConstIdx.push_back(idx);
7062 Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
7063 HasConstElts = true;
7067 else if (In != Op.getOperand(SplatIdx))
7071 // For a splat, use "(select i1 splat_elt, all-ones, all-zeroes)".
7073 return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx),
7074 DAG.getConstant(1, dl, VT),
7075 DAG.getConstant(0, dl, VT));
7077 // insert elements one by one
7081 MVT ImmVT = MVT::getIntegerVT(std::max((int)VT.getSizeInBits(), 8));
7082 Imm = DAG.getConstant(Immediate, dl, ImmVT);
7084 else if (HasConstElts)
7085 Imm = DAG.getConstant(0, dl, VT);
7087 Imm = DAG.getUNDEF(VT);
7088 if (Imm.getValueSizeInBits() == VT.getSizeInBits())
7089 DstVec = DAG.getBitcast(VT, Imm);
7091 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, Imm);
7092 DstVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
7093 DAG.getIntPtrConstant(0, dl));
7096 for (unsigned i = 0, e = NonConstIdx.size(); i != e; ++i) {
7097 unsigned InsertIdx = NonConstIdx[i];
7098 DstVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DstVec,
7099 Op.getOperand(InsertIdx),
7100 DAG.getIntPtrConstant(InsertIdx, dl));
7105 /// \brief Return true if \p N implements a horizontal binop and return the
7106 /// operands for the horizontal binop into V0 and V1.
7108 /// This is a helper function of LowerToHorizontalOp().
7109 /// This function checks that the build_vector \p N in input implements a
7110 /// horizontal operation. Parameter \p Opcode defines the kind of horizontal
7111 /// operation to match.
7112 /// For example, if \p Opcode is equal to ISD::ADD, then this function
7113 /// checks if \p N implements a horizontal arithmetic add; if instead \p Opcode
7114 /// is equal to ISD::SUB, then this function checks if this is a horizontal arithmetic sub.
7117 /// This function only analyzes elements of \p N whose indices are
7118 /// in range [BaseIdx, LastIdx).
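/// For example (illustrative): a v4f32 build_vector whose operands are
///   (fadd (extractelt A, 0), (extractelt A, 1)),
///   (fadd (extractelt A, 2), (extractelt A, 3)),
///   (fadd (extractelt B, 0), (extractelt B, 1)),
///   (fadd (extractelt B, 2), (extractelt B, 3))
/// matches with Opcode == ISD::FADD, V0 == A and V1 == B, which corresponds
/// to a single X86ISD::FHADD.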
7119 static bool isHorizontalBinOp(const BuildVectorSDNode *N, unsigned Opcode,
7121 unsigned BaseIdx, unsigned LastIdx,
7122 SDValue &V0, SDValue &V1) {
7123 EVT VT = N->getValueType(0);
7125 assert(BaseIdx * 2 <= LastIdx && "Invalid Indices in input!");
7126 assert(VT.isVector() && VT.getVectorNumElements() >= LastIdx &&
7127 "Invalid Vector in input!");
7129 bool IsCommutable = (Opcode == ISD::ADD || Opcode == ISD::FADD);
7130 bool CanFold = true;
7131 unsigned ExpectedVExtractIdx = BaseIdx;
7132 unsigned NumElts = LastIdx - BaseIdx;
7133 V0 = DAG.getUNDEF(VT);
7134 V1 = DAG.getUNDEF(VT);
7136 // Check if N implements a horizontal binop.
7137 for (unsigned i = 0, e = NumElts; i != e && CanFold; ++i) {
7138 SDValue Op = N->getOperand(i + BaseIdx);
7141 if (Op->isUndef()) {
7142 // Update the expected vector extract index.
7143 if (i * 2 == NumElts)
7144 ExpectedVExtractIdx = BaseIdx;
7145 ExpectedVExtractIdx += 2;
7149 CanFold = Op->getOpcode() == Opcode && Op->hasOneUse();
7154 SDValue Op0 = Op.getOperand(0);
7155 SDValue Op1 = Op.getOperand(1);
7157 // Try to match the following pattern:
7158 // (BINOP (extract_vector_elt A, I), (extract_vector_elt A, I+1))
7159 CanFold = (Op0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7160 Op1.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7161 Op0.getOperand(0) == Op1.getOperand(0) &&
7162 isa<ConstantSDNode>(Op0.getOperand(1)) &&
7163 isa<ConstantSDNode>(Op1.getOperand(1)));
7167 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7168 unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
7170 if (i * 2 < NumElts) {
7172 V0 = Op0.getOperand(0);
7173 if (V0.getValueType() != VT)
7178 V1 = Op0.getOperand(0);
7179 if (V1.getValueType() != VT)
7182 if (i * 2 == NumElts)
7183 ExpectedVExtractIdx = BaseIdx;
7186 SDValue Expected = (i * 2 < NumElts) ? V0 : V1;
7187 if (I0 == ExpectedVExtractIdx)
7188 CanFold = I1 == I0 + 1 && Op0.getOperand(0) == Expected;
7189 else if (IsCommutable && I1 == ExpectedVExtractIdx) {
7190 // Try to match the following dag sequence:
7191 // (BINOP (extract_vector_elt A, I+1), (extract_vector_elt A, I))
7192 CanFold = I0 == I1 + 1 && Op1.getOperand(0) == Expected;
7196 ExpectedVExtractIdx += 2;
7202 /// \brief Emit a sequence of two 128-bit horizontal add/sub followed by
7203 /// a concat_vector.
7205 /// This is a helper function of LowerToHorizontalOp().
7206 /// This function expects two 256-bit vectors called V0 and V1.
7207 /// At first, each vector is split into two separate 128-bit vectors.
7208 /// Then, the resulting 128-bit vectors are used to implement two
7209 /// horizontal binary operations.
7211 /// The kind of horizontal binary operation is defined by \p X86Opcode.
7213 /// \p Mode specifies how the 128-bit parts of V0 and V1 are passed in input to
7214 /// the two new horizontal binop.
7215 /// When Mode is set, the first horizontal binop dag node would take as input
7216 /// the lower 128-bit of V0 and the upper 128-bit of V0. The second
7217 /// horizontal binop dag node would take as input the lower 128-bit of V1
7218 /// and the upper 128-bit of V1.
7220 /// HADD V0_LO, V0_HI
7221 /// HADD V1_LO, V1_HI
7223 /// Otherwise, the first horizontal binop dag node takes as input the lower
7224 /// 128-bit of V0 and the lower 128-bit of V1, and the second horizontal binop
7225 /// dag node takes the upper 128-bit of V0 and the upper 128-bit of V1.
7227 /// HADD V0_LO, V1_LO
7228 /// HADD V0_HI, V1_HI
7230 /// If \p isUndefLO is set, then the algorithm propagates UNDEF to the lower
7231 /// 128-bits of the result. If \p isUndefHI is set, then UNDEF is propagated to
7232 /// the upper 128-bits of the result.
7233 static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
7234 const SDLoc &DL, SelectionDAG &DAG,
7235 unsigned X86Opcode, bool Mode,
7236 bool isUndefLO, bool isUndefHI) {
7237 MVT VT = V0.getSimpleValueType();
7238 assert(VT.is256BitVector() && VT == V1.getSimpleValueType() &&
7239 "Invalid nodes in input!");
7241 unsigned NumElts = VT.getVectorNumElements();
7242 SDValue V0_LO = extract128BitVector(V0, 0, DAG, DL);
7243 SDValue V0_HI = extract128BitVector(V0, NumElts/2, DAG, DL);
7244 SDValue V1_LO = extract128BitVector(V1, 0, DAG, DL);
7245 SDValue V1_HI = extract128BitVector(V1, NumElts/2, DAG, DL);
7246 MVT NewVT = V0_LO.getSimpleValueType();
7248 SDValue LO = DAG.getUNDEF(NewVT);
7249 SDValue HI = DAG.getUNDEF(NewVT);
7251 if (Mode) {
7252 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7253 if (!isUndefLO && !V0->isUndef())
7254 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V0_HI);
7255 if (!isUndefHI && !V1->isUndef())
7256 HI = DAG.getNode(X86Opcode, DL, NewVT, V1_LO, V1_HI);
7257 } else {
7258 // Don't emit a horizontal binop if the result is expected to be UNDEF.
7259 if (!isUndefLO && (!V0_LO->isUndef() || !V1_LO->isUndef()))
7260 LO = DAG.getNode(X86Opcode, DL, NewVT, V0_LO, V1_LO);
7262 if (!isUndefHI && (!V0_HI->isUndef() || !V1_HI->isUndef()))
7263 HI = DAG.getNode(X86Opcode, DL, NewVT, V0_HI, V1_HI);
7264 }
7266 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LO, HI);
7269 /// Returns true iff \p BV builds a vector with the result equivalent to
7270 /// the result of ADDSUB operation.
7271 /// If true is returned then the operands of ADDSUB = Opnd0 +- Opnd1 operation
7272 /// are written to the parameters \p Opnd0 and \p Opnd1.
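/// For example (illustrative): the v4f32 build_vector
///   <(fsub A0, B0), (fadd A1, B1), (fsub A2, B2), (fadd A3, B3)>
/// where Ai/Bi are extract_vector_elt of A and B at index i, matches with
/// Opnd0 == A and Opnd1 == B, i.e. the ADDSUB pattern of subtract in the even
/// lanes and add in the odd lanes.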
7273 static bool isAddSub(const BuildVectorSDNode *BV,
7274 const X86Subtarget &Subtarget, SelectionDAG &DAG,
7275 SDValue &Opnd0, SDValue &Opnd1) {
7277 MVT VT = BV->getSimpleValueType(0);
7278 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
7279 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
7280 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
7283 unsigned NumElts = VT.getVectorNumElements();
7284 SDValue InVec0 = DAG.getUNDEF(VT);
7285 SDValue InVec1 = DAG.getUNDEF(VT);
7287 // Odd-numbered elements in the input build vector are obtained from
7288 // adding two integer/float elements.
7289 // Even-numbered elements in the input build vector are obtained from
7290 // subtracting two integer/float elements.
7291 unsigned ExpectedOpcode = ISD::FSUB;
7292 unsigned NextExpectedOpcode = ISD::FADD;
7293 bool AddFound = false;
7294 bool SubFound = false;
7296 for (unsigned i = 0, e = NumElts; i != e; ++i) {
7297 SDValue Op = BV->getOperand(i);
7299 // Skip 'undef' values.
7300 unsigned Opcode = Op.getOpcode();
7301 if (Opcode == ISD::UNDEF) {
7302 std::swap(ExpectedOpcode, NextExpectedOpcode);
7306 // Early exit if we found an unexpected opcode.
7307 if (Opcode != ExpectedOpcode)
7310 SDValue Op0 = Op.getOperand(0);
7311 SDValue Op1 = Op.getOperand(1);
7313 // Try to match the following pattern:
7314 // (BINOP (extract_vector_elt A, i), (extract_vector_elt B, i))
7315 // Early exit if we cannot match that sequence.
7316 if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7317 Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
7318 !isa<ConstantSDNode>(Op0.getOperand(1)) ||
7319 !isa<ConstantSDNode>(Op1.getOperand(1)) ||
7320 Op0.getOperand(1) != Op1.getOperand(1))
7323 unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
7327 // We found a valid add/sub node. Update the information accordingly.
7333 // Update InVec0 and InVec1.
7334 if (InVec0.isUndef()) {
7335 InVec0 = Op0.getOperand(0);
7336 if (InVec0.getSimpleValueType() != VT)
7339 if (InVec1.isUndef()) {
7340 InVec1 = Op1.getOperand(0);
7341 if (InVec1.getSimpleValueType() != VT)
7345 // Make sure that the operands of each add/sub node always
7346 // come from the same pair of vectors.
7347 if (InVec0 != Op0.getOperand(0)) {
7348 if (ExpectedOpcode == ISD::FSUB)
7351 // FADD is commutable. Try to commute the operands
7352 // and then test again.
7353 std::swap(Op0, Op1);
7354 if (InVec0 != Op0.getOperand(0))
7358 if (InVec1 != Op1.getOperand(0))
7361 // Update the pair of expected opcodes.
7362 std::swap(ExpectedOpcode, NextExpectedOpcode);
7365 // Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
7366 if (!AddFound || !SubFound || InVec0.isUndef() || InVec1.isUndef())
7374 /// Returns true if it is possible to fold MUL and an idiom that has already been
7375 /// recognized as ADDSUB(\p Opnd0, \p Opnd1) into FMADDSUB(x, y, \p Opnd1).
7376 /// If (and only if) true is returned, the operands of FMADDSUB are written to
7377 /// parameters \p Opnd0, \p Opnd1, \p Opnd2.
7379 /// Prior to calling this function it should be known that there is some
7380 /// SDNode that potentially can be replaced with an X86ISD::ADDSUB operation
7381 /// using \p Opnd0 and \p Opnd1 as operands. Also, this method is called
7382 /// before replacement of such SDNode with ADDSUB operation. Thus the number
7383 /// of \p Opnd0 uses is expected to be equal to 2.
7384 /// For example, this function may be called for the following IR:
7385 /// %AB = fmul fast <2 x double> %A, %B
7386 /// %Sub = fsub fast <2 x double> %AB, %C
7387 /// %Add = fadd fast <2 x double> %AB, %C
7388 /// %Addsub = shufflevector <2 x double> %Sub, <2 x double> %Add,
7389 /// <2 x i32> <i32 0, i32 3>
7390 /// There is a def for %Addsub here, which potentially can be replaced by
7391 /// X86ISD::ADDSUB operation:
7392 /// %Addsub = X86ISD::ADDSUB %AB, %C
7393 /// and such ADDSUB can further be replaced with FMADDSUB:
7394 /// %Addsub = FMADDSUB %A, %B, %C.
7396 /// The main reason why this method is called before the replacement of the
7397 /// recognized ADDSUB idiom with ADDSUB operation is that such replacement
7398 /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit FMADDSUB is.
7400 static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
7401 SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
7402 if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
7403 !Subtarget.hasAnyFMA())
7406 // FIXME: These checks must match the similar ones in
7407 // DAGCombiner::visitFADDForFMACombine. It would be good to have one
7408 // function that would answer if it is Ok to fuse MUL + ADD to FMADD
7409 // or MUL + ADDSUB to FMADDSUB.
7410 const TargetOptions &Options = DAG.getTarget().Options;
7412 (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
7417 Opnd1 = Opnd0.getOperand(1);
7418 Opnd0 = Opnd0.getOperand(0);
7423 /// Try to fold a build_vector that performs an 'addsub' or 'fmaddsub' operation
7424 /// into an X86ISD::ADDSUB or X86ISD::FMADDSUB node accordingly.
7425 static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
7426 const X86Subtarget &Subtarget,
7427 SelectionDAG &DAG) {
7428 SDValue Opnd0, Opnd1;
7429 if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
7432 MVT VT = BV->getSimpleValueType(0);
7435 // Try to generate X86ISD::FMADDSUB node here.
7437 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
7438 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
7440 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
7441 // the ADDSUB idiom has been successfully recognized. There are no known
7442 // X86 targets with 512-bit ADDSUB instructions!
7443 // 512-bit ADDSUB idiom recognition was needed only as part of FMADDSUB idiom recognition.
7445 if (VT.is512BitVector())
7448 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
7451 /// Lower BUILD_VECTOR to a horizontal add/sub operation if possible.
7452 static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
7453 const X86Subtarget &Subtarget,
7454 SelectionDAG &DAG) {
7455 MVT VT = BV->getSimpleValueType(0);
7456 unsigned NumElts = VT.getVectorNumElements();
7457 unsigned NumUndefsLO = 0;
7458 unsigned NumUndefsHI = 0;
7459 unsigned Half = NumElts/2;
7461 // Count the number of UNDEF operands in the input build_vector.
7462 for (unsigned i = 0, e = Half; i != e; ++i)
7463 if (BV->getOperand(i)->isUndef())
7466 for (unsigned i = Half, e = NumElts; i != e; ++i)
7467 if (BV->getOperand(i)->isUndef())
7470 // Early exit if this is either a build_vector of all UNDEFs or all the
7471 // operands but one are UNDEF.
7472 if (NumUndefsLO + NumUndefsHI + 1 >= NumElts)
7476 SDValue InVec0, InVec1;
7477 if ((VT == MVT::v4f32 || VT == MVT::v2f64) && Subtarget.hasSSE3()) {
7478 // Try to match an SSE3 float HADD/HSUB.
7479 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7480 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7482 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7483 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7484 } else if ((VT == MVT::v4i32 || VT == MVT::v8i16) && Subtarget.hasSSSE3()) {
7485 // Try to match an SSSE3 integer HADD/HSUB.
7486 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7487 return DAG.getNode(X86ISD::HADD, DL, VT, InVec0, InVec1);
7489 if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7490 return DAG.getNode(X86ISD::HSUB, DL, VT, InVec0, InVec1);
7493 if (!Subtarget.hasAVX())
7496 if ((VT == MVT::v8f32 || VT == MVT::v4f64)) {
7497 // Try to match an AVX horizontal add/sub of packed single/double
7498 // precision floating point values from 256-bit vectors.
7499 SDValue InVec2, InVec3;
7500 if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, Half, InVec0, InVec1) &&
7501 isHorizontalBinOp(BV, ISD::FADD, DAG, Half, NumElts, InVec2, InVec3) &&
7502 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7503 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7504 return DAG.getNode(X86ISD::FHADD, DL, VT, InVec0, InVec1);
7506 if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, Half, InVec0, InVec1) &&
7507 isHorizontalBinOp(BV, ISD::FSUB, DAG, Half, NumElts, InVec2, InVec3) &&
7508 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7509 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7510 return DAG.getNode(X86ISD::FHSUB, DL, VT, InVec0, InVec1);
7511 } else if (VT == MVT::v8i32 || VT == MVT::v16i16) {
7512 // Try to match an AVX2 horizontal add/sub of signed integers.
7513 SDValue InVec2, InVec3;
7515 bool CanFold = true;
7517 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, Half, InVec0, InVec1) &&
7518 isHorizontalBinOp(BV, ISD::ADD, DAG, Half, NumElts, InVec2, InVec3) &&
7519 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7520 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7521 X86Opcode = X86ISD::HADD;
7522 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, Half, InVec0, InVec1) &&
7523 isHorizontalBinOp(BV, ISD::SUB, DAG, Half, NumElts, InVec2, InVec3) &&
7524 ((InVec0.isUndef() || InVec2.isUndef()) || InVec0 == InVec2) &&
7525 ((InVec1.isUndef() || InVec3.isUndef()) || InVec1 == InVec3))
7526 X86Opcode = X86ISD::HSUB;
7531 // Fold this build_vector into a single horizontal add/sub.
7532 // Do this only if the target has AVX2.
7533 if (Subtarget.hasAVX2())
7534 return DAG.getNode(X86Opcode, DL, VT, InVec0, InVec1);
7536 // Do not try to expand this build_vector into a pair of horizontal
7537 // add/sub if we can emit a pair of scalar add/sub.
7538 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7541 // Convert this build_vector into a pair of horizontal binops followed by a concat vector.
7543 bool isUndefLO = NumUndefsLO == Half;
7544 bool isUndefHI = NumUndefsHI == Half;
7545 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, false,
7546 isUndefLO, isUndefHI);
7550 if ((VT == MVT::v8f32 || VT == MVT::v4f64 || VT == MVT::v8i32 ||
7551 VT == MVT::v16i16) && Subtarget.hasAVX()) {
7553 if (isHorizontalBinOp(BV, ISD::ADD, DAG, 0, NumElts, InVec0, InVec1))
7554 X86Opcode = X86ISD::HADD;
7555 else if (isHorizontalBinOp(BV, ISD::SUB, DAG, 0, NumElts, InVec0, InVec1))
7556 X86Opcode = X86ISD::HSUB;
7557 else if (isHorizontalBinOp(BV, ISD::FADD, DAG, 0, NumElts, InVec0, InVec1))
7558 X86Opcode = X86ISD::FHADD;
7559 else if (isHorizontalBinOp(BV, ISD::FSUB, DAG, 0, NumElts, InVec0, InVec1))
7560 X86Opcode = X86ISD::FHSUB;
7564 // Don't try to expand this build_vector into a pair of horizontal add/sub
7565 // if we can simply emit a pair of scalar add/sub.
7566 if (NumUndefsLO + 1 == Half || NumUndefsHI + 1 == Half)
7569 // Convert this build_vector into two horizontal add/sub followed by a concat vector.
7571 bool isUndefLO = NumUndefsLO == Half;
7572 bool isUndefHI = NumUndefsHI == Half;
7573 return ExpandHorizontalBinOp(InVec0, InVec1, DL, DAG, X86Opcode, true,
7574 isUndefLO, isUndefHI);
7580 /// If a BUILD_VECTOR's source elements all apply the same bit operation and
7581 /// one of their operands is constant, lower to a pair of BUILD_VECTOR and
7582 /// just apply the bit to the vectors.
7583 /// NOTE: It's not in our interest to start making a general purpose vectorizer
7584 /// from this, but enough scalar bit operations are created from the later
7585 /// legalization + scalarization stages to need basic support.
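/// For example (illustrative, with hypothetical scalars x0..x3): the v4i32
/// build_vector <(and x0, 1), (and x1, 2), (and x2, 4), (and x3, 8)> is
/// rebuilt as (and (build_vector x0, x1, x2, x3), (build_vector 1, 2, 4, 8)),
/// so the constant operands fold into a single vector constant.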
7586 static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
7587 SelectionDAG &DAG) {
7589 MVT VT = Op->getSimpleValueType(0);
7590 unsigned NumElems = VT.getVectorNumElements();
7591 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
7593 // Check that all elements have the same opcode.
7594 // TODO: Should we allow UNDEFS and if so how many?
7595 unsigned Opcode = Op->getOperand(0).getOpcode();
7596 for (unsigned i = 1; i < NumElems; ++i)
7597 if (Opcode != Op->getOperand(i).getOpcode())
7600 // TODO: We may be able to add support for other Ops (ADD/SUB + shifts).
7607 if (!TLI.isOperationLegalOrPromote(Opcode, VT))
7612 SmallVector<SDValue, 4> LHSElts, RHSElts;
7613 for (SDValue Elt : Op->ops()) {
7614 SDValue LHS = Elt.getOperand(0);
7615 SDValue RHS = Elt.getOperand(1);
7617 // We expect the canonicalized RHS operand to be the constant.
7618 if (!isa<ConstantSDNode>(RHS))
7620 LHSElts.push_back(LHS);
7621 RHSElts.push_back(RHS);
7624 SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
7625 SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
7626 return DAG.getNode(Opcode, DL, VT, LHS, RHS);
7629 /// Create a vector constant without a load. SSE/AVX provide the bare minimum
7630 /// functionality to do this, so it's all zeros, all ones, or some derivation
7631 /// that is cheap to calculate.
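/// For example (illustrative): an all-zeros v4f32 is matched by xorps/pxor of
/// a register with itself, and an all-ones v4i32 by pcmpeqd of a register
/// with itself, so neither needs a constant-pool load.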
7632 static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
7633 const X86Subtarget &Subtarget) {
7635 MVT VT = Op.getSimpleValueType();
7637 // Vectors containing all zeros can be matched by pxor and xorps.
7638 if (ISD::isBuildVectorAllZeros(Op.getNode())) {
7639 // Canonicalize this to <4 x i32> to 1) ensure the zero vectors are CSE'd
7640 // and 2) ensure that i64 scalars are eliminated on x86-32 hosts.
7641 if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32)
7644 return getZeroVector(VT, Subtarget, DAG, DL);
7647 // Vectors containing all ones can be matched by pcmpeqd on 128-bit width
7648 // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use
7649 // vpcmpeqd on 256-bit vectors.
7650 if (Subtarget.hasSSE2() && ISD::isBuildVectorAllOnes(Op.getNode())) {
7651 if (VT == MVT::v4i32 || VT == MVT::v16i32 ||
7652 (VT == MVT::v8i32 && Subtarget.hasInt256()))
7655 return getOnesVector(VT, DAG, DL);
7662 X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
7665 MVT VT = Op.getSimpleValueType();
7666 MVT ExtVT = VT.getVectorElementType();
7667 unsigned NumElems = Op.getNumOperands();
7669 // Generate vectors for predicate vectors.
7670 if (VT.getVectorElementType() == MVT::i1 && Subtarget.hasAVX512())
7671 return LowerBUILD_VECTORvXi1(Op, DAG);
7673 if (SDValue VectorConstant = materializeVectorConstant(Op, DAG, Subtarget))
7674 return VectorConstant;
7676 BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
7677 if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
7679 if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
7680 return HorizontalOp;
7681 if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
7683 if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
7686 unsigned EVTBits = ExtVT.getSizeInBits();
7688 unsigned NumZero = 0;
7689 unsigned NumNonZero = 0;
7690 uint64_t NonZeros = 0;
7691 bool IsAllConstants = true;
7692 SmallSet<SDValue, 8> Values;
7693 for (unsigned i = 0; i < NumElems; ++i) {
7694 SDValue Elt = Op.getOperand(i);
7698 if (Elt.getOpcode() != ISD::Constant &&
7699 Elt.getOpcode() != ISD::ConstantFP)
7700 IsAllConstants = false;
7701 if (X86::isZeroNode(Elt))
7704 assert(i < sizeof(NonZeros) * 8); // Make sure the shift is within range.
7705 NonZeros |= ((uint64_t)1 << i);
7710 // All undef vector. Return an UNDEF. All zero vectors were handled above.
7711 if (NumNonZero == 0)
7712 return DAG.getUNDEF(VT);
7714 // Special case for single non-zero, non-undef, element.
7715 if (NumNonZero == 1) {
7716 unsigned Idx = countTrailingZeros(NonZeros);
7717 SDValue Item = Op.getOperand(Idx);
7719 // If this is an insertion of an i64 value on x86-32, and if the top bits of
7720 // the value are obviously zero, truncate the value to i32 and do the
7721 // insertion that way. Only do this if the value is non-constant or if the
7722 // value is a constant being inserted into element 0. It is cheaper to do
7723 // a constant pool load than it is to do a movd + shuffle.
7724 if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
7725 (!IsAllConstants || Idx == 0)) {
7726 if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
7728 assert(VT == MVT::v2i64 && "Expected an SSE value type!");
7729 MVT VecVT = MVT::v4i32;
7731 // Truncate the value (which may itself be a constant) to i32, and
7732 // convert it to a vector with movd (S2V+shuffle to zero extend).
7733 Item = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Item);
7734 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Item);
7735 return DAG.getBitcast(VT, getShuffleVectorZeroOrUndef(
7736 Item, Idx * 2, true, Subtarget, DAG));
7740 // If we have a constant or non-constant insertion into the low element of
7741 // a vector, we can do this with SCALAR_TO_VECTOR + shuffle of zero into
7742 // the rest of the elements. This will be matched as movd/movq/movss/movsd
7743 // depending on what the source datatype is.
7746 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7748 if (ExtVT == MVT::i32 || ExtVT == MVT::f32 || ExtVT == MVT::f64 ||
7749 (ExtVT == MVT::i64 && Subtarget.is64Bit())) {
7750 assert((VT.is128BitVector() || VT.is256BitVector() ||
7751 VT.is512BitVector()) &&
7752 "Expected an SSE value type!");
7753 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7754 // Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
7755 return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
      // We can't directly insert an i8 or i16 into a vector, so zero extend
      // it to i32 first.
7760 if (ExtVT == MVT::i16 || ExtVT == MVT::i8) {
7761 Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
7762 if (VT.getSizeInBits() >= 256) {
7763 MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
7764 if (Subtarget.hasAVX()) {
7765 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, Item);
            Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
          } else {
            // Without AVX, we need to extend to a 128-bit vector and then
7769 // insert into the 256-bit vector.
7770 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7771 SDValue ZeroVec = getZeroVector(ShufVT, Subtarget, DAG, dl);
            Item = insert128BitVector(ZeroVec, Item, 0, DAG, dl);
          }
        } else {
          assert(VT.is128BitVector() && "Expected an SSE value type!");
7776 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
7777 Item = getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
7779 return DAG.getBitcast(VT, Item);
7783 // Is it a vector logical left shift?
7784 if (NumElems == 2 && Idx == 1 &&
7785 X86::isZeroNode(Op.getOperand(0)) &&
7786 !X86::isZeroNode(Op.getOperand(1))) {
7787 unsigned NumBits = VT.getSizeInBits();
7788 return getVShift(true, VT,
7789 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
7790 VT, Op.getOperand(1)),
7791 NumBits/2, DAG, *this, dl);
      if (IsAllConstants) // Otherwise, it's better to do a constant pool load.
        return SDValue();
7797 // Otherwise, if this is a vector with i32 or f32 elements, and the element
7798 // is a non-constant being inserted into an element other than the low one,
7799 // we can't use a constant pool load. Instead, use SCALAR_TO_VECTOR (aka
    // movd/movss) to move this into the low element, then shuffle it into
    // place.
7802 if (EVTBits == 32) {
7803 Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
7804 return getShuffleVectorZeroOrUndef(Item, Idx, NumZero > 0, Subtarget, DAG);
7808 // Splat is obviously ok. Let legalizer expand it to a shuffle.
7809 if (Values.size() == 1) {
7810 if (EVTBits == 32) {
      // Instead of a shuffle like this:
      //   shuffle (scalar_to_vector (load (ptr + 4))), undef, <0, 0, 0, 0>
      // check whether it is possible to issue this instead:
      //   shuffle (vload ptr), undef, <1, 1, 1, 1>
7815 unsigned Idx = countTrailingZeros(NonZeros);
7816 SDValue Item = Op.getOperand(Idx);
7817 if (Op.getNode()->isOnlyUserOf(Item.getNode()))
7818 return LowerAsSplatVectorLoad(Item, VT, dl, DAG);
  // A vector full of immediates; various special cases are already
  // handled, so this is best done with a single constant-pool load.
  if (IsAllConstants)
    return SDValue();
7828 // See if we can use a vector load to get all of the elements.
7829 if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
7830 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
    if (SDValue LD =
            EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
      return LD;
7836 // For AVX-length vectors, build the individual 128-bit pieces and use
7837 // shuffles to put them in place.
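  // For example (illustrative, added for exposition): a v8i32 build_vector is
  // split into two v4i32 build_vectors that are then recombined with a 128-bit
  // subvector insert (vinsertf128 on AVX).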
7838 if (VT.is256BitVector() || VT.is512BitVector()) {
7839 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
7841 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
7843 // Build both the lower and upper subvector.
    SDValue Lower =
        DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
7846 SDValue Upper = DAG.getBuildVector(
7847 HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
7849 // Recreate the wider vector with the lower and upper part.
7850 if (VT.is256BitVector())
7851 return concat128BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7852 return concat256BitVectors(Lower, Upper, VT, NumElems, DAG, dl);
7855 // Let legalizer expand 2-wide build_vectors.
7856 if (EVTBits == 64) {
7857 if (NumNonZero == 1) {
7858 // One half is zero or undef.
7859 unsigned Idx = countTrailingZeros(NonZeros);
7860 SDValue V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT,
7861 Op.getOperand(Idx));
7862 return getShuffleVectorZeroOrUndef(V2, Idx, true, Subtarget, DAG);
7867 // If element VT is < 32 bits, convert it to inserts into a zero vector.
7868 if (EVTBits == 8 && NumElems == 16)
    if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;
7873 if (EVTBits == 16 && NumElems == 8)
    if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
                                          DAG, Subtarget))
      return V;
7878 // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
7879 if (EVTBits == 32 && NumElems == 4)
    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
      return V;
7883 // If element VT is == 32 bits, turn it into a number of shuffles.
7884 if (NumElems == 4 && NumZero > 0) {
7885 SmallVector<SDValue, 8> Ops(NumElems);
7886 for (unsigned i = 0; i < 4; ++i) {
      bool isZero = !(NonZeros & (1ULL << i));
      if (isZero)
        Ops[i] = getZeroVector(VT, Subtarget, DAG, dl);
      else
        Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
7894 for (unsigned i = 0; i < 2; ++i) {
      switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
      default: llvm_unreachable("Unexpected NonZero count");
      case 0: Ops[i] = Ops[i*2]; break; // Must be a zero vector.
      case 1: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2+1], Ops[i*2]); break;
      case 2: Ops[i] = getMOVL(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      case 3: Ops[i] = getUnpackl(DAG, dl, VT, Ops[i*2], Ops[i*2+1]); break;
      }
7912 bool Reverse1 = (NonZeros & 0x3) == 2;
7913 bool Reverse2 = ((NonZeros & (0x3 << 2)) >> 2) == 2;
    int MaskVec[] = {Reverse1 ? 1 : 0, Reverse1 ? 0 : 1,
                     static_cast<int>(Reverse2 ? NumElems+1 : NumElems),
                     static_cast<int>(Reverse2 ? NumElems : NumElems+1)};
7920 return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
7923 if (Values.size() > 1 && VT.is128BitVector()) {
    // Check for a build vector that is mostly a shuffle plus a few insertions.
    if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
      return Sh;
7928 // For SSE 4.1, use insertps to put the high elements into the low element.
7929 if (Subtarget.hasSSE41()) {
      SDValue Result;
      if (!Op.getOperand(0).isUndef())
        Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
      else
        Result = DAG.getUNDEF(VT);
7936 for (unsigned i = 1; i < NumElems; ++i) {
7937 if (Op.getOperand(i).isUndef()) continue;
7938 Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
                           Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
    }
    return Result;
  }
7944 // Otherwise, expand into a number of unpckl*, start by extending each of
7945 // our (non-undef) elements to the full vector width with the element in the
7946 // bottom slot of the vector (which generates no code for SSE).
7947 SmallVector<SDValue, 8> Ops(NumElems);
7948 for (unsigned i = 0; i < NumElems; ++i) {
7949 if (!Op.getOperand(i).isUndef())
      Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
    else
      Ops[i] = DAG.getUNDEF(VT);
7955 // Next, we iteratively mix elements, e.g. for v4f32:
7956 // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
7957 // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
7958 // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
7959 for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
7960 // Generate scaled UNPCKL shuffle mask.
7961 SmallVector<int, 16> Mask;
    for (unsigned i = 0; i != Scale; ++i)
      Mask.push_back(i);
7964 for (unsigned i = 0; i != Scale; ++i)
7965 Mask.push_back(NumElems+i);
7966 Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
7968 for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
      Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
  }

  return Ops[0];
}
7976 // 256-bit AVX can use the vinsertf128 instruction
7977 // to create 256-bit vectors from two other 128-bit ones.
7978 static SDValue LowerAVXCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
7982 assert((ResVT.is256BitVector() ||
7983 ResVT.is512BitVector()) && "Value type must be 256-/512-bit wide");
7985 SDValue V1 = Op.getOperand(0);
7986 SDValue V2 = Op.getOperand(1);
7987 unsigned NumElems = ResVT.getVectorNumElements();
7988 if (ResVT.is256BitVector())
7989 return concat128BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
7991 if (Op.getNumOperands() == 4) {
7992 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
7993 ResVT.getVectorNumElements()/2);
7994 SDValue V3 = Op.getOperand(2);
7995 SDValue V4 = Op.getOperand(3);
7996 return concat256BitVectors(
7997 concat128BitVectors(V1, V2, HalfVT, NumElems / 2, DAG, dl),
        concat128BitVectors(V3, V4, HalfVT, NumElems / 2, DAG, dl), ResVT,
        NumElems, DAG, dl);
  }
8001 return concat256BitVectors(V1, V2, ResVT, NumElems, DAG, dl);
8004 // Return true if all the operands of the given CONCAT_VECTORS node are zeros
8005 // except for the first one. (CONCAT_VECTORS Op, 0, 0,...,0)
8006 static bool isExpandWithZeros(const SDValue &Op) {
8007 assert(Op.getOpcode() == ISD::CONCAT_VECTORS &&
8008 "Expand with zeros only possible in CONCAT_VECTORS nodes!");
8010 for (unsigned i = 1; i < Op.getNumOperands(); i++)
    if (!ISD::isBuildVectorAllZeros(Op.getOperand(i).getNode()))
      return false;

  return true;
}
// If the given node is a type promotion (by concatenating i1 zeros) of the
// result of a node that already zeros all upper bits of the output register,
// returns that result node; otherwise returns an empty SDValue.
8020 static SDValue isTypePromotionOfi1ZeroUpBits(SDValue Op) {
8021 unsigned Opc = Op.getOpcode();
8023 assert(Opc == ISD::CONCAT_VECTORS &&
8024 Op.getSimpleValueType().getVectorElementType() == MVT::i1 &&
8025 "Unexpected node to check for type promotion!");
8027 // As long as we are concatenating zeros to the upper part of a previous node
  // result, climb up the tree until a node with a different opcode is
  // encountered.
8030 while (Opc == ISD::INSERT_SUBVECTOR || Opc == ISD::CONCAT_VECTORS) {
8031 if (Opc == ISD::INSERT_SUBVECTOR) {
8032 if (ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()) &&
8033 Op.getConstantOperandVal(2) == 0)
        Op = Op.getOperand(1);
      else
        return SDValue();
8037 } else { // Opc == ISD::CONCAT_VECTORS
8038 if (isExpandWithZeros(Op))
        Op = Op.getOperand(0);
      else
        return SDValue();
    }
8043 Opc = Op.getOpcode();
8046 // Check if the first inserted node zeroes the upper bits, or an 'and' result
8047 // of a node that zeros the upper bits (its masked version).
8048 if (isMaskedZeroUpperBitsvXi1(Op.getOpcode()) ||
8049 (Op.getOpcode() == ISD::AND &&
8050 (isMaskedZeroUpperBitsvXi1(Op.getOperand(0).getOpcode()) ||
        isMaskedZeroUpperBitsvXi1(Op.getOperand(1).getOpcode())))) {
    return Op;
  }

  return SDValue();
}
8058 static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
8059 const X86Subtarget &Subtarget,
8060 SelectionDAG & DAG) {
  SDLoc dl(Op);
  MVT ResVT = Op.getSimpleValueType();
8063 unsigned NumOfOperands = Op.getNumOperands();
8065 assert(isPowerOf2_32(NumOfOperands) &&
8066 "Unexpected number of operands in CONCAT_VECTORS");
  // If this node promotes - by concatenating zeroes - the type of the result
  // of a node with an instruction that zeroes all upper (irrelevant) bits of
  // the output register, mark it as legal and catch the pattern in instruction
  // selection to avoid emitting extra instructions (for zeroing upper bits).
8072 if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
8073 SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
8074 SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
                       ZeroC);
  }
8079 SDValue Undef = DAG.getUNDEF(ResVT);
8080 if (NumOfOperands > 2) {
8081 // Specialize the cases when all, or all but one, of the operands are undef.
    unsigned NumOfDefinedOps = 0;
    unsigned OpIdx = 0;
    for (unsigned i = 0; i < NumOfOperands; i++)
      if (!Op.getOperand(i).isUndef()) {
        NumOfDefinedOps++;
        OpIdx = i;
      }
    if (NumOfDefinedOps == 0)
      return DAG.getUNDEF(ResVT);
    if (NumOfDefinedOps == 1) {
8092 unsigned SubVecNumElts =
8093 Op.getOperand(OpIdx).getValueType().getVectorNumElements();
8094 SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
8095 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
8096 Op.getOperand(OpIdx), IdxVal);
8099 MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
8100 ResVT.getVectorNumElements()/2);
8101 SmallVector<SDValue, 2> Ops;
8102 for (unsigned i = 0; i < NumOfOperands/2; i++)
8103 Ops.push_back(Op.getOperand(i));
    SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
    Ops.clear();
8106 for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
8107 Ops.push_back(Op.getOperand(i));
8108 SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
8109 return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
8113 SDValue V1 = Op.getOperand(0);
8114 SDValue V2 = Op.getOperand(1);
8115 unsigned NumElems = ResVT.getVectorNumElements();
8116 assert(V1.getValueType() == V2.getValueType() &&
8117 V1.getValueType().getVectorNumElements() == NumElems/2 &&
8118 "Unexpected operands in CONCAT_VECTORS");
8120 if (ResVT.getSizeInBits() >= 16)
8121 return Op; // The operation is legal with KUNPCK
8123 bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
8124 bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
8125 SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
  if (IsZeroV1 && IsZeroV2)
    return ZeroVec;

  SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
  if (V2.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
  if (IsZeroV2)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);

  SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
  if (V1.isUndef())
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
  if (IsZeroV1)
    return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
8142 V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
8143 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
8146 static SDValue LowerCONCAT_VECTORS(SDValue Op,
8147 const X86Subtarget &Subtarget,
8148 SelectionDAG &DAG) {
8149 MVT VT = Op.getSimpleValueType();
8150 if (VT.getVectorElementType() == MVT::i1)
8151 return LowerCONCAT_VECTORSvXi1(Op, Subtarget, DAG);
8153 assert((VT.is256BitVector() && Op.getNumOperands() == 2) ||
8154 (VT.is512BitVector() && (Op.getNumOperands() == 2 ||
8155 Op.getNumOperands() == 4)));
8157 // AVX can use the vinsertf128 instruction to create 256-bit vectors
8158 // from two other 128-bit ones.
  // A 512-bit vector may contain two 256-bit vectors or four 128-bit vectors.
8161 return LowerAVXCONCAT_VECTORS(Op, DAG);
8164 //===----------------------------------------------------------------------===//
8165 // Vector shuffle lowering
8167 // This is an experimental code path for lowering vector shuffles on x86. It is
8168 // designed to handle arbitrary vector shuffles and blends, gracefully
8169 // degrading performance as necessary. It works hard to recognize idiomatic
8170 // shuffles and lower them to optimal instruction patterns without leaving
// a framework that allows reasonably efficient handling of all vector shuffle
// patterns.
8173 //===----------------------------------------------------------------------===//
8175 /// \brief Tiny helper function to identify a no-op mask.
8177 /// This is a somewhat boring predicate function. It checks whether the mask
8178 /// array input, which is assumed to be a single-input shuffle mask of the kind
8179 /// used by the X86 shuffle instructions (not a fully general
8180 /// ShuffleVectorSDNode mask) requires any shuffles to occur. Both undef and an
8181 /// in-place shuffle are 'no-op's.
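///
/// For example (illustrative, added for exposition): the 4-element masks
/// <-1, 1, 2, -1> and <0, 1, 2, 3> are no-ops, while <1, 0, 2, 3> is not.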
8182 static bool isNoopShuffleMask(ArrayRef<int> Mask) {
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    assert(Mask[i] >= -1 && "Out of bound mask element!");
    if (Mask[i] >= 0 && Mask[i] != i)
      return false;
  }

  return true;
}
/// \brief Test whether there are elements crossing 128-bit lanes in this
/// shuffle mask.
8194 /// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
8195 /// and we routinely test for these.
8196 static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
8197 int LaneSize = 128 / VT.getScalarSizeInBits();
8198 int Size = Mask.size();
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;

  return false;
}
8205 /// \brief Test whether a shuffle mask is equivalent within each sub-lane.
8207 /// This checks a shuffle mask to see if it is performing the same
8208 /// lane-relative shuffle in each sub-lane. This trivially implies
8209 /// that it is also not lane-crossing. It may however involve a blend from the
8210 /// same lane of a second vector.
8212 /// The specific repeated shuffle mask is populated in \p RepeatedMask, as it is
8213 /// non-trivial to compute in the face of undef lanes. The representation is
8214 /// suitable for use with existing 128-bit shuffles as entries from the second
8215 /// vector have been remapped to [LaneSize, 2*LaneSize).
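///
/// For example (illustrative, added for exposition): the v8f32 mask
/// <1, 0, 3, 2, 5, 4, 7, 6> repeats within each 128-bit lane and yields the
/// repeated mask <1, 0, 3, 2>.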
8216 static bool isRepeatedShuffleMask(unsigned LaneSizeInBits, MVT VT,
8218 SmallVectorImpl<int> &RepeatedMask) {
8219 auto LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8220 RepeatedMask.assign(LaneSize, -1);
8221 int Size = Mask.size();
8222 for (int i = 0; i < Size; ++i) {
    assert(Mask[i] == SM_SentinelUndef || Mask[i] >= 0);
    if (Mask[i] < 0)
      continue;
8226 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;
8230 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8231 // Adjust second vector indices to start at LaneSize instead of Size.
8232 int LocalM = Mask[i] < Size ? Mask[i] % LaneSize
8233 : Mask[i] % LaneSize + LaneSize;
8234 if (RepeatedMask[i % LaneSize] < 0)
8235 // This is the first non-undef entry in this slot of a 128-bit lane.
8236 RepeatedMask[i % LaneSize] = LocalM;
    else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }

  return true;
}
8244 /// Test whether a shuffle mask is equivalent within each 128-bit lane.
static bool
is128BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8247 SmallVectorImpl<int> &RepeatedMask) {
8248 return isRepeatedShuffleMask(128, VT, Mask, RepeatedMask);
8251 /// Test whether a shuffle mask is equivalent within each 256-bit lane.
static bool
is256BitLaneRepeatedShuffleMask(MVT VT, ArrayRef<int> Mask,
8254 SmallVectorImpl<int> &RepeatedMask) {
8255 return isRepeatedShuffleMask(256, VT, Mask, RepeatedMask);
8258 /// Test whether a target shuffle mask is equivalent within each sub-lane.
8259 /// Unlike isRepeatedShuffleMask we must respect SM_SentinelZero.
8260 static bool isRepeatedTargetShuffleMask(unsigned LaneSizeInBits, MVT VT,
8262 SmallVectorImpl<int> &RepeatedMask) {
8263 int LaneSize = LaneSizeInBits / VT.getScalarSizeInBits();
8264 RepeatedMask.assign(LaneSize, SM_SentinelUndef);
8265 int Size = Mask.size();
8266 for (int i = 0; i < Size; ++i) {
8267 assert(isUndefOrZero(Mask[i]) || (Mask[i] >= 0));
    if (Mask[i] == SM_SentinelUndef)
      continue;
8270 if (Mask[i] == SM_SentinelZero) {
      if (!isUndefOrZero(RepeatedMask[i % LaneSize]))
        return false;
      RepeatedMask[i % LaneSize] = SM_SentinelZero;
      continue;
    }
8276 if ((Mask[i] % Size) / LaneSize != i / LaneSize)
      // This entry crosses lanes, so there is no way to model this shuffle.
      return false;
8280 // Ok, handle the in-lane shuffles by detecting if and when they repeat.
8281 // Adjust second vector indices to start at LaneSize instead of Size.
    int LocalM =
        Mask[i] < Size ? Mask[i] % LaneSize : Mask[i] % LaneSize + LaneSize;
8284 if (RepeatedMask[i % LaneSize] == SM_SentinelUndef)
8285 // This is the first non-undef entry in this slot of a 128-bit lane.
8286 RepeatedMask[i % LaneSize] = LocalM;
8287 else if (RepeatedMask[i % LaneSize] != LocalM)
      // Found a mismatch with the repeated mask.
      return false;
  }

  return true;
}
8294 /// \brief Checks whether a shuffle mask is equivalent to an explicit list of
8297 /// This is a fast way to test a shuffle mask against a fixed pattern:
///   if (isShuffleEquivalent(V1, V2, Mask, {3, 2, 1, 0})) { ... }
8301 /// It returns true if the mask is exactly as wide as the argument list, and
8302 /// each element of the mask is either -1 (signifying undef) or the value given
8303 /// in the argument.
8304 static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
8305 ArrayRef<int> ExpectedMask) {
8306 if (Mask.size() != ExpectedMask.size())
8309 int Size = Mask.size();
8311 // If the values are build vectors, we can look through them to find
8312 // equivalent inputs that make the shuffles equivalent.
8313 auto *BV1 = dyn_cast<BuildVectorSDNode>(V1);
8314 auto *BV2 = dyn_cast<BuildVectorSDNode>(V2);
8316 for (int i = 0; i < Size; ++i) {
8317 assert(Mask[i] >= -1 && "Out of bound mask element!");
8318 if (Mask[i] >= 0 && Mask[i] != ExpectedMask[i]) {
8319 auto *MaskBV = Mask[i] < Size ? BV1 : BV2;
8320 auto *ExpectedBV = ExpectedMask[i] < Size ? BV1 : BV2;
      if (!MaskBV || !ExpectedBV ||
          MaskBV->getOperand(Mask[i] % Size) !=
              ExpectedBV->getOperand(ExpectedMask[i] % Size))
        return false;
    }
  }

  return true;
}
8331 /// Checks whether a target shuffle mask is equivalent to an explicit pattern.
8333 /// The masks must be exactly the same width.
8335 /// If an element in Mask matches SM_SentinelUndef (-1) then the corresponding
8336 /// value in ExpectedMask is always accepted. Otherwise the indices must match.
8338 /// SM_SentinelZero is accepted as a valid negative index but must match in both.
8339 static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
8340 ArrayRef<int> ExpectedMask) {
8341 int Size = Mask.size();
8342 if (Size != (int)ExpectedMask.size())
8345 for (int i = 0; i < Size; ++i)
    if (Mask[i] == SM_SentinelUndef)
      continue;
    else if (Mask[i] < 0 && Mask[i] != SM_SentinelZero)
      return false;
    else if (Mask[i] != ExpectedMask[i])
      return false;

  return true;
}
// Merges a general DAG shuffle mask and zeroable bit mask into a target
// shuffle mask.
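// For example (illustrative, added for exposition): the mask <0, 5, 2, 7>
// with Zeroable bits set for elements 1 and 3 becomes
// <0, SM_SentinelZero, 2, SM_SentinelZero>.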
8358 static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
8359 const APInt &Zeroable) {
8360 int NumElts = Mask.size();
8361 assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
8363 SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
8364 for (int i = 0; i != NumElts; ++i) {
    int M = Mask[i];
    if (M == SM_SentinelUndef)
      continue;
8368 assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
  }
  return TargetMask;
}
// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
// instructions.
8376 static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
8377 if (VT != MVT::v8i32 && VT != MVT::v8f32)
8380 SmallVector<int, 8> Unpcklwd;
8381 createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
8382 /* Unary = */ false);
8383 SmallVector<int, 8> Unpckhwd;
8384 createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
8385 /* Unary = */ false);
8386 bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
8387 isTargetShuffleEquivalent(Mask, Unpckhwd));
8388 return IsUnpackwdMask;
8391 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
8393 /// This helper function produces an 8-bit shuffle immediate corresponding to
8394 /// the ubiquitous shuffle encoding scheme used in x86 instructions for
/// shuffling 4 lanes. It can be used with most of the PSHUF instructions, for
/// example.
8398 /// NB: We rely heavily on "undef" masks preserving the input lane.
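///
/// For example (illustrative, added for exposition): the mask <3, 1, 2, 0>
/// encodes as 3 | (1 << 2) | (2 << 4) | (0 << 6) = 0x27; undef entries keep
/// their own lane index.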
8399 static unsigned getV4X86ShuffleImm(ArrayRef<int> Mask) {
8400 assert(Mask.size() == 4 && "Only 4-lane shuffle masks");
8401 assert(Mask[0] >= -1 && Mask[0] < 4 && "Out of bound mask element!");
8402 assert(Mask[1] >= -1 && Mask[1] < 4 && "Out of bound mask element!");
8403 assert(Mask[2] >= -1 && Mask[2] < 4 && "Out of bound mask element!");
8404 assert(Mask[3] >= -1 && Mask[3] < 4 && "Out of bound mask element!");
  unsigned Imm = 0;
  Imm |= (Mask[0] < 0 ? 0 : Mask[0]) << 0;
8408 Imm |= (Mask[1] < 0 ? 1 : Mask[1]) << 2;
8409 Imm |= (Mask[2] < 0 ? 2 : Mask[2]) << 4;
  Imm |= (Mask[3] < 0 ? 3 : Mask[3]) << 6;
  return Imm;
}
8414 static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
8415 SelectionDAG &DAG) {
8416 return DAG.getConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
8419 /// \brief Compute whether each element of a shuffle is zeroable.
8421 /// A "zeroable" vector shuffle element is one which can be lowered to zero.
8422 /// Either it is an undef element in the shuffle mask, the element of the input
8423 /// referenced is undef, or the element of the input referenced is known to be
8424 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
/// as many lanes with this technique as possible to simplify the remaining
/// shuffle.
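///
/// For example (illustrative, added for exposition): with a 4-element mask
/// <0, 5, 2, 7> and an all-zeros V2, elements 1 and 3 are zeroable because
/// they read from V2.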
8427 static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
8428 SDValue V1, SDValue V2) {
8429 APInt Zeroable(Mask.size(), 0);
8430 V1 = peekThroughBitcasts(V1);
8431 V2 = peekThroughBitcasts(V2);
8433 bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
8434 bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
8436 int VectorSizeInBits = V1.getValueSizeInBits();
8437 int ScalarSizeInBits = VectorSizeInBits / Mask.size();
8438 assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
8440 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8442 // Handle the easy cases.
8443 if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
8448 // Determine shuffle input and normalize the mask.
8449 SDValue V = M < Size ? V1 : V2;
8452 // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
8453 if (V.getOpcode() != ISD::BUILD_VECTOR)
8456 // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
8457 // the (larger) source element must be UNDEF/ZERO.
8458 if ((Size % V.getNumOperands()) == 0) {
8459 int Scale = Size / V->getNumOperands();
8460 SDValue Op = V.getOperand(M / Scale);
8461 if (Op.isUndef() || X86::isZeroNode(Op))
8463 else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
8464 APInt Val = Cst->getAPIntValue();
8465 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8466 Val = Val.getLoBits(ScalarSizeInBits);
8469 } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
8470 APInt Val = Cst->getValueAPF().bitcastToAPInt();
8471 Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
8472 Val = Val.getLoBits(ScalarSizeInBits);
8479 // If the BUILD_VECTOR has more elements then all the (smaller) source
8480 // elements must be UNDEF or ZERO.
8481 if ((V.getNumOperands() % Size) == 0) {
8482 int Scale = V->getNumOperands() / Size;
8483 bool AllZeroable = true;
8484 for (int j = 0; j < Scale; ++j) {
8485 SDValue Op = V.getOperand((M * Scale) + j);
8486 AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
// The shuffle result has the form
//   0*a[0], 0*a[1], ..., 0*a[n]   (n >= 0, the a[i] in ascending order),
// i.e. zeros interleaved with source elements. Each element of Zeroable
// corresponds to one mask element, as described in
// computeZeroableShuffleElements. This function checks whether the non-zero
// mask elements form a sub-mask whose indices are strictly increasing and, if
// so, returns true.
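// For example (illustrative, added for exposition): with elements 0 and 2
// zeroable, the mask <zz, 0, zz, 1> is accepted (non-zero elements 0, 1 are
// increasing), while <zz, 1, zz, 0> is rejected.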
8504 static bool isNonZeroElementsInOrder(const APInt &Zeroable,
8505 ArrayRef<int> Mask, const EVT &VectorType,
8506 bool &IsZeroSideLeft) {
8507 int NextElement = -1;
8508 // Check if the Mask's nonzero elements are in increasing order.
8509 for (int i = 0, e = Mask.size(); i < e; i++) {
8510 // Checks if the mask's zeros elements are built from only zeros.
8511 assert(Mask[i] >= -1 && "Out of bound mask element!");
8516 // Find the lowest non zero element
8517 if (NextElement < 0) {
8518 NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
8519 IsZeroSideLeft = NextElement != 0;
    // Exit if the mask's non-zero elements are not in increasing order.
    if (NextElement != Mask[i])
      return false;
    NextElement++;
  }
  return true;
}
8529 /// Try to lower a shuffle with a single PSHUFB of V1 or V2.
8530 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
8531 ArrayRef<int> Mask, SDValue V1,
8533 const APInt &Zeroable,
8534 const X86Subtarget &Subtarget,
8535 SelectionDAG &DAG) {
8536 int Size = Mask.size();
8537 int LaneSize = 128 / VT.getScalarSizeInBits();
8538 const int NumBytes = VT.getSizeInBits() / 8;
8539 const int NumEltBytes = VT.getScalarSizeInBits() / 8;
8541 assert((Subtarget.hasSSSE3() && VT.is128BitVector()) ||
8542 (Subtarget.hasAVX2() && VT.is256BitVector()) ||
8543 (Subtarget.hasBWI() && VT.is512BitVector()));
8545 SmallVector<SDValue, 64> PSHUFBMask(NumBytes);
8546 // Sign bit set in i8 mask means zero element.
8547 SDValue ZeroMask = DAG.getConstant(0x80, DL, MVT::i8);
8550 for (int i = 0; i < NumBytes; ++i) {
8551 int M = Mask[i / NumEltBytes];
8553 PSHUFBMask[i] = DAG.getUNDEF(MVT::i8);
8556 if (Zeroable[i / NumEltBytes]) {
8557 PSHUFBMask[i] = ZeroMask;
8561 // We can only use a single input of V1 or V2.
8562 SDValue SrcV = (M >= Size ? V2 : V1);
8568 // PSHUFB can't cross lanes, ensure this doesn't happen.
    if ((M / LaneSize) != ((i / NumEltBytes) / LaneSize))
      return SDValue();
8573 M = M * NumEltBytes + (i % NumEltBytes);
8574 PSHUFBMask[i] = DAG.getConstant(M, DL, MVT::i8);
8576 assert(V && "Failed to find a source input");
8578 MVT I8VT = MVT::getVectorVT(MVT::i8, NumBytes);
8579 return DAG.getBitcast(
8580 VT, DAG.getNode(X86ISD::PSHUFB, DL, I8VT, DAG.getBitcast(I8VT, V),
8581 DAG.getBuildVector(I8VT, DL, PSHUFBMask)));
8584 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
8585 const X86Subtarget &Subtarget, SelectionDAG &DAG,
8588 // X86 has dedicated shuffle that can be lowered to VEXPAND
8589 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
8590 const APInt &Zeroable,
8591 ArrayRef<int> Mask, SDValue &V1,
8592 SDValue &V2, SelectionDAG &DAG,
8593 const X86Subtarget &Subtarget) {
8594 bool IsLeftZeroSide = true;
8595 if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
8598 unsigned VEXPANDMask = (~Zeroable).getZExtValue();
  MVT IntegerType =
      MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8601 SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
8602 unsigned NumElts = VT.getVectorNumElements();
8603 assert((NumElts == 4 || NumElts == 8 || NumElts == 16) &&
8604 "Unexpected number of vector elements");
8605 SDValue VMask = getMaskNode(MaskNode, MVT::getVectorVT(MVT::i1, NumElts),
8606 Subtarget, DAG, DL);
8607 SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL);
8608 SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1;
8609 return DAG.getSelect(DL, VT, VMask,
8610 DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector),
8614 static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
8615 unsigned &UnpackOpcode, bool IsUnary,
8616 ArrayRef<int> TargetMask, SDLoc &DL,
8618 const X86Subtarget &Subtarget) {
8619 int NumElts = VT.getVectorNumElements();
8621 bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
8622 for (int i = 0; i != NumElts; i += 2) {
8623 int M1 = TargetMask[i + 0];
8624 int M2 = TargetMask[i + 1];
8625 Undef1 &= (SM_SentinelUndef == M1);
8626 Undef2 &= (SM_SentinelUndef == M2);
8627 Zero1 &= isUndefOrZero(M1);
8628 Zero2 &= isUndefOrZero(M2);
8630 assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
8631 "Zeroable shuffle detected");
8633 // Attempt to match the target mask against the unpack lo/hi mask patterns.
8634 SmallVector<int, 64> Unpckl, Unpckh;
8635 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
8636 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8637 UnpackOpcode = X86ISD::UNPCKL;
8638 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8639 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8643 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
8644 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8645 UnpackOpcode = X86ISD::UNPCKH;
8646 V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
8647 V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
8651 // If an unary shuffle, attempt to match as an unpack lo/hi with zero.
8652 if (IsUnary && (Zero1 || Zero2)) {
8653 // Don't bother if we can blend instead.
8654 if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
8655 isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
8658 bool MatchLo = true, MatchHi = true;
8659 for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
8660 int M = TargetMask[i];
8662 // Ignore if the input is known to be zero or the index is undef.
8663 if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
8664 (M == SM_SentinelUndef))
8667 MatchLo &= (M == Unpckl[i]);
8668 MatchHi &= (M == Unpckh[i]);
8671 if (MatchLo || MatchHi) {
8672 UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
8673 V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8674 V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
8679 // If a binary shuffle, commute and try again.
8681 ShuffleVectorSDNode::commuteMask(Unpckl);
8682 if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
8683 UnpackOpcode = X86ISD::UNPCKL;
8688 ShuffleVectorSDNode::commuteMask(Unpckh);
8689 if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
8690 UnpackOpcode = X86ISD::UNPCKH;
8699 // X86 has dedicated unpack instructions that can handle specific blend
8700 // operations: UNPCKH and UNPCKL.
8701 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
8702 ArrayRef<int> Mask, SDValue V1,
8703 SDValue V2, SelectionDAG &DAG) {
8704 SmallVector<int, 8> Unpckl;
8705 createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, /* Unary = */ false);
8706 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8707 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V1, V2);
8709 SmallVector<int, 8> Unpckh;
8710 createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, /* Unary = */ false);
8711 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8712 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V1, V2);
8714 // Commute and try again.
8715 ShuffleVectorSDNode::commuteMask(Unpckl);
8716 if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
8717 return DAG.getNode(X86ISD::UNPCKL, DL, VT, V2, V1);
8719 ShuffleVectorSDNode::commuteMask(Unpckh);
8720 if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
8721 return DAG.getNode(X86ISD::UNPCKH, DL, VT, V2, V1);
8726 /// \brief Try to emit a bitmask instruction for a shuffle.
8728 /// This handles cases where we can model a blend exactly as a bitmask due to
8729 /// one of the inputs being zeroable.
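///
/// For example (illustrative, added for exposition): the v4i32 mask
/// <0, zz, 2, zz>, with elements 1 and 3 zeroable, becomes an AND of V1 with
/// the constant vector <-1, 0, -1, 0>.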
8730 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
8731 SDValue V2, ArrayRef<int> Mask,
8732 const APInt &Zeroable,
8733 SelectionDAG &DAG) {
8734 assert(!VT.isFloatingPoint() && "Floating point types are not supported");
8735 MVT EltVT = VT.getVectorElementType();
8736 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8737 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8738 SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
  SDValue V;
  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Zeroable[i])
      continue;
    if (Mask[i] % Size != i)
      return SDValue(); // Not a blend.
    if (!V)
      V = Mask[i] < Size ? V1 : V2;
    else if (V != (Mask[i] < Size ? V1 : V2))
      return SDValue(); // Can only let one input through the mask.

    VMaskOps[i] = AllOnes;
  }
  if (!V)
    return SDValue(); // No non-zeroable elements!
8755 SDValue VMask = DAG.getBuildVector(VT, DL, VMaskOps);
8756 return DAG.getNode(ISD::AND, DL, VT, V, VMask);
8759 /// \brief Try to emit a blend instruction for a shuffle using bit math.
8761 /// This is used as a fallback approach when first class blend instructions are
8762 /// unavailable. Currently it is only suitable for integer vectors, but could
8763 /// be generalized for floating point vectors if desirable.
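///
/// For example (illustrative, added for exposition): the v4i32 mask
/// <0, 5, 2, 7> becomes (V1 & <-1, 0, -1, 0>) | (V2 & ~<-1, 0, -1, 0>).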
8764 static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
8765 SDValue V2, ArrayRef<int> Mask,
8766 SelectionDAG &DAG) {
8767 assert(VT.isInteger() && "Only supports integer vector types!");
8768 MVT EltVT = VT.getVectorElementType();
8769 SDValue Zero = DAG.getConstant(0, DL, EltVT);
8770 SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
8771 SmallVector<SDValue, 16> MaskOps;
8772 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
8773 if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
8774 return SDValue(); // Shuffled input!
8775 MaskOps.push_back(Mask[i] < Size ? AllOnes : Zero);
8778 SDValue V1Mask = DAG.getBuildVector(VT, DL, MaskOps);
8779 V1 = DAG.getNode(ISD::AND, DL, VT, V1, V1Mask);
8780 // We have to cast V2 around.
8781 MVT MaskVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
8782 V2 = DAG.getBitcast(VT, DAG.getNode(X86ISD::ANDNP, DL, MaskVT,
8783 DAG.getBitcast(MaskVT, V1Mask),
8784 DAG.getBitcast(MaskVT, V2)));
8785 return DAG.getNode(ISD::OR, DL, VT, V1, V2);
8788 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
8789 SDValue PreservedSrc,
8790 const X86Subtarget &Subtarget,
8793 static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
8794 MutableArrayRef<int> TargetMask,
8795 bool &ForceV1Zero, bool &ForceV2Zero,
8796 uint64_t &BlendMask) {
8797 bool V1IsZeroOrUndef =
8798 V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
8799 bool V2IsZeroOrUndef =
8800 V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
  BlendMask = 0;
  ForceV1Zero = false, ForceV2Zero = false;
8804 assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
8806 // Attempt to generate the binary blend mask. If an input is zero then
8807 // we can use any lane.
8808 // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
8809 for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
8810 int M = TargetMask[i];
8811 if (M == SM_SentinelUndef)
8815 if (M == i + Size) {
8816 BlendMask |= 1ull << i;
8819 if (M == SM_SentinelZero) {
8820 if (V1IsZeroOrUndef) {
8825 if (V2IsZeroOrUndef) {
8827 BlendMask |= 1ull << i;
8828 TargetMask[i] = i + Size;
8837 uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
8838 uint64_t ScaledMask = 0;
8839 for (int i = 0; i != Size; ++i)
8840 if (BlendMask & (1ull << i))
      ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
  return ScaledMask;
}
8845 /// \brief Try to emit a blend instruction for a shuffle.
8847 /// This doesn't do any checks for the availability of instructions for blending
8848 /// these values. It relies on the availability of the X86ISD::BLENDI pattern to
8849 /// be matched in the backend with the type given. What it does check for is
8850 /// that the shuffle mask is a blend, or convertible into a blend with zero.
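///
/// For example (illustrative, added for exposition): the v4f32 mask
/// <0, 5, 2, 7> is a blend taking elements 1 and 3 from V2, so the BLENDI
/// immediate is 0b1010.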
8851 static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
8852 SDValue V2, ArrayRef<int> Original,
8853 const APInt &Zeroable,
8854 const X86Subtarget &Subtarget,
8855 SelectionDAG &DAG) {
8856 SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
8858 uint64_t BlendMask = 0;
8859 bool ForceV1Zero = false, ForceV2Zero = false;
  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
                                 BlendMask))
    return SDValue();
8864 // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
  if (ForceV1Zero)
    V1 = getZeroVector(VT, Subtarget, DAG, DL);
  if (ForceV2Zero)
    V2 = getZeroVector(VT, Subtarget, DAG, DL);
8870 switch (VT.SimpleTy) {
  case MVT::v2f64:
  case MVT::v4f32:
  case MVT::v4f64:
  case MVT::v8f32:
    return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V2,
8876 DAG.getConstant(BlendMask, DL, MVT::i8));
  case MVT::v4i64:
  case MVT::v8i32:
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
    LLVM_FALLTHROUGH;
  case MVT::v2i64:
  case MVT::v4i32:
8884 // If we have AVX2 it is faster to use VPBLENDD when the shuffle fits into
8885 // that instruction.
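    // For example (illustrative, added for exposition): a v2i64 blend with
    // mask 0b10 is rewritten as a v4i32 VPBLENDD with the per-dword mask
    // 0b1100.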
8886 if (Subtarget.hasAVX2()) {
8887 // Scale the blend by the number of 32-bit dwords per element.
8888 int Scale = VT.getScalarSizeInBits() / 32;
8889 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8890 MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
8891 V1 = DAG.getBitcast(BlendVT, V1);
8892 V2 = DAG.getBitcast(BlendVT, V2);
8893 return DAG.getBitcast(
8894 VT, DAG.getNode(X86ISD::BLENDI, DL, BlendVT, V1, V2,
8895 DAG.getConstant(BlendMask, DL, MVT::i8)));
    }
    LLVM_FALLTHROUGH;
  case MVT::v8i16: {
    // For integer shuffles we need to expand the mask and cast the inputs to
8900 // v8i16s prior to blending.
8901 int Scale = 8 / VT.getVectorNumElements();
8902 BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
8903 V1 = DAG.getBitcast(MVT::v8i16, V1);
8904 V2 = DAG.getBitcast(MVT::v8i16, V2);
8905 return DAG.getBitcast(VT,
8906 DAG.getNode(X86ISD::BLENDI, DL, MVT::v8i16, V1, V2,
8907 DAG.getConstant(BlendMask, DL, MVT::i8)));
  }
  case MVT::v16i16: {
    assert(Subtarget.hasAVX2() && "256-bit integer blends require AVX2!");
8912 SmallVector<int, 8> RepeatedMask;
8913 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
8914 // We can lower these with PBLENDW which is mirrored across 128-bit lanes.
8915 assert(RepeatedMask.size() == 8 && "Repeated mask size doesn't match!");
8917 for (int i = 0; i < 8; ++i)
8918 if (RepeatedMask[i] >= 8)
8919 BlendMask |= 1ull << i;
8920 return DAG.getNode(X86ISD::BLENDI, DL, MVT::v16i16, V1, V2,
8921 DAG.getConstant(BlendMask, DL, MVT::i8));
    }
    LLVM_FALLTHROUGH;
  }
  case MVT::v16i8:
  case MVT::v32i8: {
    assert((VT.is128BitVector() || Subtarget.hasAVX2()) &&
8928 "256-bit byte-blends require AVX2 support!");
8930 if (Subtarget.hasBWI() && Subtarget.hasVLX()) {
      MVT IntegerType =
          MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8933 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8934 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
8937 // Attempt to lower to a bitmask if we can. VPAND is faster than VPBLENDVB.
8938 if (SDValue Masked =
8939 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
8942 // Scale the blend by the number of bytes per element.
8943 int Scale = VT.getScalarSizeInBits() / 8;
  // This form of blend is always done on bytes. Compute the byte vector
  // type.
8947 MVT BlendVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
8949 // Compute the VSELECT mask. Note that VSELECT is really confusing in the
8950 // mix of LLVM's code generator and the x86 backend. We tell the code
8951 // generator that boolean values in the elements of an x86 vector register
8952 // are -1 for true and 0 for false. We then use the LLVM semantics of 'true'
8953 // mapping a select to operand #1, and 'false' mapping to operand #2. The
8954 // reality in x86 is that vector masks (pre-AVX-512) use only the high bit
8955 // of the element (the remaining are ignored) and 0 in that high bit would
8956 // mean operand #1 while 1 in the high bit would mean operand #2. So while
8957 // the LLVM model for boolean values in vector elements gets the relevant
  // bit set, it is set backwards and over constrained relative to x86's
  // actual model.
8960 SmallVector<SDValue, 32> VSELECTMask;
8961 for (int i = 0, Size = Mask.size(); i < Size; ++i)
8962 for (int j = 0; j < Scale; ++j)
8963 VSELECTMask.push_back(
8964 Mask[i] < 0 ? DAG.getUNDEF(MVT::i8)
8965 : DAG.getConstant(Mask[i] < Size ? -1 : 0, DL,
8968 V1 = DAG.getBitcast(BlendVT, V1);
8969 V2 = DAG.getBitcast(BlendVT, V2);
8970 return DAG.getBitcast(
8972 DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask),
    MVT IntegerType =
        MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
8983 SDValue MaskNode = DAG.getConstant(BlendMask, DL, IntegerType);
8984 return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
  default:
    llvm_unreachable("Not a supported integer vector type!");
  }
}
8991 /// \brief Try to lower as a blend of elements from two inputs followed by
8992 /// a single-input permutation.
8994 /// This matches the pattern where we can blend elements from two inputs and
8995 /// then reduce the shuffle to a single-input permutation.
8996 static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
8997 SDValue V1, SDValue V2,
8999 SelectionDAG &DAG) {
9000 // We build up the blend mask while checking whether a blend is a viable way
9001 // to reduce the shuffle.
9002 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9003 SmallVector<int, 32> PermuteMask(Mask.size(), -1);
9005 for (int i = 0, Size = Mask.size(); i < Size; ++i) {
    if (Mask[i] < 0)
      continue;

    assert(Mask[i] < Size * 2 && "Shuffle input is out of bounds.");
9011 if (BlendMask[Mask[i] % Size] < 0)
9012 BlendMask[Mask[i] % Size] = Mask[i];
9013 else if (BlendMask[Mask[i] % Size] != Mask[i])
9014 return SDValue(); // Can't blend in the needed input!
9016 PermuteMask[i] = Mask[i] % Size;
9019 SDValue V = DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9020 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
9023 /// \brief Generic routine to decompose a shuffle and blend into independent
9024 /// blends and permutes.
9026 /// This matches the extremely common pattern for handling combined
9027 /// shuffle+blend operations on newer X86 ISAs where we have very fast blend
/// operations. It will try to pick the best arrangement of shuffles and
/// blends.
9030 static SDValue lowerVectorShuffleAsDecomposedShuffleBlend(const SDLoc &DL,
9034 SelectionDAG &DAG) {
9035 // Shuffle the input elements into the desired positions in V1 and V2 and
9036 // blend them together.
9037 SmallVector<int, 32> V1Mask(Mask.size(), -1);
9038 SmallVector<int, 32> V2Mask(Mask.size(), -1);
9039 SmallVector<int, 32> BlendMask(Mask.size(), -1);
9040 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9041 if (Mask[i] >= 0 && Mask[i] < Size) {
      V1Mask[i] = Mask[i];
      BlendMask[i] = i;
9044 } else if (Mask[i] >= Size) {
9045 V2Mask[i] = Mask[i] - Size;
9046 BlendMask[i] = i + Size;
9049 // Try to lower with the simpler initial blend strategy unless one of the
9050 // input shuffles would be a no-op. We prefer to shuffle inputs as the
9051 // shuffle may be able to fold with a load or other benefit. However, when
9052 // we'll have to do 2x as many shuffles in order to achieve this, blending
9053 // first is a better strategy.
9054 if (!isNoopShuffleMask(V1Mask) && !isNoopShuffleMask(V2Mask))
9055 if (SDValue BlendPerm =
            lowerVectorShuffleAsBlendAndPermute(DL, VT, V1, V2, Mask, DAG))
      return BlendPerm;
9059 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
9060 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
9061 return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
9064 /// \brief Try to lower a vector shuffle as a rotation.
/// This is used to support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
9067 static int matchVectorShuffleAsRotate(SDValue &V1, SDValue &V2,
9068 ArrayRef<int> Mask) {
9069 int NumElts = Mask.size();
9071 // We need to detect various ways of spelling a rotation:
9072 // [11, 12, 13, 14, 15, 0, 1, 2]
9073 // [-1, 12, 13, 14, -1, -1, 1, -1]
9074 // [-1, -1, -1, -1, -1, -1, 1, 2]
9075 // [ 3, 4, 5, 6, 7, 8, 9, 10]
9076 // [-1, 4, 5, 6, -1, -1, 9, -1]
9077 // [-1, 4, 5, 6, -1, -1, -1, -1]
  int Rotation = 0;
  SDValue Lo, Hi;
  for (int i = 0; i < NumElts; ++i) {
    int M = Mask[i];
    assert((M == SM_SentinelUndef || (0 <= M && M < (2*NumElts))) &&
           "Unexpected mask index.");
    if (M < 0)
      continue;
9087 // Determine where a rotated vector would have started.
9088 int StartIdx = i - (M % NumElts);
    // The identity rotation isn't interesting, stop.
    if (StartIdx == 0)
      return -1;
9093 // If we found the tail of a vector the rotation must be the missing
    // front. If we found the head of a vector, it must be how much of the
    // head is missing.
9096 int CandidateRotation = StartIdx < 0 ? -StartIdx : NumElts - StartIdx;
    if (Rotation == 0)
      Rotation = CandidateRotation;
9100 else if (Rotation != CandidateRotation)
      // The rotations don't match, so we can't match this mask.
      return -1;
9104 // Compute which value this mask is pointing at.
9105 SDValue MaskV = M < NumElts ? V1 : V2;
9107 // Compute which of the two target values this index should be assigned
9108 // to. This reflects whether the high elements are remaining or the low
9109 // elements are remaining.
9110 SDValue &TargetV = StartIdx < 0 ? Hi : Lo;
9112 // Either set up this value if we've not encountered it before, or check
    // that it remains consistent.
    if (!TargetV)
      TargetV = MaskV;
9116 else if (TargetV != MaskV)
9117 // This may be a rotation, but it pulls from the inputs in some
      // unsupported interleaving.
      return -1;
9122 // Check that we successfully analyzed the mask, and normalize the results.
9123 assert(Rotation != 0 && "Failed to locate a viable rotation!");
  assert((Lo || Hi) && "Failed to find a rotated input vector!");
  if (!Lo)
    Lo = Hi;
  else if (!Hi)
    Hi = Lo;

  V1 = Lo;
  V2 = Hi;

  return Rotation;
}
9136 /// \brief Try to lower a vector shuffle as a byte rotation.
9138 /// SSSE3 has a generic PALIGNR instruction in x86 that will do an arbitrary
9139 /// byte-rotation of the concatenation of two vectors; pre-SSSE3 can use
9140 /// a PSRLDQ/PSLLDQ/POR pattern to get a similar effect. This routine will
/// try to generically lower a vector shuffle through such a pattern. It
9142 /// does not check for the profitability of lowering either as PALIGNR or
9143 /// PSRLDQ/PSLLDQ/POR, only whether the mask is valid to lower in that form.
9144 /// This matches shuffle vectors that look like:
9146 /// v8i16 [11, 12, 13, 14, 15, 0, 1, 2]
9148 /// Essentially it concatenates V1 and V2, shifts right by some number of
9149 /// elements, and takes the low elements as the result. Note that while this is
9150 /// specified as a *right shift* because x86 is little-endian, it is a *left
9151 /// rotate* of the vector lanes.
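///
/// For example (illustrative, added for exposition): for the v8i16 mask
/// [11, 12, 13, 14, 15, 0, 1, 2] above, the element rotation is 3, so the
/// byte rotation used for PALIGNR is 3 * 2 = 6.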
9152 static int matchVectorShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
9153 ArrayRef<int> Mask) {
9154 // Don't accept any shuffles with zero elements.
9155 if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
9158 // PALIGNR works on 128-bit lanes.
9159 SmallVector<int, 16> RepeatedMask;
9160 if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
9163 int Rotation = matchVectorShuffleAsRotate(V1, V2, RepeatedMask);
9167 // PALIGNR rotates bytes, so we need to scale the
9168 // rotation based on how many bytes are in the vector lane.
9169 int NumElts = RepeatedMask.size();
9170 int Scale = 16 / NumElts;
9171 return Rotation * Scale;
9174 static SDValue lowerVectorShuffleAsByteRotate(const SDLoc &DL, MVT VT,
9175 SDValue V1, SDValue V2,
9177 const X86Subtarget &Subtarget,
9178 SelectionDAG &DAG) {
9179 assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
9181 SDValue Lo = V1, Hi = V2;
9182 int ByteRotation = matchVectorShuffleAsByteRotate(VT, Lo, Hi, Mask);
  if (ByteRotation <= 0)
    return SDValue();
  // Cast the inputs to i8 vectors of the correct length to match PALIGNR or
  // PSLLDQ/PSRLDQ.
9188 MVT ByteVT = MVT::getVectorVT(MVT::i8, VT.getSizeInBits() / 8);
9189 Lo = DAG.getBitcast(ByteVT, Lo);
9190 Hi = DAG.getBitcast(ByteVT, Hi);
9192 // SSSE3 targets can use the palignr instruction.
9193 if (Subtarget.hasSSSE3()) {
9194 assert((!VT.is512BitVector() || Subtarget.hasBWI()) &&
9195 "512-bit PALIGNR requires BWI instructions");
9196 return DAG.getBitcast(
9197 VT, DAG.getNode(X86ISD::PALIGNR, DL, ByteVT, Lo, Hi,
9198 DAG.getConstant(ByteRotation, DL, MVT::i8)));
9201 assert(VT.is128BitVector() &&
9202 "Rotate-based lowering only supports 128-bit lowering!");
9203 assert(Mask.size() <= 16 &&
9204 "Can shuffle at most 16 bytes in a 128-bit vector!");
9205 assert(ByteVT == MVT::v16i8 &&
9206 "SSE2 rotate lowering only needed for v16i8!");
9208 // Default SSE2 implementation
9209 int LoByteShift = 16 - ByteRotation;
9210 int HiByteShift = ByteRotation;
9212 SDValue LoShift = DAG.getNode(X86ISD::VSHLDQ, DL, MVT::v16i8, Lo,
9213 DAG.getConstant(LoByteShift, DL, MVT::i8));
9214 SDValue HiShift = DAG.getNode(X86ISD::VSRLDQ, DL, MVT::v16i8, Hi,
9215 DAG.getConstant(HiByteShift, DL, MVT::i8));
9216 return DAG.getBitcast(VT,
9217 DAG.getNode(ISD::OR, DL, MVT::v16i8, LoShift, HiShift));
9220 /// \brief Try to lower a vector shuffle as a dword/qword rotation.
9222 /// AVX512 has a VALIGND/VALIGNQ instructions that will do an arbitrary
9223 /// rotation of the concatenation of two vectors; This routine will
9224 /// try to generically lower a vector shuffle through such an pattern.
9226 /// Essentially it concatenates V1 and V2, shifts right by some number of
9227 /// elements, and takes the low elements as the result. Note that while this is
9228 /// specified as a *right shift* because x86 is little-endian, it is a *left
9229 /// rotate* of the vector lanes.
9230 static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
9231 SDValue V1, SDValue V2,
9233 const X86Subtarget &Subtarget,
9234 SelectionDAG &DAG) {
9235 assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
9236 "Only 32-bit and 64-bit elements are supported!");
9238 // 128/256-bit vectors are only supported with VLX.
9239 assert((Subtarget.hasVLX() || (!VT.is128BitVector() && !VT.is256BitVector()))
9240 && "VLX required for 128/256-bit vectors");
9242 SDValue Lo = V1, Hi = V2;
  int Rotation = matchVectorShuffleAsRotate(Lo, Hi, Mask);
  if (Rotation <= 0)
    return SDValue();
9247 return DAG.getNode(X86ISD::VALIGN, DL, VT, Lo, Hi,
9248 DAG.getConstant(Rotation, DL, MVT::i8));
9251 /// \brief Try to lower a vector shuffle as a bit shift (shifts in zeros).
9253 /// Attempts to match a shuffle mask against the PSLL(W/D/Q/DQ) and
9254 /// PSRL(W/D/Q/DQ) SSE2 and AVX2 logical bit-shift instructions. The function
9255 /// matches elements from one of the input vectors shuffled to the left or
9256 /// right with zeroable elements 'shifted in'. It handles both the strictly
/// bit-wise element shifts and the byte shift across an entire 128-bit double
/// quad word lane.
9260 /// PSHL : (little-endian) left bit shift.
9261 /// [ zz, 0, zz, 2 ]
9262 /// [ -1, 4, zz, -1 ]
9263 /// PSRL : (little-endian) right bit shift.
9265 /// [ -1, -1, 7, zz]
9266 /// PSLLDQ : (little-endian) left byte shift
9267 /// [ zz, 0, 1, 2, 3, 4, 5, 6]
9268 /// [ zz, zz, -1, -1, 2, 3, 4, -1]
9269 /// [ zz, zz, zz, zz, zz, zz, -1, 1]
9270 /// PSRLDQ : (little-endian) right byte shift
9271 /// [ 5, 6, 7, zz, zz, zz, zz, zz]
9272 /// [ -1, 5, 6, 7, zz, zz, zz, zz]
9273 /// [ 1, 2, -1, -1, -1, -1, zz, zz]
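///
/// For example (illustrative, added for exposition): the v4i32 mask
/// [ zz, 0, zz, 2 ] above matches with Scale = 2 and Shift = 1, i.e. it is
/// lowered as a VSHLI of a v2i64 vector by 32 bits.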
9274 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
9275 unsigned ScalarSizeInBits,
9276 ArrayRef<int> Mask, int MaskOffset,
9277 const APInt &Zeroable,
9278 const X86Subtarget &Subtarget) {
9279 int Size = Mask.size();
9280 unsigned SizeInBits = Size * ScalarSizeInBits;
9282 auto CheckZeros = [&](int Shift, int Scale, bool Left) {
    for (int i = 0; i < Size; i += Scale)
      for (int j = 0; j < Shift; ++j)
        if (!Zeroable[i + j + (Left ? 0 : (Scale - Shift))])
          return false;

    return true;
  };
9291 auto MatchShift = [&](int Shift, int Scale, bool Left) {
9292 for (int i = 0; i != Size; i += Scale) {
9293 unsigned Pos = Left ? i + Shift : i;
9294 unsigned Low = Left ? i : i + Shift;
9295 unsigned Len = Scale - Shift;
      if (!isSequentialOrUndefInRange(Mask, Pos, Len, Low + MaskOffset))
        return -1;
    }
9300 int ShiftEltBits = ScalarSizeInBits * Scale;
9301 bool ByteShift = ShiftEltBits > 64;
9302 Opcode = Left ? (ByteShift ? X86ISD::VSHLDQ : X86ISD::VSHLI)
9303 : (ByteShift ? X86ISD::VSRLDQ : X86ISD::VSRLI);
9304 int ShiftAmt = Shift * ScalarSizeInBits / (ByteShift ? 8 : 1);
    // Normalize the scale for byte shifts to still produce an i64 element
    // type.
9308 Scale = ByteShift ? Scale / 2 : Scale;
9310 // We need to round trip through the appropriate type for the shift.
9311 MVT ShiftSVT = MVT::getIntegerVT(ScalarSizeInBits * Scale);
9312 ShiftVT = ByteShift ? MVT::getVectorVT(MVT::i8, SizeInBits / 8)
9313 : MVT::getVectorVT(ShiftSVT, Size / Scale);
9314 return (int)ShiftAmt;
9317 // SSE/AVX supports logical shifts up to 64-bit integers - so we can just
9318 // keep doubling the size of the integer elements up to that. We can
9319 // then shift the elements of the integer vector by whole multiples of
9320 // their width within the elements of the larger integer vector. Test each
9321 // multiple to see if we can find a match with the moved element indices
9322 // and that the shifted in elements are all zeroable.
9323 unsigned MaxWidth = ((SizeInBits == 512) && !Subtarget.hasBWI() ? 64 : 128);
9324 for (int Scale = 2; Scale * ScalarSizeInBits <= MaxWidth; Scale *= 2)
9325 for (int Shift = 1; Shift != Scale; ++Shift)
9326 for (bool Left : {true, false})
9327 if (CheckZeros(Shift, Scale, Left)) {
          int ShiftAmt = MatchShift(Shift, Scale, Left);
          if (0 < ShiftAmt)
            return ShiftAmt;
        }

  // no match
  return -1;
}
9337 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
9338 SDValue V2, ArrayRef<int> Mask,
9339 const APInt &Zeroable,
9340 const X86Subtarget &Subtarget,
9341 SelectionDAG &DAG) {
9342 int Size = Mask.size();
9343 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
  MVT ShiftVT;
  unsigned Opcode;
  SDValue V = V1;

  // Try to match shuffle against V1 shift.
9350 int ShiftAmt = matchVectorShuffleAsShift(
9351 ShiftVT, Opcode, VT.getScalarSizeInBits(), Mask, 0, Zeroable, Subtarget);
  // If V1 failed, try to match shuffle against V2 shift.
  if (ShiftAmt < 0) {
    ShiftAmt =
        matchVectorShuffleAsShift(ShiftVT, Opcode, VT.getScalarSizeInBits(),
                                  Mask, Size, Zeroable, Subtarget);
    V = V2;
  }

  if (ShiftAmt < 0)
    return SDValue();
9364 assert(DAG.getTargetLoweringInfo().isTypeLegal(ShiftVT) &&
9365 "Illegal integer vector type");
9366 V = DAG.getBitcast(ShiftVT, V);
9367 V = DAG.getNode(Opcode, DL, ShiftVT, V,
9368 DAG.getConstant(ShiftAmt, DL, MVT::i8));
9369 return DAG.getBitcast(VT, V);
9372 // EXTRQ: Extract Len elements from lower half of source, starting at Idx.
9373 // Remainder of lower half result is zero and upper half is all undef.
9374 static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2,
9375 ArrayRef<int> Mask, uint64_t &BitLen,
9376 uint64_t &BitIdx, const APInt &Zeroable) {
9377 int Size = Mask.size();
9378 int HalfSize = Size / 2;
9379 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9380 assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
9382 // Upper half must be undefined.
9383 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9386 // Determine the extraction length from the part of the
9387 // lower half that isn't zeroable.
  int Len = HalfSize;
  for (; Len > 0; --Len)
    if (!Zeroable[Len - 1])
      break;
9392 assert(Len > 0 && "Zeroable shuffle mask");
9394 // Attempt to match first Len sequential elements from the lower half.
9397 for (int i = 0; i != Len; ++i) {
9399 if (M == SM_SentinelUndef)
9401 SDValue &V = (M < Size ? V1 : V2);
9404 // The extracted elements must start at a valid index and all mask
9405 // elements must be in the lower half.
9406 if (i > M || M >= HalfSize)
9409 if (Idx < 0 || (Src == V && Idx == (M - i))) {
9417 if (!Src || Idx < 0)
9420 assert((Idx + Len) <= HalfSize && "Illegal extraction mask");
9421 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9422 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9427 // INSERTQ: Extract lowest Len elements from lower half of second source and
9428 // insert over first source, starting at Idx.
9429 // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... }
9430 static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2,
9431 ArrayRef<int> Mask, uint64_t &BitLen,
9433 int Size = Mask.size();
9434 int HalfSize = Size / 2;
9435 assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
9437 // Upper half must be undefined.
9438 if (!isUndefInRange(Mask, HalfSize, HalfSize))
9441 for (int Idx = 0; Idx != HalfSize; ++Idx) {
9444 // Attempt to match first source from mask before insertion point.
9445 if (isUndefInRange(Mask, 0, Idx)) {
9447 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) {
9449 } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) {
9455 // Extend the extraction length looking to match both the insertion of
9456 // the second source and the remaining elements of the first.
9457 for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) {
9462 if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) {
9464 } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) {
9470 // Match the remaining elements of the lower half.
9471 if (isUndefInRange(Mask, Hi, HalfSize - Hi)) {
9473 } else if ((!Base || (Base == V1)) &&
9474 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) {
9476 } else if ((!Base || (Base == V2)) &&
9477 isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi,
9484 BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f;
9485 BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f;
9495 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
9496 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
9497 SDValue V2, ArrayRef<int> Mask,
9498 const APInt &Zeroable,
9499 SelectionDAG &DAG) {
9500 uint64_t BitLen, BitIdx;
9501 if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable))
9502 return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1,
9503 DAG.getConstant(BitLen, DL, MVT::i8),
9504 DAG.getConstant(BitIdx, DL, MVT::i8));
9506 if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx))
9507 return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT),
9508 V2 ? V2 : DAG.getUNDEF(VT),
9509 DAG.getConstant(BitLen, DL, MVT::i8),
9510 DAG.getConstant(BitIdx, DL, MVT::i8));
9515 /// \brief Lower a vector shuffle as a zero or any extension.
9517 /// Given a specific number of elements, element bit width, and extension
9518 /// stride, produce either a zero or any extension based on the available
9519 /// features of the subtarget. The extended elements are consecutive and
9520 /// can start at an offset element index in the input; to avoid excess
9521 /// shuffling, the offset must either be in the bottom lane or at the start
9522 /// of a higher lane. All extended elements must come from the same lane.
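/// For example, with Scale == 2, offset 0, and a v16i8 input, a zero extension
/// becomes a v8i16 extension of the low 8 bytes: a single PMOVZXBW on SSE4.1
/// and later, or a PUNPCKLBW against a zero vector on older targets.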
9524 static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9525 const SDLoc &DL, MVT VT, int Scale, int Offset, bool AnyExt, SDValue InputV,
9526 ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
9527 assert(Scale > 1 && "Need a scale to extend.");
9528 int EltBits = VT.getScalarSizeInBits();
9529 int NumElements = VT.getVectorNumElements();
9530 int NumEltsPerLane = 128 / EltBits;
9531 int OffsetLane = Offset / NumEltsPerLane;
9532 assert((EltBits == 8 || EltBits == 16 || EltBits == 32) &&
9533 "Only 8, 16, and 32 bit elements can be extended.");
9534 assert(Scale * EltBits <= 64 && "Cannot zero extend past 64 bits.");
9535 assert(0 <= Offset && "Extension offset must be non-negative.");
9536 assert((Offset < NumEltsPerLane || Offset % NumEltsPerLane == 0) &&
9537 "Extension offset must be in the first lane or start an upper lane.");
9539 // Check that an index is in the same lane as the base offset.
9540 auto SafeOffset = [&](int Idx) {
9541 return OffsetLane == (Idx / NumEltsPerLane);
9544 // Shift along an input so that the offset base moves to the first element.
9545 auto ShuffleOffset = [&](SDValue V) {
9549 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9550 for (int i = 0; i * Scale < NumElements; ++i) {
9551 int SrcIdx = i + Offset;
9552 ShMask[i] = SafeOffset(SrcIdx) ? SrcIdx : -1;
9554 return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), ShMask);
9557 // Found a valid zext mask! Try various lowering strategies based on the
9558 // input type and available ISA extensions.
9559 if (Subtarget.hasSSE41()) {
9560 // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
9561 // PUNPCK will catch this in a later shuffle match.
9562 if (Offset && Scale == 2 && VT.is128BitVector())
9564 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
9565 NumElements / Scale);
9566 InputV = ShuffleOffset(InputV);
9567 InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
9568 return DAG.getBitcast(VT, InputV);
9571 assert(VT.is128BitVector() && "Only 128-bit vectors can be extended.");
9573 // For any extends we can cheat for larger element sizes and use shuffle
9574 // instructions that can fold with a load and/or copy.
9575 if (AnyExt && EltBits == 32) {
9576 int PSHUFDMask[4] = {Offset, -1, SafeOffset(Offset + 1) ? Offset + 1 : -1,
9578 return DAG.getBitcast(
9579 VT, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9580 DAG.getBitcast(MVT::v4i32, InputV),
9581 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
9583 if (AnyExt && EltBits == 16 && Scale > 2) {
9584 int PSHUFDMask[4] = {Offset / 2, -1,
9585 SafeOffset(Offset + 1) ? (Offset + 1) / 2 : -1, -1};
9586 InputV = DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32,
9587 DAG.getBitcast(MVT::v4i32, InputV),
9588 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
9589 int PSHUFWMask[4] = {1, -1, -1, -1};
9590 unsigned OddEvenOp = (Offset & 1 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW);
9591 return DAG.getBitcast(
9592 VT, DAG.getNode(OddEvenOp, DL, MVT::v8i16,
9593 DAG.getBitcast(MVT::v8i16, InputV),
9594 getV4X86ShuffleImm8ForMask(PSHUFWMask, DL, DAG)));
9597 // The SSE4A EXTRQ instruction can efficiently extend the first 2 lanes to 64-bit integers.
9599 if ((Scale * EltBits) == 64 && EltBits < 32 && Subtarget.hasSSE4A()) {
9600 assert(NumElements == (int)Mask.size() && "Unexpected shuffle mask size!");
9601 assert(VT.is128BitVector() && "Unexpected vector width!");
9603 int LoIdx = Offset * EltBits;
9604 SDValue Lo = DAG.getBitcast(
9605 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9606 DAG.getConstant(EltBits, DL, MVT::i8),
9607 DAG.getConstant(LoIdx, DL, MVT::i8)));
9609 if (isUndefInRange(Mask, NumElements / 2, NumElements / 2) ||
9610 !SafeOffset(Offset + 1))
9611 return DAG.getBitcast(VT, Lo);
9613 int HiIdx = (Offset + 1) * EltBits;
9614 SDValue Hi = DAG.getBitcast(
9615 MVT::v2i64, DAG.getNode(X86ISD::EXTRQI, DL, VT, InputV,
9616 DAG.getConstant(EltBits, DL, MVT::i8),
9617 DAG.getConstant(HiIdx, DL, MVT::i8)));
9618 return DAG.getBitcast(VT,
9619 DAG.getNode(X86ISD::UNPCKL, DL, MVT::v2i64, Lo, Hi));
9622 // If this would require more than 2 unpack instructions to expand, use
9623 // pshufb when available. We can only use more than 2 unpack instructions
9624 // when zero extending i8 elements which also makes it easier to use pshufb.
9625 if (Scale > 4 && EltBits == 8 && Subtarget.hasSSSE3()) {
9626 assert(NumElements == 16 && "Unexpected byte vector width!");
9627 SDValue PSHUFBMask[16];
9628 for (int i = 0; i < 16; ++i) {
9629 int Idx = Offset + (i / Scale);
9630 PSHUFBMask[i] = DAG.getConstant(
9631 (i % Scale == 0 && SafeOffset(Idx)) ? Idx : 0x80, DL, MVT::i8);
9633 InputV = DAG.getBitcast(MVT::v16i8, InputV);
9634 return DAG.getBitcast(
9635 VT, DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8, InputV,
9636 DAG.getBuildVector(MVT::v16i8, DL, PSHUFBMask)));
9639 // If we are extending from an offset, ensure we start on a boundary that
9640 // we can unpack from.
9641 int AlignToUnpack = Offset % (NumElements / Scale);
9642 if (AlignToUnpack) {
9643 SmallVector<int, 8> ShMask((unsigned)NumElements, -1);
9644 for (int i = AlignToUnpack; i < NumElements; ++i)
9645 ShMask[i - AlignToUnpack] = i;
9646 InputV = DAG.getVectorShuffle(VT, DL, InputV, DAG.getUNDEF(VT), ShMask);
9647 Offset -= AlignToUnpack;
9650 // Otherwise emit a sequence of unpacks.
9652 unsigned UnpackLoHi = X86ISD::UNPCKL;
9653 if (Offset >= (NumElements / 2)) {
9654 UnpackLoHi = X86ISD::UNPCKH;
9655 Offset -= (NumElements / 2);
9658 MVT InputVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits), NumElements);
9659 SDValue Ext = AnyExt ? DAG.getUNDEF(InputVT)
9660 : getZeroVector(InputVT, Subtarget, DAG, DL);
9661 InputV = DAG.getBitcast(InputVT, InputV);
9662 InputV = DAG.getNode(UnpackLoHi, DL, InputVT, InputV, Ext);
9666 } while (Scale > 1);
9667 return DAG.getBitcast(VT, InputV);
9670 /// \brief Try to lower a vector shuffle as a zero extension on any microarch.
9672 /// This routine will try to do everything in its power to cleverly lower
9673 /// a shuffle which happens to match the pattern of a zero extend. It doesn't
9674 /// check for the profitability of this lowering; it tries to aggressively
9675 /// match this pattern. It will use all of the micro-architectural details it
9676 /// can to emit an efficient lowering. It handles both blends with all-zero
9677 /// inputs to explicitly zero-extend and undef-lanes (sometimes undef due to
9678 /// masking out later).
9680 /// The reason we have dedicated lowering for zext-style shuffles is that they
9681 /// are both incredibly common and often quite performance sensitive.
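/// For example, the v4i32 mask <0, Z, 1, Z> (Z = zeroable) is recognized as a
/// zero extension of the two low i32 elements of V1 to i64 elements.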
9682 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
9683 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9684 const APInt &Zeroable, const X86Subtarget &Subtarget,
9685 SelectionDAG &DAG) {
9686 int Bits = VT.getSizeInBits();
9687 int NumLanes = Bits / 128;
9688 int NumElements = VT.getVectorNumElements();
9689 int NumEltsPerLane = NumElements / NumLanes;
9690 assert(VT.getScalarSizeInBits() <= 32 &&
9691 "Exceeds 32-bit integer zero extension limit");
9692 assert((int)Mask.size() == NumElements && "Unexpected shuffle mask size");
9694 // Define a helper function to check a particular ext-scale and lower to it if valid.
9696 auto Lower = [&](int Scale) -> SDValue {
9701 for (int i = 0; i < NumElements; ++i) {
9704 continue; // Valid anywhere but doesn't tell us anything.
9705 if (i % Scale != 0) {
9706 // Each of the extended elements needs to be zeroable.
9710 // We are no longer in the anyext case.
9715 // Each of the base elements needs to be consecutive indices into the
9716 // same input vector.
9717 SDValue V = M < NumElements ? V1 : V2;
9718 M = M % NumElements;
9721 Offset = M - (i / Scale);
9722 } else if (InputV != V)
9723 return SDValue(); // Flip-flopping inputs.
9725 // Offset must start in the lowest 128-bit lane or at the start of an upper lane.
9727 // FIXME: Is it ever worth allowing a negative base offset?
9728 if (!((0 <= Offset && Offset < NumEltsPerLane) ||
9729 (Offset % NumEltsPerLane) == 0))
9732 // If we are offsetting, all referenced entries must come from the same lane.
9734 if (Offset && (Offset / NumEltsPerLane) != (M / NumEltsPerLane))
9737 if ((M % NumElements) != (Offset + (i / Scale)))
9738 return SDValue(); // Non-consecutive strided elements.
9742 // If we fail to find an input, we have a zero-shuffle which should always
9743 // have already been handled.
9744 // FIXME: Maybe handle this here in case during blending we end up with one?
9748 // If we are offsetting, don't extend if we only match a single input; we
9749 // can always do better by using a basic PSHUF or PUNPCK.
9750 if (Offset != 0 && Matches < 2)
9753 return lowerVectorShuffleAsSpecificZeroOrAnyExtend(
9754 DL, VT, Scale, Offset, AnyExt, InputV, Mask, Subtarget, DAG);
9757 // The widest scale possible for extending is to a 64-bit integer.
9758 assert(Bits % 64 == 0 &&
9759 "The number of bits in a vector must be divisible by 64 on x86!");
9760 int NumExtElements = Bits / 64;
9762 // Each iteration, try extending the elements half as much, but into twice as many elements.
9764 for (; NumExtElements < NumElements; NumExtElements *= 2) {
9765 assert(NumElements % NumExtElements == 0 &&
9766 "The input vector size must be divisible by the extended size.");
9767 if (SDValue V = Lower(NumElements / NumExtElements))
9771 // General extends failed, but 128-bit vectors may be able to use MOVQ.
9775 // Returns one of the source operands if the shuffle can be reduced to a
9776 // MOVQ, copying the lower 64-bits and zero-extending to the upper 64-bits.
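// For example, the v4i32 mask <0, 1, Z, Z> (upper half zeroable) keeps the
// low 64 bits of V1 and zeroes the upper 64 bits, which is exactly MOVQ.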
9777 auto CanZExtLowHalf = [&]() {
9778 for (int i = NumElements / 2; i != NumElements; ++i)
9781 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, 0))
9783 if (isSequentialOrUndefInRange(Mask, 0, NumElements / 2, NumElements))
9788 if (SDValue V = CanZExtLowHalf()) {
9789 V = DAG.getBitcast(MVT::v2i64, V);
9790 V = DAG.getNode(X86ISD::VZEXT_MOVL, DL, MVT::v2i64, V);
9791 return DAG.getBitcast(VT, V);
9794 // No viable ext lowering found.
9798 /// \brief Try to get a scalar value for a specific element of a vector.
9800 /// Looks through BUILD_VECTOR and SCALAR_TO_VECTOR nodes to find a scalar.
9801 static SDValue getScalarValueForVectorElement(SDValue V, int Idx,
9802 SelectionDAG &DAG) {
9803 MVT VT = V.getSimpleValueType();
9804 MVT EltVT = VT.getVectorElementType();
9805 V = peekThroughBitcasts(V);
9807 // If the bitcasts shift the element size, we can't extract an equivalent element from it.
9809 MVT NewVT = V.getSimpleValueType();
9810 if (!NewVT.isVector() || NewVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
9813 if (V.getOpcode() == ISD::BUILD_VECTOR ||
9814 (Idx == 0 && V.getOpcode() == ISD::SCALAR_TO_VECTOR)) {
9815 // Ensure the scalar operand is the same size as the destination.
9816 // FIXME: Add support for scalar truncation where possible.
9817 SDValue S = V.getOperand(Idx);
9818 if (EltVT.getSizeInBits() == S.getSimpleValueType().getSizeInBits())
9819 return DAG.getBitcast(EltVT, S);
9825 /// \brief Helper to test for a load that can be folded with x86 shuffles.
9827 /// This is particularly important because the set of instructions varies
9828 /// significantly based on whether the operand is a load or not.
9829 static bool isShuffleFoldableLoad(SDValue V) {
9830 V = peekThroughBitcasts(V);
9831 return ISD::isNON_EXTLoad(V.getNode());
9834 /// \brief Try to lower insertion of a single element into a zero vector.
9836 /// This is a common pattern for which we have especially efficient patterns to lower
9837 /// across all subtarget feature sets.
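/// For example, the v4f32 mask <4, Z, Z, Z> (Z = zeroable) inserts the low
/// element of V2 into a zero vector, which is emitted as a VZEXT_MOVL (move
/// the low element and zero the rest of the vector).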
9838 static SDValue lowerVectorShuffleAsElementInsertion(
9839 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
9840 const APInt &Zeroable, const X86Subtarget &Subtarget,
9841 SelectionDAG &DAG) {
9843 MVT EltVT = VT.getVectorElementType();
9846 find_if(Mask, [&Mask](int M) { return M >= (int)Mask.size(); }) -
9848 bool IsV1Zeroable = true;
9849 for (int i = 0, Size = Mask.size(); i < Size; ++i)
9850 if (i != V2Index && !Zeroable[i]) {
9851 IsV1Zeroable = false;
9855 // Check for a single input from a SCALAR_TO_VECTOR node.
9856 // FIXME: All of this should be canonicalized into INSERT_VECTOR_ELT and
9857 // all the smarts here sunk into that routine. However, the current
9858 // lowering of BUILD_VECTOR makes that nearly impossible until the old
9859 // vector shuffle lowering is dead.
9860 SDValue V2S = getScalarValueForVectorElement(V2, Mask[V2Index] - Mask.size(),
9862 if (V2S && DAG.getTargetLoweringInfo().isTypeLegal(V2S.getValueType())) {
9863 // We need to zext the scalar if it is smaller than an i32.
9864 V2S = DAG.getBitcast(EltVT, V2S);
9865 if (EltVT == MVT::i8 || EltVT == MVT::i16) {
9866 // Using zext to expand a narrow element won't work for non-zero insertions.
9871 // Zero-extend directly to i32.
9873 V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
9875 V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
9876 } else if (Mask[V2Index] != (int)Mask.size() || EltVT == MVT::i8 ||
9877 EltVT == MVT::i16) {
9878 // Either not inserting from the low element of the input or the input
9879 // element size is too small to use VZEXT_MOVL to clear the high bits.
9883 if (!IsV1Zeroable) {
9884 // If V1 can't be treated as a zero vector we have fewer options to lower
9885 // this. We can't support integer vectors or non-zero targets cheaply, and
9886 // the V1 elements can't be permuted in any way.
9887 assert(VT == ExtVT && "Cannot change extended type when non-zeroable!");
9888 if (!VT.isFloatingPoint() || V2Index != 0)
9890 SmallVector<int, 8> V1Mask(Mask.begin(), Mask.end());
9891 V1Mask[V2Index] = -1;
9892 if (!isNoopShuffleMask(V1Mask))
9894 // This is essentially a special case blend operation, but if we have
9895 // general purpose blend operations, they are always faster. Bail and let
9896 // the rest of the lowering handle these as blends.
9897 if (Subtarget.hasSSE41())
9900 // Otherwise, use MOVSD or MOVSS.
9901 assert((EltVT == MVT::f32 || EltVT == MVT::f64) &&
9902 "Only two types of floating point element types to handle!");
9903 return DAG.getNode(EltVT == MVT::f32 ? X86ISD::MOVSS : X86ISD::MOVSD, DL,
9907 // This lowering only works for the low element with floating point vectors.
9908 if (VT.isFloatingPoint() && V2Index != 0)
9911 V2 = DAG.getNode(X86ISD::VZEXT_MOVL, DL, ExtVT, V2);
9913 V2 = DAG.getBitcast(VT, V2);
9916 // If we have 4 or fewer lanes we can cheaply shuffle the element into
9917 // the desired position. Otherwise it is more efficient to do a vector
9918 // shift left. We know that we can do a vector shift left because all
9919 // the inputs are zero.
9920 if (VT.isFloatingPoint() || VT.getVectorNumElements() <= 4) {
9921 SmallVector<int, 4> V2Shuffle(Mask.size(), 1);
9922 V2Shuffle[V2Index] = 0;
9923 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Shuffle);
9925 V2 = DAG.getBitcast(MVT::v16i8, V2);
9927 X86ISD::VSHLDQ, DL, MVT::v16i8, V2,
9928 DAG.getConstant(V2Index * EltVT.getSizeInBits() / 8, DL,
9929 DAG.getTargetLoweringInfo().getScalarShiftAmountTy(
9930 DAG.getDataLayout(), VT)));
9931 V2 = DAG.getBitcast(VT, V2);
9937 /// Try to lower broadcast of a single - truncated - integer element,
9938 /// coming from a scalar_to_vector/build_vector node \p V0 with larger elements.
9940 /// This assumes we have AVX2.
9941 static SDValue lowerVectorShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT,
9942 SDValue V0, int BroadcastIdx,
9943 const X86Subtarget &Subtarget,
9944 SelectionDAG &DAG) {
9945 assert(Subtarget.hasAVX2() &&
9946 "We can only lower integer broadcasts with AVX2!");
9948 EVT EltVT = VT.getVectorElementType();
9949 EVT V0VT = V0.getValueType();
9951 assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
9952 assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
9954 EVT V0EltVT = V0VT.getVectorElementType();
9955 if (!V0EltVT.isInteger())
9958 const unsigned EltSize = EltVT.getSizeInBits();
9959 const unsigned V0EltSize = V0EltVT.getSizeInBits();
9961 // This is only a truncation if the original element type is larger.
9962 if (V0EltSize <= EltSize)
9965 assert(((V0EltSize % EltSize) == 0) &&
9966 "Scalar type sizes must all be powers of 2 on x86!");
9968 const unsigned V0Opc = V0.getOpcode();
9969 const unsigned Scale = V0EltSize / EltSize;
9970 const unsigned V0BroadcastIdx = BroadcastIdx / Scale;
9972 if ((V0Opc != ISD::SCALAR_TO_VECTOR || V0BroadcastIdx != 0) &&
9973 V0Opc != ISD::BUILD_VECTOR)
9976 SDValue Scalar = V0.getOperand(V0BroadcastIdx);
9978 // If we're extracting non-least-significant bits, shift so we can truncate.
9979 // Hopefully, we can fold away the trunc/srl/load into the broadcast.
9980 // Even if we can't (and !isShuffleFoldableLoad(Scalar)), prefer
9981 // vpbroadcast+vmovd+shr to vpshufb(m)+vmovd.
9982 if (const int OffsetIdx = BroadcastIdx % Scale)
9983 Scalar = DAG.getNode(ISD::SRL, DL, Scalar.getValueType(), Scalar,
9984 DAG.getConstant(OffsetIdx * EltSize, DL, Scalar.getValueType()));
9986 return DAG.getNode(X86ISD::VBROADCAST, DL, VT,
9987 DAG.getNode(ISD::TRUNCATE, DL, EltVT, Scalar));
9990 /// \brief Try to lower broadcast of a single element.
9992 /// For convenience, this code also bundles all of the subtarget feature set
9993 /// filtering. While a little annoying to re-dispatch on type here, there isn't
9994 /// a convenient way to factor it out.
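/// For example, a v4f32 splat mask <0, 0, 0, 0> becomes a VBROADCASTSS when
/// the source is a foldable load (or a register with AVX2), and a v2f64 splat
/// becomes MOVDDUP with SSE3.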
9995 static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
9996 SDValue V1, SDValue V2,
9998 const X86Subtarget &Subtarget,
9999 SelectionDAG &DAG) {
10000 if (!((Subtarget.hasSSE3() && VT == MVT::v2f64) ||
10001 (Subtarget.hasAVX() && VT.isFloatingPoint()) ||
10002 (Subtarget.hasAVX2() && VT.isInteger())))
10005 // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
10006 // we can only broadcast from a register with AVX2.
10007 unsigned NumElts = Mask.size();
10008 unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
10009 bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
10011 // Check that the mask is a broadcast.
10012 int BroadcastIdx = -1;
10013 for (int i = 0; i != (int)NumElts; ++i) {
10014 SmallVector<int, 8> BroadcastMask(NumElts, i);
10015 if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
10021 if (BroadcastIdx < 0)
10023 assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
10024 "a sorted mask where the broadcast "
10027 // Go up the chain of (vector) values to find a scalar load that we can
10028 // combine with the broadcast.
10031 switch (V.getOpcode()) {
10032 case ISD::BITCAST: {
10033 SDValue VSrc = V.getOperand(0);
10034 MVT SrcVT = VSrc.getSimpleValueType();
10035 if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
10040 case ISD::CONCAT_VECTORS: {
10041 int OperandSize = Mask.size() / V.getNumOperands();
10042 V = V.getOperand(BroadcastIdx / OperandSize);
10043 BroadcastIdx %= OperandSize;
10046 case ISD::INSERT_SUBVECTOR: {
10047 SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
10048 auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
10052 int BeginIdx = (int)ConstantIdx->getZExtValue();
10054 BeginIdx + (int)VInner.getSimpleValueType().getVectorNumElements();
10055 if (BroadcastIdx >= BeginIdx && BroadcastIdx < EndIdx) {
10056 BroadcastIdx -= BeginIdx;
10067 // Check if this is a broadcast of a scalar. We special case lowering
10068 // for scalars so that we can more effectively fold with loads.
10069 // First, look through bitcast: if the original value has a larger element
10070 // type than the shuffle, the broadcast element is in essence truncated.
10071 // Make that explicit to ease folding.
10072 if (V.getOpcode() == ISD::BITCAST && VT.isInteger())
10073 if (SDValue TruncBroadcast = lowerVectorShuffleAsTruncBroadcast(
10074 DL, VT, V.getOperand(0), BroadcastIdx, Subtarget, DAG))
10075 return TruncBroadcast;
10077 MVT BroadcastVT = VT;
10079 // Peek through any bitcast (only useful for loads).
10080 SDValue BC = peekThroughBitcasts(V);
10082 // Also check the simpler case, where we can directly reuse the scalar.
10083 if (V.getOpcode() == ISD::BUILD_VECTOR ||
10084 (V.getOpcode() == ISD::SCALAR_TO_VECTOR && BroadcastIdx == 0)) {
10085 V = V.getOperand(BroadcastIdx);
10087 // If we can't broadcast from a register, check that the input is a load.
10088 if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
10090 } else if (MayFoldLoad(BC) && !cast<LoadSDNode>(BC)->isVolatile()) {
10091 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10092 if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
10093 BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
10094 Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
10097 // If we are broadcasting a load that is only used by the shuffle
10098 // then we can reduce the vector load to the broadcasted scalar load.
10099 LoadSDNode *Ld = cast<LoadSDNode>(BC);
10100 SDValue BaseAddr = Ld->getOperand(1);
10101 EVT SVT = BroadcastVT.getScalarType();
10102 unsigned Offset = BroadcastIdx * SVT.getStoreSize();
10103 SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
10104 V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
10105 DAG.getMachineFunction().getMachineMemOperand(
10106 Ld->getMemOperand(), Offset, SVT.getStoreSize()));
10107 DAG.makeEquivalentMemoryOrdering(Ld, V);
10108 } else if (!BroadcastFromReg) {
10109 // We can't broadcast from a vector register.
10111 } else if (BroadcastIdx != 0) {
10112 // We can only broadcast from the zero-element of a vector register,
10113 // but it can be advantageous to broadcast from the zero-element of a subvector.
10115 if (!VT.is256BitVector() && !VT.is512BitVector())
10118 // VPERMQ/VPERMPD can perform the cross-lane shuffle directly.
10119 if (VT == MVT::v4f64 || VT == MVT::v4i64)
10122 // Only broadcast the zero-element of a 128-bit subvector.
10123 unsigned EltSize = VT.getScalarSizeInBits();
10124 if (((BroadcastIdx * EltSize) % 128) != 0)
10127 // The shuffle input might have been a bitcast we looked through; look at
10128 // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
10129 // later bitcast it to BroadcastVT.
10130 MVT SrcVT = V.getSimpleValueType();
10131 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10132 "Unexpected vector element size");
10133 assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
10134 "Unexpected vector size");
10136 MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
10137 V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
10138 DAG.getIntPtrConstant(BroadcastIdx, DL));
10141 if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
10142 V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
10143 DAG.getBitcast(MVT::f64, V));
10145 // Bitcast back to the same scalar type as BroadcastVT.
10146 MVT SrcVT = V.getSimpleValueType();
10147 if (SrcVT.getScalarType() != BroadcastVT.getScalarType()) {
10148 assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
10149 "Unexpected vector element size");
10150 if (SrcVT.isVector()) {
10151 unsigned NumSrcElts = SrcVT.getVectorNumElements();
10152 SrcVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
10154 SrcVT = BroadcastVT.getScalarType();
10156 V = DAG.getBitcast(SrcVT, V);
10159 // 32-bit targets need to load i64 as a f64 and then bitcast the result.
10160 if (!Subtarget.is64Bit() && SrcVT == MVT::i64) {
10161 V = DAG.getBitcast(MVT::f64, V);
10162 unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
10163 BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
10166 // We only support broadcasting from 128-bit vectors to minimize the
10167 // number of patterns we need to deal with in isel. So extract down to 128 bits.
10169 if (SrcVT.getSizeInBits() > 128)
10170 V = extract128BitVector(V, 0, DAG, DL);
10172 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
10175 // Check for whether we can use INSERTPS to perform the shuffle. We only use
10176 // INSERTPS when the V1 elements are already in the correct locations
10177 // because otherwise we can just always use two SHUFPS instructions which
10178 // are much smaller to encode than a SHUFPS and an INSERTPS. We can also
10179 // perform INSERTPS if a single V1 element is out of place and all V2
10180 // elements are zeroable.
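// For example, the v4f32 mask <4, 1, 2, 3> (with no zeroable elements) matches
// INSERTPS with immediate 0x00, inserting element 0 of V2 into lane 0 of V1.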
10181 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
10182 unsigned &InsertPSMask,
10183 const APInt &Zeroable,
10184 ArrayRef<int> Mask,
10185 SelectionDAG &DAG) {
10186 assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
10187 assert(V2.getSimpleValueType().is128BitVector() && "Bad operand type!");
10188 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10190 // Attempt to match INSERTPS with one element from VA or VB being
10191 // inserted into VA (or undef). If successful, V1, V2 and InsertPSMask will be updated.
10193 auto matchAsInsertPS = [&](SDValue VA, SDValue VB,
10194 ArrayRef<int> CandidateMask) {
10195 unsigned ZMask = 0;
10196 int VADstIndex = -1;
10197 int VBDstIndex = -1;
10198 bool VAUsedInPlace = false;
10200 for (int i = 0; i < 4; ++i) {
10201 // Synthesize a zero mask from the zeroable elements (includes undefs).
10207 // Flag if we use any VA inputs in place.
10208 if (i == CandidateMask[i]) {
10209 VAUsedInPlace = true;
10213 // We can only insert a single non-zeroable element.
10214 if (VADstIndex >= 0 || VBDstIndex >= 0)
10217 if (CandidateMask[i] < 4) {
10218 // VA input out of place for insertion.
10221 // VB input for insertion.
10226 // Don't bother if we have no (non-zeroable) element for insertion.
10227 if (VADstIndex < 0 && VBDstIndex < 0)
10230 // Determine element insertion src/dst indices. The src index is from the
10231 // start of the inserted vector, not the start of the concatenated vector.
10232 unsigned VBSrcIndex = 0;
10233 if (VADstIndex >= 0) {
10234 // If we have a VA input out of place, we use VA as the V2 element
10235 // insertion and don't use the original V2 at all.
10236 VBSrcIndex = CandidateMask[VADstIndex];
10237 VBDstIndex = VADstIndex;
10240 VBSrcIndex = CandidateMask[VBDstIndex] - 4;
10243 // If no V1 inputs are used in place, then the result is created only from
10244 // the zero mask and the V2 insertion - so remove V1 dependency.
10245 if (!VAUsedInPlace)
10246 VA = DAG.getUNDEF(MVT::v4f32);
10248 // Update V1, V2 and InsertPSMask accordingly.
10252 // Insert the V2 element into the desired position.
10253 InsertPSMask = VBSrcIndex << 6 | VBDstIndex << 4 | ZMask;
10254 assert((InsertPSMask & ~0xFFu) == 0 && "Invalid mask!");
10258 if (matchAsInsertPS(V1, V2, Mask))
10261 // Commute and try again.
10262 SmallVector<int, 4> CommutedMask(Mask.begin(), Mask.end());
10263 ShuffleVectorSDNode::commuteMask(CommutedMask);
10264 if (matchAsInsertPS(V2, V1, CommutedMask))
10270 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
10271 SDValue V2, ArrayRef<int> Mask,
10272 const APInt &Zeroable,
10273 SelectionDAG &DAG) {
10274 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10275 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10277 // Attempt to match the insertps pattern.
10278 unsigned InsertPSMask;
10279 if (!matchVectorShuffleAsInsertPS(V1, V2, InsertPSMask, Zeroable, Mask, DAG))
10282 // Insert the V2 element into the desired position.
10283 return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, V1, V2,
10284 DAG.getConstant(InsertPSMask, DL, MVT::i8));
10287 /// \brief Try to lower a shuffle as a permute of the inputs followed by an
10288 /// UNPCK instruction.
10290 /// This specifically targets cases where we end up with alternating between
10291 /// the two inputs, and so can permute them into something that feeds a single
10292 /// UNPCK instruction. Note that this routine only targets integer vectors
10293 /// because for floating point vectors we have a generalized SHUFPS lowering
10294 /// strategy that handles everything that doesn't *exactly* match an unpack,
10295 /// making this clever lowering unnecessary.
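/// For example, the v4i32 mask <1, 5, 3, 7> is not itself an unpack, but
/// after permuting each input with <1, 3, u, u> a single UNPCKL produces it.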
10296 static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
10297 SDValue V1, SDValue V2,
10298 ArrayRef<int> Mask,
10299 SelectionDAG &DAG) {
10300 assert(!VT.isFloatingPoint() &&
10301 "This routine only supports integer vectors.");
10302 assert(VT.is128BitVector() &&
10303 "This routine only works on 128-bit vectors.");
10304 assert(!V2.isUndef() &&
10305 "This routine should only be used when blending two inputs.");
10306 assert(Mask.size() >= 2 && "Single element masks are invalid.");
10308 int Size = Mask.size();
10311 count_if(Mask, [Size](int M) { return M >= 0 && M % Size < Size / 2; });
10313 count_if(Mask, [Size](int M) { return M % Size >= Size / 2; });
10315 bool UnpackLo = NumLoInputs >= NumHiInputs;
10317 auto TryUnpack = [&](int ScalarSize, int Scale) {
10318 SmallVector<int, 16> V1Mask((unsigned)Size, -1);
10319 SmallVector<int, 16> V2Mask((unsigned)Size, -1);
10321 for (int i = 0; i < Size; ++i) {
10325 // Each element of the unpack contains Scale elements from this mask.
10326 int UnpackIdx = i / Scale;
10328 // We only handle the case where V1 feeds the first slots of the unpack.
10329 // We rely on canonicalization to ensure this is the case.
10330 if ((UnpackIdx % 2 == 0) != (Mask[i] < Size))
10333 // Set up the mask for this input. The indexing is tricky as we have to
10334 // handle the unpack stride.
10335 SmallVectorImpl<int> &VMask = (UnpackIdx % 2 == 0) ? V1Mask : V2Mask;
10336 VMask[(UnpackIdx / 2) * Scale + i % Scale + (UnpackLo ? 0 : Size / 2)] =
10340 // If we will have to shuffle both inputs to use the unpack, check whether
10341 // we can just unpack first and shuffle the result. If so, skip this unpack.
10342 if ((NumLoInputs == 0 || NumHiInputs == 0) && !isNoopShuffleMask(V1Mask) &&
10343 !isNoopShuffleMask(V2Mask))
10346 // Shuffle the inputs into place.
10347 V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), V1Mask);
10348 V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), V2Mask);
10350 // Cast the inputs to the type we will use to unpack them.
10351 MVT UnpackVT = MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), Size / Scale);
10352 V1 = DAG.getBitcast(UnpackVT, V1);
10353 V2 = DAG.getBitcast(UnpackVT, V2);
10355 // Unpack the inputs and cast the result back to the desired type.
10356 return DAG.getBitcast(
10357 VT, DAG.getNode(UnpackLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
10358 UnpackVT, V1, V2));
10361 // We try each unpack from the largest to the smallest to try and find one
10362 // that fits this mask.
10363 int OrigScalarSize = VT.getScalarSizeInBits();
10364 for (int ScalarSize = 64; ScalarSize >= OrigScalarSize; ScalarSize /= 2)
10365 if (SDValue Unpack = TryUnpack(ScalarSize, ScalarSize / OrigScalarSize))
10368 // If none of the unpack-rooted lowerings worked (or were profitable) try an initial unpack.
10370 if (NumLoInputs == 0 || NumHiInputs == 0) {
10371 assert((NumLoInputs > 0 || NumHiInputs > 0) &&
10372 "We have to have *some* inputs!");
10373 int HalfOffset = NumLoInputs == 0 ? Size / 2 : 0;
10375 // FIXME: We could consider the total complexity of the permute of each
10376 // possible unpacking. Or at the least we should consider how many
10377 // half-crossings are created.
10378 // FIXME: We could consider commuting the unpacks.
10380 SmallVector<int, 32> PermMask((unsigned)Size, -1);
10381 for (int i = 0; i < Size; ++i) {
10385 assert(Mask[i] % Size >= HalfOffset && "Found input from wrong half!");
10388 2 * ((Mask[i] % Size) - HalfOffset) + (Mask[i] < Size ? 0 : 1);
10390 return DAG.getVectorShuffle(
10391 VT, DL, DAG.getNode(NumLoInputs == 0 ? X86ISD::UNPCKH : X86ISD::UNPCKL,
10393 DAG.getUNDEF(VT), PermMask);
10399 /// \brief Handle lowering of 2-lane 64-bit floating point shuffles.
10401 /// This is the basis function for the 2-lane 64-bit shuffles as we have full
10402 /// support for floating point shuffles but not integer shuffles. These
10403 /// instructions will incur a domain crossing penalty on some chips though so
10404 /// it is better to avoid lowering through this for integer vectors where possible.
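/// For example, the single-input swap mask <1, 0> becomes a VPERMILPD on AVX
/// (allowing a folded load) or a SHUFPD of the input with itself otherwise.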
10406 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10407 const APInt &Zeroable,
10408 SDValue V1, SDValue V2,
10409 const X86Subtarget &Subtarget,
10410 SelectionDAG &DAG) {
10411 assert(V1.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10412 assert(V2.getSimpleValueType() == MVT::v2f64 && "Bad operand type!");
10413 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10415 if (V2.isUndef()) {
10416 // Check for being able to broadcast a single element.
10417 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10418 DL, MVT::v2f64, V1, V2, Mask, Subtarget, DAG))
10421 // Straight shuffle of a single input vector. Simulate this by using the
10422 // single input as both of the "inputs" to this instruction.
10423 unsigned SHUFPDMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1);
10425 if (Subtarget.hasAVX()) {
10426 // If we have AVX, we can use VPERMILPD which will allow folding a load
10427 // into the shuffle.
10428 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v2f64, V1,
10429 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10432 return DAG.getNode(
10433 X86ISD::SHUFP, DL, MVT::v2f64,
10434 Mask[0] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10435 Mask[1] == SM_SentinelUndef ? DAG.getUNDEF(MVT::v2f64) : V1,
10436 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10438 assert(Mask[0] >= 0 && Mask[0] < 2 && "Non-canonicalized blend!");
10439 assert(Mask[1] >= 2 && "Non-canonicalized blend!");
10441 // If we have a single input, insert that into V1 if we can do so cheaply.
10442 if ((Mask[0] >= 2) + (Mask[1] >= 2) == 1) {
10443 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10444 DL, MVT::v2f64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10446 // Try inverting the insertion since for v2 masks it is easy to do and we
10447 // can't reliably sort the mask one way or the other.
10448 int InverseMask[2] = {Mask[0] < 0 ? -1 : (Mask[0] ^ 2),
10449 Mask[1] < 0 ? -1 : (Mask[1] ^ 2)};
10450 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10451 DL, MVT::v2f64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10455 // Try to use one of the special instruction patterns to handle two common
10456 // blend patterns if a zero-blend above didn't work.
10457 if (isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
10458 isShuffleEquivalent(V1, V2, Mask, {1, 3}))
10459 if (SDValue V1S = getScalarValueForVectorElement(V1, Mask[0], DAG))
10460 // We can either use a special instruction to load over the low double or
10461 // to move just the low double.
10462 return DAG.getNode(
10463 isShuffleFoldableLoad(V1S) ? X86ISD::MOVLPD : X86ISD::MOVSD,
10464 DL, MVT::v2f64, V2,
10465 DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64, V1S));
10467 if (Subtarget.hasSSE41())
10468 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2f64, V1, V2, Mask,
10469 Zeroable, Subtarget, DAG))
10472 // Use dedicated unpack instructions for masks that match their pattern.
10474 lowerVectorShuffleWithUNPCK(DL, MVT::v2f64, Mask, V1, V2, DAG))
10477 unsigned SHUFPDMask = (Mask[0] == 1) | (((Mask[1] - 2) == 1) << 1);
10478 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v2f64, V1, V2,
10479 DAG.getConstant(SHUFPDMask, DL, MVT::i8));
10482 /// \brief Handle lowering of 2-lane 64-bit integer shuffles.
10484 /// Tries to lower a 2-lane 64-bit shuffle using shuffle operations provided by
10485 /// the integer unit to minimize domain crossing penalties. However, for blends
10486 /// it falls back to the floating point shuffle operation with appropriate bit casting.
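/// For example, the single-input v2i64 swap mask <1, 0> is widened to the
/// v4i32 mask <2, 3, 0, 1> and emitted as a single PSHUFD.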
10488 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10489 const APInt &Zeroable,
10490 SDValue V1, SDValue V2,
10491 const X86Subtarget &Subtarget,
10492 SelectionDAG &DAG) {
10493 assert(V1.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10494 assert(V2.getSimpleValueType() == MVT::v2i64 && "Bad operand type!");
10495 assert(Mask.size() == 2 && "Unexpected mask size for v2 shuffle!");
10497 if (V2.isUndef()) {
10498 // Check for being able to broadcast a single element.
10499 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10500 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10503 // Straight shuffle of a single input vector. For everything from SSE2
10504 // onward this has a single fast instruction with no scary immediates.
10505 // We have to map the mask as it is actually a v4i32 shuffle instruction.
10506 V1 = DAG.getBitcast(MVT::v4i32, V1);
10507 int WidenedMask[4] = {
10508 std::max(Mask[0], 0) * 2, std::max(Mask[0], 0) * 2 + 1,
10509 std::max(Mask[1], 0) * 2, std::max(Mask[1], 0) * 2 + 1};
10510 return DAG.getBitcast(
10512 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10513 getV4X86ShuffleImm8ForMask(WidenedMask, DL, DAG)));
10515 assert(Mask[0] != -1 && "No undef lanes in multi-input v2 shuffles!");
10516 assert(Mask[1] != -1 && "No undef lanes in multi-input v2 shuffles!");
10517 assert(Mask[0] < 2 && "We sort V1 to be the first input.");
10518 assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
10520 // If we have a blend of two same-type PACKUS operations and the blend aligns
10521 // with the low and high halves, we can just merge the PACKUS operations.
10522 // This is particularly important as it lets us merge shuffles that this
10523 // routine itself creates.
10524 auto GetPackNode = [](SDValue V) {
10525 V = peekThroughBitcasts(V);
10526 return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
10528 if (SDValue V1Pack = GetPackNode(V1))
10529 if (SDValue V2Pack = GetPackNode(V2)) {
10530 EVT PackVT = V1Pack.getValueType();
10531 if (PackVT == V2Pack.getValueType())
10532 return DAG.getBitcast(MVT::v2i64,
10533 DAG.getNode(X86ISD::PACKUS, DL, PackVT,
10534 Mask[0] == 0 ? V1Pack.getOperand(0)
10535 : V1Pack.getOperand(1),
10536 Mask[1] == 2 ? V2Pack.getOperand(0)
10537 : V2Pack.getOperand(1)));
10540 // Try to use shift instructions.
10541 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
10542 Zeroable, Subtarget, DAG))
10545 // When loading a scalar and then shuffling it into a vector we can often do
10546 // the insertion cheaply.
10547 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10548 DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG))
10550 // Try inverting the insertion since for v2 masks it is easy to do and we
10551 // can't reliably sort the mask one way or the other.
10552 int InverseMask[2] = {Mask[0] ^ 2, Mask[1] ^ 2};
10553 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
10554 DL, MVT::v2i64, V2, V1, InverseMask, Zeroable, Subtarget, DAG))
10557 // We have different paths for blend lowering, but they all must use the
10558 // *exact* same predicate.
10559 bool IsBlendSupported = Subtarget.hasSSE41();
10560 if (IsBlendSupported)
10561 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v2i64, V1, V2, Mask,
10562 Zeroable, Subtarget, DAG))
10565 // Use dedicated unpack instructions for masks that match their pattern.
10567 lowerVectorShuffleWithUNPCK(DL, MVT::v2i64, Mask, V1, V2, DAG))
10570 // Try to use byte rotation instructions.
10571 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10572 if (Subtarget.hasSSSE3())
10573 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10574 DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
10577 // If we have direct support for blends, we should lower by decomposing into
10578 // a permute. That will be faster than the domain cross.
10579 if (IsBlendSupported)
10580 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v2i64, V1, V2,
10583 // We implement this with SHUFPD which is pretty lame because it will likely
10584 // incur 2 cycles of stall for integer vectors on Nehalem and older chips.
10585 // However, all the alternatives are still more cycles and newer chips don't
10586 // have this problem. It would be really nice if x86 had better shuffles here.
10587 V1 = DAG.getBitcast(MVT::v2f64, V1);
10588 V2 = DAG.getBitcast(MVT::v2f64, V2);
10589 return DAG.getBitcast(MVT::v2i64,
10590 DAG.getVectorShuffle(MVT::v2f64, DL, V1, V2, Mask));
10593 /// \brief Test whether this can be lowered with a single SHUFPS instruction.
10595 /// This is used to disable more specialized lowerings when the shufps lowering
10596 /// will happen to be efficient.
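/// For example, <0, 1, 4, 5> needs only one SHUFPS (low half from V1, high
/// half from V2), while <0, 4, 1, 5> does not because its low half needs
/// elements from both inputs.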
10597 static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
10598 // This routine only handles 128-bit shufps.
10599 assert(Mask.size() == 4 && "Unsupported mask size!");
10600 assert(Mask[0] >= -1 && Mask[0] < 8 && "Out of bound mask element!");
10601 assert(Mask[1] >= -1 && Mask[1] < 8 && "Out of bound mask element!");
10602 assert(Mask[2] >= -1 && Mask[2] < 8 && "Out of bound mask element!");
10603 assert(Mask[3] >= -1 && Mask[3] < 8 && "Out of bound mask element!");
10605 // To lower with a single SHUFPS we need to have the low half and high half
10606 // each requiring a single input.
10607 if (Mask[0] >= 0 && Mask[1] >= 0 && (Mask[0] < 4) != (Mask[1] < 4))
10609 if (Mask[2] >= 0 && Mask[3] >= 0 && (Mask[2] < 4) != (Mask[3] < 4))
10615 /// \brief Lower a vector shuffle using the SHUFPS instruction.
10617 /// This is a helper routine dedicated to lowering vector shuffles using SHUFPS.
10618 /// It makes no assumptions about whether this is the *best* lowering; it simply uses it.
10620 static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
10621 ArrayRef<int> Mask, SDValue V1,
10622 SDValue V2, SelectionDAG &DAG) {
10623 SDValue LowV = V1, HighV = V2;
10624 int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
10626 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10628 if (NumV2Elements == 1) {
10629 int V2Index = find_if(Mask, [](int M) { return M >= 4; }) - Mask.begin();
10631 // Compute the index adjacent to V2Index and in the same half by toggling the low bit.
10633 int V2AdjIndex = V2Index ^ 1;
10635 if (Mask[V2AdjIndex] < 0) {
10636 // Handles all the cases where we have a single V2 element and an undef.
10637 // This will only ever happen in the high lanes because we commute the
10638 // vector otherwise.
10640 std::swap(LowV, HighV);
10641 NewMask[V2Index] -= 4;
10643 // Handle the case where the V2 element ends up adjacent to a V1 element.
10644 // To make this work, blend them together as the first step.
10645 int V1Index = V2AdjIndex;
10646 int BlendMask[4] = {Mask[V2Index] - 4, 0, Mask[V1Index], 0};
10647 V2 = DAG.getNode(X86ISD::SHUFP, DL, VT, V2, V1,
10648 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10650 // Now proceed to reconstruct the final blend as we have the necessary
10651 // high or low half formed.
10658 NewMask[V1Index] = 2; // We put the V1 element in V2[2].
10659 NewMask[V2Index] = 0; // We shifted the V2 element into V2[0].
10661 } else if (NumV2Elements == 2) {
10662 if (Mask[0] < 4 && Mask[1] < 4) {
10663 // Handle the easy case where we have V1 in the low lanes and V2 in the high lanes.
10667 } else if (Mask[2] < 4 && Mask[3] < 4) {
10668 // We also handle the reversed case because this utility may get called
10669 // when we detect a SHUFPS pattern but can't easily commute the shuffle to
10670 // arrange things in the right direction.
10676 // We have a mixture of V1 and V2 in both low and high lanes. Rather than
10677 // trying to place elements directly, just blend them and set up the final
10678 // shuffle to place them.
10680 // The first two blend mask elements are for V1, the second two are for V2.
10682 int BlendMask[4] = {Mask[0] < 4 ? Mask[0] : Mask[1],
10683 Mask[2] < 4 ? Mask[2] : Mask[3],
10684 (Mask[0] >= 4 ? Mask[0] : Mask[1]) - 4,
10685 (Mask[2] >= 4 ? Mask[2] : Mask[3]) - 4};
10686 V1 = DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
10687 getV4X86ShuffleImm8ForMask(BlendMask, DL, DAG));
10689 // Now we do a normal shuffle of V1 by giving V1 as both operands to the shuffle.
10692 NewMask[0] = Mask[0] < 4 ? 0 : 2;
10693 NewMask[1] = Mask[0] < 4 ? 2 : 0;
10694 NewMask[2] = Mask[2] < 4 ? 1 : 3;
10695 NewMask[3] = Mask[2] < 4 ? 3 : 1;
10698 return DAG.getNode(X86ISD::SHUFP, DL, VT, LowV, HighV,
10699 getV4X86ShuffleImm8ForMask(NewMask, DL, DAG));
10702 /// \brief Lower 4-lane 32-bit floating point shuffles.
10704 /// Uses instructions exclusively from the floating point unit to minimize
10705 /// domain crossing penalties, as these are sufficient to implement all v4f32 shuffles.
10707 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10708 const APInt &Zeroable,
10709 SDValue V1, SDValue V2,
10710 const X86Subtarget &Subtarget,
10711 SelectionDAG &DAG) {
10712 assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10713 assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
10714 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10716 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10718 if (NumV2Elements == 0) {
10719 // Check for being able to broadcast a single element.
10720 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10721 DL, MVT::v4f32, V1, V2, Mask, Subtarget, DAG))
10724 // Use even/odd duplicate instructions for masks that match their pattern.
10725 if (Subtarget.hasSSE3()) {
10726 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
10727 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v4f32, V1);
10728 if (isShuffleEquivalent(V1, V2, Mask, {1, 1, 3, 3}))
10729 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v4f32, V1);
10732 if (Subtarget.hasAVX()) {
10733 // If we have AVX, we can use VPERMILPS which will allow folding a load
10734 // into the shuffle.
10735 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f32, V1,
10736 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10739 // Otherwise, use a straight shuffle of a single input vector. We pass the
10740 // input vector to both operands to simulate this with a SHUFPS.
10741 return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
10742 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10745 // There are special ways we can lower some single-element blends. However, we
10746 // have custom ways we can lower more complex single-element blends below that
10747 // we defer to if both this and BLENDPS fail to match, so restrict this to
10748 // when the V2 input is targeting element 0 of the mask -- that is the fast case here.
10750 if (NumV2Elements == 1 && Mask[0] >= 4)
10751 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10752 DL, MVT::v4f32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10755 if (Subtarget.hasSSE41()) {
10756 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f32, V1, V2, Mask,
10757 Zeroable, Subtarget, DAG))
10760 // Use INSERTPS if we can complete the shuffle efficiently.
10762 lowerVectorShuffleAsInsertPS(DL, V1, V2, Mask, Zeroable, DAG))
10765 if (!isSingleSHUFPSMask(Mask))
10766 if (SDValue BlendPerm = lowerVectorShuffleAsBlendAndPermute(
10767 DL, MVT::v4f32, V1, V2, Mask, DAG))
10771 // Use low/high mov instructions.
10772 if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
10773 return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
10774 if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
10775 return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
10777 // Use dedicated unpack instructions for masks that match their pattern.
10779 lowerVectorShuffleWithUNPCK(DL, MVT::v4f32, Mask, V1, V2, DAG))
10782 // Otherwise fall back to a SHUFPS lowering strategy.
10783 return lowerVectorShuffleWithSHUFPS(DL, MVT::v4f32, Mask, V1, V2, DAG);
10786 /// \brief Lower 4-lane i32 vector shuffles.
10788 /// We try to handle these with integer-domain shuffles where we can, but for
10789 /// blends we use the floating point domain blend instructions.
10790 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
10791 const APInt &Zeroable,
10792 SDValue V1, SDValue V2,
10793 const X86Subtarget &Subtarget,
10794 SelectionDAG &DAG) {
10795 assert(V1.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10796 assert(V2.getSimpleValueType() == MVT::v4i32 && "Bad operand type!");
10797 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
10799 // Whenever we can lower this as a zext, that instruction is strictly faster
10800 // than any alternative. It also allows us to fold memory operands into the
10801 // shuffle in many cases.
10802 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
10803 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10806 int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
10808 if (NumV2Elements == 0) {
10809 // Check for being able to broadcast a single element.
10810 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
10811 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10814 // Straight shuffle of a single input vector. For everything from SSE2
10815 // onward this has a single fast instruction with no scary immediates.
10816 // We coerce the shuffle pattern to be compatible with UNPCK instructions
10817 // but we aren't actually going to use the UNPCK instruction because doing
10818 // so prevents folding a load into this instruction or making a copy.
10819 const int UnpackLoMask[] = {0, 0, 1, 1};
10820 const int UnpackHiMask[] = {2, 2, 3, 3};
10821 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 1, 1}))
10822 Mask = UnpackLoMask;
10823 else if (isShuffleEquivalent(V1, V2, Mask, {2, 2, 3, 3}))
10824 Mask = UnpackHiMask;
10826 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v4i32, V1,
10827 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
10830 // Try to use shift instructions.
10831 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i32, V1, V2, Mask,
10832 Zeroable, Subtarget, DAG))
10835 // There are special ways we can lower some single-element blends.
10836 if (NumV2Elements == 1)
10837 if (SDValue V = lowerVectorShuffleAsElementInsertion(
10838 DL, MVT::v4i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
10841 // We have different paths for blend lowering, but they all must use the
10842 // *exact* same predicate.
10843 bool IsBlendSupported = Subtarget.hasSSE41();
10844 if (IsBlendSupported)
10845 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i32, V1, V2, Mask,
10846 Zeroable, Subtarget, DAG))
10849 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v4i32, V1, V2, Mask,
10853 // Use dedicated unpack instructions for masks that match their pattern.
10855 lowerVectorShuffleWithUNPCK(DL, MVT::v4i32, Mask, V1, V2, DAG))
10858 // Try to use byte rotation instructions.
10859 // It's more profitable for pre-SSSE3 to use shuffles/unpacks.
10860 if (Subtarget.hasSSSE3())
10861 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
10862 DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
10865 // Assume that a single SHUFPS is faster than an alternative sequence of
10866 // multiple instructions (even if the CPU has a domain penalty).
10867 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
10868 if (!isSingleSHUFPSMask(Mask)) {
10869 // If we have direct support for blends, we should lower by decomposing into
10870 // a permute. That will be faster than the domain cross.
10871 if (IsBlendSupported)
10872 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i32, V1, V2,
10875 // Try to lower by permuting the inputs into an unpack instruction.
10876 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
10877 DL, MVT::v4i32, V1, V2, Mask, DAG))
10881 // We implement this with SHUFPS because it can blend from two vectors.
10882 // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
10883 // up the inputs, bypassing domain shift penalties that we would incur if we
10884 // directly used PSHUFD on Nehalem and older. For newer chips, this isn't an issue.
10886 SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
10887 SDValue CastV2 = DAG.getBitcast(MVT::v4f32, V2);
10888 SDValue ShufPS = DAG.getVectorShuffle(MVT::v4f32, DL, CastV1, CastV2, Mask);
10889 return DAG.getBitcast(MVT::v4i32, ShufPS);
10892 /// \brief Lowering of single-input v8i16 shuffles is the cornerstone of SSE2
10893 /// shuffle lowering, and the most complex part.
10895 /// The lowering strategy is to try to form pairs of input lanes which are
10896 /// targeted at the same half of the final vector, and then use a dword shuffle
10897 /// to place them onto the right half, and finally unpack the paired lanes into
10898 /// their final position.
10900 /// The exact breakdown of how to form these dword pairs and align them on the
10901 /// correct sides is really tricky. See the comments within the function for
10902 /// more of the details.
10904 /// This code also handles repeated 128-bit lanes of v8i16 shuffles, but each
10905 /// lane must shuffle the *exact* same way. In fact, you must pass a v8 Mask to
10906 /// this routine for it to work correctly. To shuffle a 256-bit or 512-bit i16
10907 /// vector, form the analogous 128-bit 8-element Mask.
10908 static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
10909 const SDLoc &DL, MVT VT, SDValue V, MutableArrayRef<int> Mask,
10910 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
10911 assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
10912 MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
10914 assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
10915 MutableArrayRef<int> LoMask = Mask.slice(0, 4);
10916 MutableArrayRef<int> HiMask = Mask.slice(4, 4);
10918 SmallVector<int, 4> LoInputs;
10919 copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
10920 std::sort(LoInputs.begin(), LoInputs.end());
10921 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
10922 SmallVector<int, 4> HiInputs;
10923 copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
10924 std::sort(HiInputs.begin(), HiInputs.end());
10925 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
10926 int NumLToL =
10927 std::lower_bound(LoInputs.begin(), LoInputs.end(), 4) - LoInputs.begin();
10928 int NumHToL = LoInputs.size() - NumLToL;
10929 int NumLToH =
10930 std::lower_bound(HiInputs.begin(), HiInputs.end(), 4) - HiInputs.begin();
10931 int NumHToH = HiInputs.size() - NumLToH;
10932 MutableArrayRef<int> LToLInputs(LoInputs.data(), NumLToL);
10933 MutableArrayRef<int> LToHInputs(HiInputs.data(), NumLToH);
10934 MutableArrayRef<int> HToLInputs(LoInputs.data() + NumLToL, NumHToL);
10935 MutableArrayRef<int> HToHInputs(HiInputs.data() + NumLToH, NumHToH);
10937 // If we are splatting two values from one half - one to each half, then
10938 // we can shuffle that half so each is splatted to a dword, then splat those
10939 // to their respective halves.
10940 auto SplatHalfs = [&](int LoInput, int HiInput, unsigned ShufWOp,
10941 int DOffset) {
10942 int PSHUFHalfMask[] = {LoInput % 4, LoInput % 4, HiInput % 4, HiInput % 4};
10943 int PSHUFDMask[] = {DOffset + 0, DOffset + 0, DOffset + 1, DOffset + 1};
10944 V = DAG.getNode(ShufWOp, DL, VT, V,
10945 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
10946 V = DAG.getBitcast(PSHUFDVT, V);
10947 V = DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, V,
10948 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG));
10949 return DAG.getBitcast(VT, V);
10952 if (NumLToL == 1 && NumLToH == 1 && (NumHToL + NumHToH) == 0)
10953 return SplatHalfs(LToLInputs[0], LToHInputs[0], X86ISD::PSHUFLW, 0);
10954 if (NumHToL == 1 && NumHToH == 1 && (NumLToL + NumLToH) == 0)
10955 return SplatHalfs(HToLInputs[0], HToHInputs[0], X86ISD::PSHUFHW, 2);
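// For illustration: the mask <0, 0, 0, 0, 2, 2, 2, 2> hits the first case
// above; SplatHalfs(0, 2, PSHUFLW, 0) emits PSHUFLW {0, 0, 2, 2} followed by
// PSHUFD {0, 0, 1, 1}, splatting word 0 across the low half and word 2 across
// the high half.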
10957 // Simplify the 1-into-3 and 3-into-1 cases with a single pshufd. For all
10958 // such inputs we can swap two of the dwords across the half mark and end up
10959 // with <=2 inputs to each half in each half. Once there, we can fall through
10960 // to the generic code below. For example:
10962 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10963 // Mask: [0, 1, 2, 7, 4, 5, 6, 3] -----------------> [0, 1, 4, 7, 2, 3, 6, 5]
10965 // However in some very rare cases we have a 1-into-3 or 3-into-1 on one half
10966 // and an existing 2-into-2 on the other half. In this case we may have to
10967 // pre-shuffle the 2-into-2 half to avoid turning it into a 3-into-1 or
10968 // 1-into-3 which could cause us to cycle endlessly fixing each side in turn.
10969 // Fortunately, we don't have to handle anything but a 2-into-2 pattern
10970 // because any other situation (including a 3-into-1 or 1-into-3 in the other
10971 // half than the one we target for fixing) will be fixed when we re-enter this
10972 // path. We will also combine away any sequence of PSHUFD instructions that
10973 // result into a single instruction. Here is an example of the tricky case:
10975 // Input: [a, b, c, d, e, f, g, h] -PSHUFD[0,2,1,3]-> [a, b, e, f, c, d, g, h]
10976 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -THIS-IS-BAD!!!!-> [5, 7, 1, 0, 4, 7, 5, 3]
10978 // This now has a 1-into-3 in the high half! Instead, we do two shuffles:
10980 // Input: [a, b, c, d, e, f, g, h] PSHUFHW[0,2,1,3]-> [a, b, c, d, e, g, f, h]
10981 // Mask: [3, 7, 1, 0, 2, 7, 3, 5] -----------------> [3, 7, 1, 0, 2, 7, 3, 6]
10983 // Input: [a, b, c, d, e, g, f, h] -PSHUFD[0,2,1,3]-> [a, b, e, g, c, d, f, h]
10984 // Mask: [3, 7, 1, 0, 2, 7, 3, 6] -----------------> [5, 7, 1, 0, 4, 7, 5, 6]
10986 // The result is fine to be handled by the generic logic.
10987 auto balanceSides = [&](ArrayRef<int> AToAInputs, ArrayRef<int> BToAInputs,
10988 ArrayRef<int> BToBInputs, ArrayRef<int> AToBInputs,
10989 int AOffset, int BOffset) {
10990 assert((AToAInputs.size() == 3 || AToAInputs.size() == 1) &&
10991 "Must call this with A having 3 or 1 inputs from the A half.");
10992 assert((BToAInputs.size() == 1 || BToAInputs.size() == 3) &&
10993 "Must call this with B having 1 or 3 inputs from the B half.");
10994 assert(AToAInputs.size() + BToAInputs.size() == 4 &&
10995 "Must call this with either 3:1 or 1:3 inputs (summing to 4).");
10997 bool ThreeAInputs = AToAInputs.size() == 3;
10999 // Compute the index of dword with only one word among the three inputs in
11000 // a half by taking the sum of the half with three inputs and subtracting
11001 // the sum of the actual three inputs. The difference is the remaining
11002 // slot.
11003 int ADWord, BDWord;
11004 int &TripleDWord = ThreeAInputs ? ADWord : BDWord;
11005 int &OneInputDWord = ThreeAInputs ? BDWord : ADWord;
11006 int TripleInputOffset = ThreeAInputs ? AOffset : BOffset;
11007 ArrayRef<int> TripleInputs = ThreeAInputs ? AToAInputs : BToAInputs;
11008 int OneInput = ThreeAInputs ? BToAInputs[0] : AToAInputs[0];
11009 int TripleInputSum = 0 + 1 + 2 + 3 + (4 * TripleInputOffset);
11010 int TripleNonInputIdx =
11011 TripleInputSum - std::accumulate(TripleInputs.begin(), TripleInputs.end(), 0);
11012 TripleDWord = TripleNonInputIdx / 2;
11014 // We use xor with one to compute the adjacent DWord to whichever one the
11015 // OneInput is in.
11016 OneInputDWord = (OneInput / 2) ^ 1;
11018 // Check for one tricky case: We're fixing a 3<-1 or a 1<-3 shuffle for AToA
11019 // and BToA inputs. If there is also such a problem with the BToB and AToB
11020 // inputs, we don't try to fix it necessarily -- we'll recurse and see it in
11021 // the next pass. However, if we have a 2<-2 in the BToB and AToB inputs, it
11022 // is essential that we don't *create* a 3<-1 as then we might oscillate.
11023 if (BToBInputs.size() == 2 && AToBInputs.size() == 2) {
11024 // Compute how many inputs will be flipped by swapping these DWords. We
11025 // need to balance this to ensure we don't form a 3-1 shuffle in the
11026 // other half.
11028 int NumFlippedAToBInputs =
11029 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord) +
11030 std::count(AToBInputs.begin(), AToBInputs.end(), 2 * ADWord + 1);
11031 int NumFlippedBToBInputs =
11032 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord) +
11033 std::count(BToBInputs.begin(), BToBInputs.end(), 2 * BDWord + 1);
11034 if ((NumFlippedAToBInputs == 1 &&
11035 (NumFlippedBToBInputs == 0 || NumFlippedBToBInputs == 2)) ||
11036 (NumFlippedBToBInputs == 1 &&
11037 (NumFlippedAToBInputs == 0 || NumFlippedAToBInputs == 2))) {
11038 // We choose whether to fix the A half or B half based on whether that
11039 // half has zero flipped inputs. At zero, we may not be able to fix it
11040 // with that half. We also bias towards fixing the B half because that
11041 // will more commonly be the high half, and we have to bias one way.
11042 auto FixFlippedInputs = [&V, &DL, &Mask, &DAG](int PinnedIdx, int DWord,
11043 ArrayRef<int> Inputs) {
11044 int FixIdx = PinnedIdx ^ 1; // The adjacent slot to the pinned slot.
11045 bool IsFixIdxInput = is_contained(Inputs, PinnedIdx ^ 1);
11046 // Determine whether the free index is in the flipped dword or the
11047 // unflipped dword based on where the pinned index is. We use this bit
11048 // in an xor to conditionally select the adjacent dword.
11049 int FixFreeIdx = 2 * (DWord ^ (PinnedIdx / 2 == DWord));
11050 bool IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11051 if (IsFixIdxInput == IsFixFreeIdxInput)
11052 FixFreeIdx += 1;
11053 IsFixFreeIdxInput = is_contained(Inputs, FixFreeIdx);
11054 assert(IsFixIdxInput != IsFixFreeIdxInput &&
11055 "We need to be changing the number of flipped inputs!");
11056 int PSHUFHalfMask[] = {0, 1, 2, 3};
11057 std::swap(PSHUFHalfMask[FixFreeIdx % 4], PSHUFHalfMask[FixIdx % 4]);
11058 V = DAG.getNode(
11059 FixIdx < 4 ? X86ISD::PSHUFLW : X86ISD::PSHUFHW, DL,
11060 MVT::getVectorVT(MVT::i16, V.getValueSizeInBits() / 16), V,
11061 getV4X86ShuffleImm8ForMask(PSHUFHalfMask, DL, DAG));
11063 for (int &M : Mask)
11064 if (M >= 0 && M == FixIdx)
11065 M = FixFreeIdx;
11066 else if (M >= 0 && M == FixFreeIdx)
11067 M = FixIdx;
11068 };
11069 if (NumFlippedBToBInputs != 0) {
11070 int BPinnedIdx =
11071 BToAInputs.size() == 3 ? TripleNonInputIdx : OneInput;
11072 FixFlippedInputs(BPinnedIdx, BDWord, BToBInputs);
11073 } else {
11074 assert(NumFlippedAToBInputs != 0 && "Impossible given predicates!");
11075 int APinnedIdx = ThreeAInputs ? TripleNonInputIdx : OneInput;
11076 FixFlippedInputs(APinnedIdx, ADWord, AToBInputs);
11077 }
11078 }
11079 }
11081 int PSHUFDMask[] = {0, 1, 2, 3};
11082 PSHUFDMask[ADWord] = BDWord;
11083 PSHUFDMask[BDWord] = ADWord;
11084 V = DAG.getBitcast(
11085 VT,
11086 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11087 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11089 // Adjust the mask to match the new locations of A and B.
11090 for (int &M : Mask)
11091 if (M >= 0 && M/2 == ADWord)
11092 M = 2 * BDWord + M % 2;
11093 else if (M >= 0 && M/2 == BDWord)
11094 M = 2 * ADWord + M % 2;
11096 // Recurse back into this routine to re-compute state now that this isn't
11097 // a 3 and 1 problem.
11098 return lowerV8I16GeneralSingleInputVectorShuffle(DL, VT, V, Mask, Subtarget,
11101 if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
11102 return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
11103 if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
11104 return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
11106 // At this point there are at most two inputs to the low and high halves from
11107 // each half. That means the inputs can always be grouped into dwords and
11108 // those dwords can then be moved to the correct half with a dword shuffle.
11109 // We use at most one low and one high word shuffle to collect these paired
11110 // inputs into dwords, and finally a dword shuffle to place them.
11111 int PSHUFLMask[4] = {-1, -1, -1, -1};
11112 int PSHUFHMask[4] = {-1, -1, -1, -1};
11113 int PSHUFDMask[4] = {-1, -1, -1, -1};
11115 // First fix the masks for all the inputs that are staying in their
11116 // original halves. This will then dictate the targets of the cross-half
11117 // shuffles.
11118 auto fixInPlaceInputs =
11119 [&PSHUFDMask](ArrayRef<int> InPlaceInputs, ArrayRef<int> IncomingInputs,
11120 MutableArrayRef<int> SourceHalfMask,
11121 MutableArrayRef<int> HalfMask, int HalfOffset) {
11122 if (InPlaceInputs.empty())
11124 if (InPlaceInputs.size() == 1) {
11125 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11126 InPlaceInputs[0] - HalfOffset;
11127 PSHUFDMask[InPlaceInputs[0] / 2] = InPlaceInputs[0] / 2;
11130 if (IncomingInputs.empty()) {
11131 // Just fix all of the in place inputs.
11132 for (int Input : InPlaceInputs) {
11133 SourceHalfMask[Input - HalfOffset] = Input - HalfOffset;
11134 PSHUFDMask[Input / 2] = Input / 2;
11139 assert(InPlaceInputs.size() == 2 && "Cannot handle 3 or 4 inputs!");
11140 SourceHalfMask[InPlaceInputs[0] - HalfOffset] =
11141 InPlaceInputs[0] - HalfOffset;
11142 // Put the second input next to the first so that they are packed into
11143 // a dword. We find the adjacent index by toggling the low bit.
11144 int AdjIndex = InPlaceInputs[0] ^ 1;
11145 SourceHalfMask[AdjIndex - HalfOffset] = InPlaceInputs[1] - HalfOffset;
11146 std::replace(HalfMask.begin(), HalfMask.end(), InPlaceInputs[1], AdjIndex);
11147 PSHUFDMask[AdjIndex / 2] = AdjIndex / 2;
11149 fixInPlaceInputs(LToLInputs, HToLInputs, PSHUFLMask, LoMask, 0);
11150 fixInPlaceInputs(HToHInputs, LToHInputs, PSHUFHMask, HiMask, 4);
11152 // Now gather the cross-half inputs and place them into a free dword of
11153 // their target half.
11154 // FIXME: This operation could almost certainly be simplified dramatically to
11155 // look more like the 3-1 fixing operation.
11156 auto moveInputsToRightHalf = [&PSHUFDMask](
11157 MutableArrayRef<int> IncomingInputs, ArrayRef<int> ExistingInputs,
11158 MutableArrayRef<int> SourceHalfMask, MutableArrayRef<int> HalfMask,
11159 MutableArrayRef<int> FinalSourceHalfMask, int SourceOffset,
11160 int DestOffset) {
11161 auto isWordClobbered = [](ArrayRef<int> SourceHalfMask, int Word) {
11162 return SourceHalfMask[Word] >= 0 && SourceHalfMask[Word] != Word;
11163 };
11164 auto isDWordClobbered = [&isWordClobbered](ArrayRef<int> SourceHalfMask,
11165 int Word) {
11166 int LowWord = Word & ~1;
11167 int HighWord = Word | 1;
11168 return isWordClobbered(SourceHalfMask, LowWord) ||
11169 isWordClobbered(SourceHalfMask, HighWord);
11172 if (IncomingInputs.empty())
11175 if (ExistingInputs.empty()) {
11176 // Map any dwords with inputs from them into the right half.
11177 for (int Input : IncomingInputs) {
11178 // If the source half mask maps over the inputs, turn those into
11179 // swaps and use the swapped lane.
11180 if (isWordClobbered(SourceHalfMask, Input - SourceOffset)) {
11181 if (SourceHalfMask[SourceHalfMask[Input - SourceOffset]] < 0) {
11182 SourceHalfMask[SourceHalfMask[Input - SourceOffset]] =
11183 Input - SourceOffset;
11184 // We have to swap the uses in our half mask in one sweep.
11185 for (int &M : HalfMask)
11186 if (M == SourceHalfMask[Input - SourceOffset] + SourceOffset)
11187 M = Input;
11188 else if (M == Input)
11189 M = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11190 } else {
11191 assert(SourceHalfMask[SourceHalfMask[Input - SourceOffset]] ==
11192 Input - SourceOffset &&
11193 "Previous placement doesn't match!");
11195 // Note that this correctly re-maps both when we do a swap and when
11196 // we observe the other side of the swap above. We rely on that to
11197 // avoid swapping the members of the input list directly.
11198 Input = SourceHalfMask[Input - SourceOffset] + SourceOffset;
11201 // Map the input's dword into the correct half.
11202 if (PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] < 0)
11203 PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] = Input / 2;
11204 else
11205 assert(PSHUFDMask[(Input - SourceOffset + DestOffset) / 2] ==
11206 Input / 2 &&
11207 "Previous placement doesn't match!");
11210 // And just directly shift any other-half mask elements to be same-half
11211 // as we will have mirrored the dword containing the element into the
11212 // same position within that half.
11213 for (int &M : HalfMask)
11214 if (M >= SourceOffset && M < SourceOffset + 4) {
11215 M = M - SourceOffset + DestOffset;
11216 assert(M >= 0 && "This should never wrap below zero!");
11221 // Ensure we have the input in a viable dword of its current half. This
11222 // is particularly tricky because the original position may be clobbered
11223 // by inputs being moved and *staying* in that half.
11224 if (IncomingInputs.size() == 1) {
11225 if (isWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11226 int InputFixed = find(SourceHalfMask, -1) - std::begin(SourceHalfMask) +
11228 SourceHalfMask[InputFixed - SourceOffset] =
11229 IncomingInputs[0] - SourceOffset;
11230 std::replace(HalfMask.begin(), HalfMask.end(), IncomingInputs[0],
11232 IncomingInputs[0] = InputFixed;
11234 } else if (IncomingInputs.size() == 2) {
11235 if (IncomingInputs[0] / 2 != IncomingInputs[1] / 2 ||
11236 isDWordClobbered(SourceHalfMask, IncomingInputs[0] - SourceOffset)) {
11237 // We have two non-adjacent or clobbered inputs we need to extract from
11238 // the source half. To do this, we need to map them into some adjacent
11239 // dword slot in the source mask.
11240 int InputsFixed[2] = {IncomingInputs[0] - SourceOffset,
11241 IncomingInputs[1] - SourceOffset};
11243 // If there is a free slot in the source half mask adjacent to one of
11244 // the inputs, place the other input in it. We use (Index XOR 1) to
11245 // compute an adjacent index.
11246 if (!isWordClobbered(SourceHalfMask, InputsFixed[0]) &&
11247 SourceHalfMask[InputsFixed[0] ^ 1] < 0) {
11248 SourceHalfMask[InputsFixed[0]] = InputsFixed[0];
11249 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11250 InputsFixed[1] = InputsFixed[0] ^ 1;
11251 } else if (!isWordClobbered(SourceHalfMask, InputsFixed[1]) &&
11252 SourceHalfMask[InputsFixed[1] ^ 1] < 0) {
11253 SourceHalfMask[InputsFixed[1]] = InputsFixed[1];
11254 SourceHalfMask[InputsFixed[1] ^ 1] = InputsFixed[0];
11255 InputsFixed[0] = InputsFixed[1] ^ 1;
11256 } else if (SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] < 0 &&
11257 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] < 0) {
11258 // The two inputs are in the same DWord but it is clobbered and the
11259 // adjacent DWord isn't used at all. Move both inputs to the free
11260 // slot.
11261 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1)] = InputsFixed[0];
11262 SourceHalfMask[2 * ((InputsFixed[0] / 2) ^ 1) + 1] = InputsFixed[1];
11263 InputsFixed[0] = 2 * ((InputsFixed[0] / 2) ^ 1);
11264 InputsFixed[1] = 2 * ((InputsFixed[0] / 2) ^ 1) + 1;
11265 } else {
11266 // The only way we hit this point is if there is no clobbering
11267 // (because there are no off-half inputs to this half) and there is no
11268 // free slot adjacent to one of the inputs. In this case, we have to
11269 // swap an input with a non-input.
11270 for (int i = 0; i < 4; ++i)
11271 assert((SourceHalfMask[i] < 0 || SourceHalfMask[i] == i) &&
11272 "We can't handle any clobbers here!");
11273 assert(InputsFixed[1] != (InputsFixed[0] ^ 1) &&
11274 "Cannot have adjacent inputs here!");
11276 SourceHalfMask[InputsFixed[0] ^ 1] = InputsFixed[1];
11277 SourceHalfMask[InputsFixed[1]] = InputsFixed[0] ^ 1;
11279 // We also have to update the final source mask in this case because
11280 // it may need to undo the above swap.
11281 for (int &M : FinalSourceHalfMask)
11282 if (M == (InputsFixed[0] ^ 1) + SourceOffset)
11283 M = InputsFixed[1] + SourceOffset;
11284 else if (M == InputsFixed[1] + SourceOffset)
11285 M = (InputsFixed[0] ^ 1) + SourceOffset;
11287 InputsFixed[1] = InputsFixed[0] ^ 1;
11290 // Point everything at the fixed inputs.
11291 for (int &M : HalfMask)
11292 if (M == IncomingInputs[0])
11293 M = InputsFixed[0] + SourceOffset;
11294 else if (M == IncomingInputs[1])
11295 M = InputsFixed[1] + SourceOffset;
11297 IncomingInputs[0] = InputsFixed[0] + SourceOffset;
11298 IncomingInputs[1] = InputsFixed[1] + SourceOffset;
11301 llvm_unreachable("Unhandled input size!");
11304 // Now hoist the DWord down to the right half.
11305 int FreeDWord = (PSHUFDMask[DestOffset / 2] < 0 ? 0 : 1) + DestOffset / 2;
11306 assert(PSHUFDMask[FreeDWord] < 0 && "DWord not free");
11307 PSHUFDMask[FreeDWord] = IncomingInputs[0] / 2;
11308 for (int &M : HalfMask)
11309 for (int Input : IncomingInputs)
11311 M = FreeDWord * 2 + Input % 2;
11313 moveInputsToRightHalf(HToLInputs, LToLInputs, PSHUFHMask, LoMask, HiMask,
11314 /*SourceOffset*/ 4, /*DestOffset*/ 0);
11315 moveInputsToRightHalf(LToHInputs, HToHInputs, PSHUFLMask, HiMask, LoMask,
11316 /*SourceOffset*/ 0, /*DestOffset*/ 4);
11318 // Now enact all the shuffles we've computed to move the inputs into their
11319 // target half.
11320 if (!isNoopShuffleMask(PSHUFLMask))
11321 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11322 getV4X86ShuffleImm8ForMask(PSHUFLMask, DL, DAG));
11323 if (!isNoopShuffleMask(PSHUFHMask))
11324 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11325 getV4X86ShuffleImm8ForMask(PSHUFHMask, DL, DAG));
11326 if (!isNoopShuffleMask(PSHUFDMask))
11327 V = DAG.getBitcast(
11329 DAG.getNode(X86ISD::PSHUFD, DL, PSHUFDVT, DAG.getBitcast(PSHUFDVT, V),
11330 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
11332 // At this point, each half should contain all its inputs, and we can then
11333 // just shuffle them into their final position.
11334 assert(count_if(LoMask, [](int M) { return M >= 4; }) == 0 &&
11335 "Failed to lift all the high half inputs to the low mask!");
11336 assert(count_if(HiMask, [](int M) { return M >= 0 && M < 4; }) == 0 &&
11337 "Failed to lift all the low half inputs to the high mask!");
11339 // Do a half shuffle for the low mask.
11340 if (!isNoopShuffleMask(LoMask))
11341 V = DAG.getNode(X86ISD::PSHUFLW, DL, VT, V,
11342 getV4X86ShuffleImm8ForMask(LoMask, DL, DAG));
11344 // Do a half shuffle with the high mask after shifting its values down.
11345 for (int &M : HiMask)
11346 if (M >= 0)
11347 M -= 4;
11348 if (!isNoopShuffleMask(HiMask))
11349 V = DAG.getNode(X86ISD::PSHUFHW, DL, VT, V,
11350 getV4X86ShuffleImm8ForMask(HiMask, DL, DAG));
11352 return V;
11353 }
11355 /// Helper to form a PSHUFB-based shuffle+blend, opportunistically avoiding the
11356 /// blend if only one input is used.
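/// For example (illustrative), the v16i8 interleave mask <0, 16, 1, 17, ...>
/// builds a V1 control vector of {0, 0x80, 1, 0x80, ...} and a V2 control
/// vector of {0x80, 0, 0x80, 1, ...}; each PSHUFB zeroes the lanes owned by
/// the other input, so an OR of the two results completes the blend.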
11357 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
11358 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
11359 const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
11360 bool &V2InUse) {
11361 SDValue V1Mask[16];
11362 SDValue V2Mask[16];
11363 V1InUse = false;
11364 V2InUse = false;
11366 int Size = Mask.size();
11367 int Scale = 16 / Size;
11368 for (int i = 0; i < 16; ++i) {
11369 if (Mask[i / Scale] < 0) {
11370 V1Mask[i] = V2Mask[i] = DAG.getUNDEF(MVT::i8);
11371 } else {
11372 const int ZeroMask = 0x80;
11373 int V1Idx = Mask[i / Scale] < Size ? Mask[i / Scale] * Scale + i % Scale
11374 : ZeroMask;
11375 int V2Idx = Mask[i / Scale] < Size
11376 ? ZeroMask
11377 : (Mask[i / Scale] - Size) * Scale + i % Scale;
11378 if (Zeroable[i / Scale])
11379 V1Idx = V2Idx = ZeroMask;
11380 V1Mask[i] = DAG.getConstant(V1Idx, DL, MVT::i8);
11381 V2Mask[i] = DAG.getConstant(V2Idx, DL, MVT::i8);
11382 V1InUse |= (ZeroMask != V1Idx);
11383 V2InUse |= (ZeroMask != V2Idx);
11384 }
11385 }
11387 if (V1InUse)
11388 V1 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11389 DAG.getBitcast(MVT::v16i8, V1),
11390 DAG.getBuildVector(MVT::v16i8, DL, V1Mask));
11391 if (V2InUse)
11392 V2 = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v16i8,
11393 DAG.getBitcast(MVT::v16i8, V2),
11394 DAG.getBuildVector(MVT::v16i8, DL, V2Mask));
11396 // If we need shuffled inputs from both, blend the two.
11397 SDValue V;
11398 if (V1InUse && V2InUse)
11399 V = DAG.getNode(ISD::OR, DL, MVT::v16i8, V1, V2);
11400 else
11401 V = V1InUse ? V1 : V2;
11403 // Cast the result back to the correct type.
11404 return DAG.getBitcast(VT, V);
11407 /// \brief Generic lowering of 8-lane i16 shuffles.
11409 /// This handles both single-input shuffles and combined shuffle/blends with
11410 /// two inputs. The single input shuffles are immediately delegated to
11411 /// a dedicated lowering routine.
11413 /// The blends are lowered in one of three fundamental ways. If there are few
11414 /// enough inputs, it delegates to a basic UNPCK-based strategy. If the shuffle
11415 /// of the input is significantly cheaper when lowered as an interleaving of
11416 /// the two inputs, try to interleave them. Otherwise, blend the low and high
11417 /// halves of the inputs separately (making them have relatively few inputs)
11418 /// and then concatenate them.
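/// As an illustration, the two-input mask <0, 8, 1, 9, 2, 10, 3, 11> is exactly
/// the word interleave produced by a single PUNPCKLWD of the two inputs.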
11419 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11420 const APInt &Zeroable,
11421 SDValue V1, SDValue V2,
11422 const X86Subtarget &Subtarget,
11423 SelectionDAG &DAG) {
11424 assert(V1.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11425 assert(V2.getSimpleValueType() == MVT::v8i16 && "Bad operand type!");
11426 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
11428 // Whenever we can lower this as a zext, that instruction is strictly faster
11429 // than any alternative.
11430 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11431 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11434 int NumV2Inputs = count_if(Mask, [](int M) { return M >= 8; });
11436 if (NumV2Inputs == 0) {
11437 // Check for being able to broadcast a single element.
11438 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11439 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11442 // Try to use shift instructions.
11443 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V1, Mask,
11444 Zeroable, Subtarget, DAG))
11447 // Use dedicated unpack instructions for masks that match their pattern.
11449 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11452 // Try to use byte rotation instructions.
11453 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
11454 Mask, Subtarget, DAG))
11457 // Make a copy of the mask so it can be modified.
11458 SmallVector<int, 8> MutableMask(Mask.begin(), Mask.end());
11459 return lowerV8I16GeneralSingleInputVectorShuffle(DL, MVT::v8i16, V1,
11460 MutableMask, Subtarget,
11464 assert(llvm::any_of(Mask, [](int M) { return M >= 0 && M < 8; }) &&
11465 "All single-input shuffles should be canonicalized to be V1-input "
11468 // Try to use shift instructions.
11469 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i16, V1, V2, Mask,
11470 Zeroable, Subtarget, DAG))
11473 // See if we can use SSE4A Extraction / Insertion.
11474 if (Subtarget.hasSSE4A())
11475 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v8i16, V1, V2, Mask,
11479 // There are special ways we can lower some single-element blends.
11480 if (NumV2Inputs == 1)
11481 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11482 DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
11485 // We have different paths for blend lowering, but they all must use the
11486 // *exact* same predicate.
11487 bool IsBlendSupported = Subtarget.hasSSE41();
11488 if (IsBlendSupported)
11489 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i16, V1, V2, Mask,
11490 Zeroable, Subtarget, DAG))
11493 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v8i16, V1, V2, Mask,
11497 // Use dedicated unpack instructions for masks that match their pattern.
11499 lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
11502 // Try to use byte rotation instructions.
11503 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11504 DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
11507 if (SDValue BitBlend =
11508 lowerVectorShuffleAsBitBlend(DL, MVT::v8i16, V1, V2, Mask, DAG))
11511 // Try to lower by permuting the inputs into an unpack instruction.
11512 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1,
11516 // If we can't directly blend but can use PSHUFB, that will be better as it
11517 // can both shuffle and set up the inefficient blend.
11518 if (!IsBlendSupported && Subtarget.hasSSSE3()) {
11519 bool V1InUse, V2InUse;
11520 return lowerVectorShuffleAsBlendOfPSHUFBs(DL, MVT::v8i16, V1, V2, Mask,
11521 Zeroable, DAG, V1InUse, V2InUse);
11524 // We can always bit-blend if we have to so the fallback strategy is to
11525 // decompose into single-input permutes and blends.
11526 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i16, V1, V2,
11530 /// \brief Check whether a compaction lowering can be done by dropping even
11531 /// elements and compute how many times even elements must be dropped.
11533 /// This handles shuffles which take every Nth element where N is a power of
11534 /// two. Example shuffle masks:
11536 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
11537 /// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
11538 /// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
11539 /// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
11540 /// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
11541 /// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
11543 /// Any of these lanes can of course be undef.
11545 /// This routine only supports N <= 3.
11546 /// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
11547 /// as well.
11549 /// \returns N above, or the number of times even elements must be dropped if
11550 /// there is such a number. Otherwise returns zero.
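/// For example (illustrative), with a 16-element single-input mask the modulus
/// is 16, so for N = 1 the entry at index 9 must equal (9 << 1) & 15 == 2,
/// which matches the first example row above.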
11551 static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
11552 bool IsSingleInput) {
11553 // The modulus for the shuffle vector entries is based on whether this is
11554 // a single input or not.
11555 int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
11556 assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
11557 "We should only be called with masks with a power-of-2 size!");
11559 uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
11561 // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
11562 // and 2^3 simultaneously. This is because we may have ambiguity with
11563 // partially undef inputs.
11564 bool ViableForN[3] = {true, true, true};
11566 for (int i = 0, e = Mask.size(); i < e; ++i) {
11567 // Ignore undef lanes, we'll optimistically collapse them to the pattern we
11568 // desire.
11569 if (Mask[i] < 0)
11570 continue;
11572 bool IsAnyViable = false;
11573 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11574 if (ViableForN[j]) {
11575 uint64_t N = j + 1;
11577 // The shuffle mask must be equal to (i * 2^N) % M.
11578 if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
11579 IsAnyViable = true;
11580 else
11581 ViableForN[j] = false;
11582 }
11583 // Early exit if we exhaust the possible powers of two.
11584 if (!IsAnyViable)
11585 break;
11586 }
11588 for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
11589 if (ViableForN[j])
11590 return j + 1;
11592 // Return 0 as there is no viable power of two.
11593 return 0;
11594 }
11596 /// \brief Generic lowering of v16i8 shuffles.
11598 /// This is a hybrid strategy to lower v16i8 vectors. It first attempts to
11599 /// detect any complexity reducing interleaving. If that doesn't help, it uses
11600 /// UNPCK to spread the i8 elements across two i16-element vectors, and uses
11601 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
11602 /// back together.
11603 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11604 const APInt &Zeroable,
11605 SDValue V1, SDValue V2,
11606 const X86Subtarget &Subtarget,
11607 SelectionDAG &DAG) {
11608 assert(V1.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11609 assert(V2.getSimpleValueType() == MVT::v16i8 && "Bad operand type!");
11610 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
11612 // Try to use shift instructions.
11613 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i8, V1, V2, Mask,
11614 Zeroable, Subtarget, DAG))
11617 // Try to use byte rotation instructions.
11618 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
11619 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11622 // Try to use a zext lowering.
11623 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
11624 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11627 // See if we can use SSE4A Extraction / Insertion.
11628 if (Subtarget.hasSSE4A())
11629 if (SDValue V = lowerVectorShuffleWithSSE4A(DL, MVT::v16i8, V1, V2, Mask,
11633 int NumV2Elements = count_if(Mask, [](int M) { return M >= 16; });
11635 // For single-input shuffles, there are some nicer lowering tricks we can use.
11636 if (NumV2Elements == 0) {
11637 // Check for being able to broadcast a single element.
11638 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
11639 DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
11642 // Check whether we can widen this to an i16 shuffle by duplicating bytes.
11643 // Notably, this handles splat and partial-splat shuffles more efficiently.
11644 // However, it only makes sense if the pre-duplication shuffle simplifies
11645 // things significantly. Currently, this means we need to be able to
11646 // express the pre-duplication shuffle as an i16 shuffle.
11648 // FIXME: We should check for other patterns which can be widened into an
11649 // i16 shuffle as well.
11650 auto canWidenViaDuplication = [](ArrayRef<int> Mask) {
11651 for (int i = 0; i < 16; i += 2)
11652 if (Mask[i] >= 0 && Mask[i + 1] >= 0 && Mask[i] != Mask[i + 1])
11653 return false;
11655 return true;
11656 };
11657 auto tryToWidenViaDuplication = [&]() -> SDValue {
11658 if (!canWidenViaDuplication(Mask))
11659 return SDValue();
11660 SmallVector<int, 4> LoInputs;
11661 copy_if(Mask, std::back_inserter(LoInputs),
11662 [](int M) { return M >= 0 && M < 8; });
11663 std::sort(LoInputs.begin(), LoInputs.end());
11664 LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
11666 SmallVector<int, 4> HiInputs;
11667 copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
11668 std::sort(HiInputs.begin(), HiInputs.end());
11669 HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
11672 bool TargetLo = LoInputs.size() >= HiInputs.size();
11673 ArrayRef<int> InPlaceInputs = TargetLo ? LoInputs : HiInputs;
11674 ArrayRef<int> MovingInputs = TargetLo ? HiInputs : LoInputs;
11676 int PreDupI16Shuffle[] = {-1, -1, -1, -1, -1, -1, -1, -1};
11677 SmallDenseMap<int, int, 8> LaneMap;
11678 for (int I : InPlaceInputs) {
11679 PreDupI16Shuffle[I/2] = I/2;
11682 int j = TargetLo ? 0 : 4, je = j + 4;
11683 for (int i = 0, ie = MovingInputs.size(); i < ie; ++i) {
11684 // Check if j is already a shuffle of this input. This happens when
11685 // there are two adjacent bytes after we move the low one.
11686 if (PreDupI16Shuffle[j] != MovingInputs[i] / 2) {
11687 // If we haven't yet mapped the input, search for a slot into which
11688 // it can be mapped.
11689 while (j < je && PreDupI16Shuffle[j] >= 0)
11690 ++j;
11692 if (j == je)
11693 // We can't place the inputs into a single half with a simple i16 shuffle, so bail.
11694 return SDValue();
11696 // Map this input with the i16 shuffle.
11697 PreDupI16Shuffle[j] = MovingInputs[i] / 2;
11700 // Update the lane map based on the mapping we ended up with.
11701 LaneMap[MovingInputs[i]] = 2 * j + MovingInputs[i] % 2;
11703 V1 = DAG.getBitcast(
11705 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11706 DAG.getUNDEF(MVT::v8i16), PreDupI16Shuffle));
11708 // Unpack the bytes to form the i16s that will be shuffled into place.
11709 V1 = DAG.getNode(TargetLo ? X86ISD::UNPCKL : X86ISD::UNPCKH, DL,
11710 MVT::v16i8, V1, V1);
11712 int PostDupI16Shuffle[8] = {-1, -1, -1, -1, -1, -1, -1, -1};
11713 for (int i = 0; i < 16; ++i)
11714 if (Mask[i] >= 0) {
11715 int MappedMask = LaneMap[Mask[i]] - (TargetLo ? 0 : 8);
11716 assert(MappedMask < 8 && "Invalid v8 shuffle mask!");
11717 if (PostDupI16Shuffle[i / 2] < 0)
11718 PostDupI16Shuffle[i / 2] = MappedMask;
11720 assert(PostDupI16Shuffle[i / 2] == MappedMask &&
11721 "Conflicting entries in the original shuffle!");
11723 return DAG.getBitcast(
11725 DAG.getVectorShuffle(MVT::v8i16, DL, DAG.getBitcast(MVT::v8i16, V1),
11726 DAG.getUNDEF(MVT::v8i16), PostDupI16Shuffle));
11728 if (SDValue V = tryToWidenViaDuplication())
11732 if (SDValue Masked = lowerVectorShuffleAsBitMask(DL, MVT::v16i8, V1, V2, Mask,
11736 // Use dedicated unpack instructions for masks that match their pattern.
11738 lowerVectorShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
11741 // Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
11742 // with PSHUFB. It is important to do this before we attempt to generate any
11743 // blends but after all of the single-input lowerings. If the single input
11744 // lowerings can find an instruction sequence that is faster than a PSHUFB, we
11745 // want to preserve that and we can DAG combine any longer sequences into
11746 // a PSHUFB in the end. But once we start blending from multiple inputs,
11747 // the complexity of DAG combining bad patterns back into PSHUFB is too high,
11748 // and there are *very* few patterns that would actually be faster than the
11749 // PSHUFB approach because of its ability to zero lanes.
11751 // FIXME: The only exceptions to the above are blends which are exact
11752 // interleavings with direct instructions supporting them. We currently don't
11753 // handle those well here.
11754 if (Subtarget.hasSSSE3()) {
11755 bool V1InUse = false;
11756 bool V2InUse = false;
11758 SDValue PSHUFB = lowerVectorShuffleAsBlendOfPSHUFBs(
11759 DL, MVT::v16i8, V1, V2, Mask, Zeroable, DAG, V1InUse, V2InUse);
11761 // If both V1 and V2 are in use and we can use a direct blend or an unpack,
11762 // do so. This avoids using them to handle blends-with-zero which is
11763 // important as a single pshufb is significantly faster for that.
11764 if (V1InUse && V2InUse) {
11765 if (Subtarget.hasSSE41())
11766 if (SDValue Blend = lowerVectorShuffleAsBlend(
11767 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11770 // We can use an unpack to do the blending rather than an or in some
11771 // cases. Even though the or may be (very minorly) more efficient, we
11772 // prefer this lowering because there are common cases where part of
11773 // the complexity of the shuffles goes away when we do the final blend as
11774 // an unpack.
11775 // FIXME: It might be worth trying to detect if the unpack-feeding
11776 // shuffles will both be pshufb, in which case we shouldn't bother with
11777 // this.
11778 if (SDValue Unpack = lowerVectorShuffleAsPermuteAndUnpack(
11779 DL, MVT::v16i8, V1, V2, Mask, DAG))
11786 // There are special ways we can lower some single-element blends.
11787 if (NumV2Elements == 1)
11788 if (SDValue V = lowerVectorShuffleAsElementInsertion(
11789 DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
11792 if (SDValue BitBlend =
11793 lowerVectorShuffleAsBitBlend(DL, MVT::v16i8, V1, V2, Mask, DAG))
11796 // Check whether a compaction lowering can be done. This handles shuffles
11797 // which take every Nth element for some even N. See the helper function for
11798 // details.
11800 // We special case these as they can be particularly efficiently handled with
11801 // the PACKUSWB instruction on x86 and they show up in common patterns of
11802 // rearranging bytes to truncate wide elements.
11803 bool IsSingleInput = V2.isUndef();
11804 if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
11805 // NumEvenDrops is the power of two stride of the elements. Another way of
11806 // thinking about it is that we need to drop the even elements this many
11807 // times to get the original input.
11809 // First we need to zero all the dropped bytes.
11810 assert(NumEvenDrops <= 3 &&
11811 "No support for dropping even elements more than 3 times.");
11812 // We use the mask type to pick which bytes are preserved based on how many
11813 // elements are dropped.
11814 MVT MaskVTs[] = { MVT::v8i16, MVT::v4i32, MVT::v2i64 };
11815 SDValue ByteClearMask = DAG.getBitcast(
11816 MVT::v16i8, DAG.getConstant(0xFF, DL, MaskVTs[NumEvenDrops - 1]));
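// For illustration: with NumEvenDrops == 1 the clear mask is a v8i16 splat of
// 0x00FF, i.e. the bytes <0xFF, 0x00, 0xFF, 0x00, ...>, so after the ANDs each
// i16 lane holds a value <= 255 and the PACKUS below selects exactly the even
// bytes of V1 followed by those of V2.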
11817 V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
11818 if (!IsSingleInput)
11819 V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
11821 // Now pack things back together.
11822 V1 = DAG.getBitcast(MVT::v8i16, V1);
11823 V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
11824 SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
11825 for (int i = 1; i < NumEvenDrops; ++i) {
11826 Result = DAG.getBitcast(MVT::v8i16, Result);
11827 Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
11833 // Handle multi-input cases by blending single-input shuffles.
11834 if (NumV2Elements > 0)
11835 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v16i8, V1, V2,
11838 // The fallback path for single-input shuffles widens this into two v8i16
11839 // vectors with unpacks, shuffles those, and then pulls them back together
11840 // with a pack.
11841 SDValue V = V1;
11843 std::array<int, 8> LoBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11844 std::array<int, 8> HiBlendMask = {{-1, -1, -1, -1, -1, -1, -1, -1}};
11845 for (int i = 0; i < 16; ++i)
11846 if (Mask[i] >= 0)
11847 (i < 8 ? LoBlendMask[i] : HiBlendMask[i % 8]) = Mask[i];
11849 SDValue VLoHalf, VHiHalf;
11850 // Check if any of the odd lanes in the v16i8 are used. If not, we can mask
11851 // them out and avoid using UNPCK{L,H} to extract the elements of V as
11852 // full i16s.
11853 if (none_of(LoBlendMask, [](int M) { return M >= 0 && M % 2 == 1; }) &&
11854 none_of(HiBlendMask, [](int M) { return M >= 0 && M % 2 == 1; })) {
11855 // Use a mask to drop the high bytes.
11856 VLoHalf = DAG.getBitcast(MVT::v8i16, V);
11857 VLoHalf = DAG.getNode(ISD::AND, DL, MVT::v8i16, VLoHalf,
11858 DAG.getConstant(0x00FF, DL, MVT::v8i16));
11860 // This will be a single vector shuffle instead of a blend so nuke VHiHalf.
11861 VHiHalf = DAG.getUNDEF(MVT::v8i16);
11863 // Squash the masks to point directly into VLoHalf.
11864 for (int &M : LoBlendMask)
11865 if (M >= 0)
11866 M /= 2;
11867 for (int &M : HiBlendMask)
11868 if (M >= 0)
11869 M /= 2;
11870 } else {
11871 // Otherwise just unpack the low half of V into VLoHalf and the high half into
11872 // VHiHalf so that we can blend them as i16s.
11873 SDValue Zero = getZeroVector(MVT::v16i8, Subtarget, DAG, DL);
11875 VLoHalf = DAG.getBitcast(
11876 MVT::v8i16, DAG.getNode(X86ISD::UNPCKL, DL, MVT::v16i8, V, Zero));
11877 VHiHalf = DAG.getBitcast(
11878 MVT::v8i16, DAG.getNode(X86ISD::UNPCKH, DL, MVT::v16i8, V, Zero));
11881 SDValue LoV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, LoBlendMask);
11882 SDValue HiV = DAG.getVectorShuffle(MVT::v8i16, DL, VLoHalf, VHiHalf, HiBlendMask);
11884 return DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, LoV, HiV);
11887 /// \brief Dispatching routine to lower various 128-bit x86 vector shuffles.
11889 /// This routine breaks down the specific type of 128-bit shuffle and
11890 /// dispatches to the lowering routines accordingly.
11891 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
11892 MVT VT, SDValue V1, SDValue V2,
11893 const APInt &Zeroable,
11894 const X86Subtarget &Subtarget,
11895 SelectionDAG &DAG) {
11896 switch (VT.SimpleTy) {
11897 case MVT::v2i64:
11898 return lowerV2I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11899 case MVT::v2f64:
11900 return lowerV2F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11901 case MVT::v4i32:
11902 return lowerV4I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11903 case MVT::v4f32:
11904 return lowerV4F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11905 case MVT::v8i16:
11906 return lowerV8I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11907 case MVT::v16i8:
11908 return lowerV16I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
11910 default:
11911 llvm_unreachable("Unimplemented!");
11912 }
11913 }
11915 /// \brief Generic routine to split vector shuffle into half-sized shuffles.
11917 /// This routine just extracts two subvectors, shuffles them independently, and
11918 /// then concatenates them back together. This should work effectively with all
11919 /// AVX vector shuffle types.
11920 static SDValue splitAndLowerVectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
11921 SDValue V2, ArrayRef<int> Mask,
11922 SelectionDAG &DAG) {
11923 assert(VT.getSizeInBits() >= 256 &&
11924 "Only for 256-bit or wider vector shuffles!");
11925 assert(V1.getSimpleValueType() == VT && "Bad operand type!");
11926 assert(V2.getSimpleValueType() == VT && "Bad operand type!");
11928 ArrayRef<int> LoMask = Mask.slice(0, Mask.size() / 2);
11929 ArrayRef<int> HiMask = Mask.slice(Mask.size() / 2);
11931 int NumElements = VT.getVectorNumElements();
11932 int SplitNumElements = NumElements / 2;
11933 MVT ScalarVT = VT.getVectorElementType();
11934 MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
11936 // Rather than splitting build-vectors, just build two narrower build
11937 // vectors. This helps shuffling with splats and zeros.
11938 auto SplitVector = [&](SDValue V) {
11939 V = peekThroughBitcasts(V);
11941 MVT OrigVT = V.getSimpleValueType();
11942 int OrigNumElements = OrigVT.getVectorNumElements();
11943 int OrigSplitNumElements = OrigNumElements / 2;
11944 MVT OrigScalarVT = OrigVT.getVectorElementType();
11945 MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
11949 auto *BV = dyn_cast<BuildVectorSDNode>(V);
11951 LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11952 DAG.getIntPtrConstant(0, DL));
11953 HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
11954 DAG.getIntPtrConstant(OrigSplitNumElements, DL));
11957 SmallVector<SDValue, 16> LoOps, HiOps;
11958 for (int i = 0; i < OrigSplitNumElements; ++i) {
11959 LoOps.push_back(BV->getOperand(i));
11960 HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
11962 LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
11963 HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
11965 return std::make_pair(DAG.getBitcast(SplitVT, LoV),
11966 DAG.getBitcast(SplitVT, HiV));
11969 SDValue LoV1, HiV1, LoV2, HiV2;
11970 std::tie(LoV1, HiV1) = SplitVector(V1);
11971 std::tie(LoV2, HiV2) = SplitVector(V2);
11973 // Now create two 4-way blends of these half-width vectors.
11974 auto HalfBlend = [&](ArrayRef<int> HalfMask) {
11975 bool UseLoV1 = false, UseHiV1 = false, UseLoV2 = false, UseHiV2 = false;
11976 SmallVector<int, 32> V1BlendMask((unsigned)SplitNumElements, -1);
11977 SmallVector<int, 32> V2BlendMask((unsigned)SplitNumElements, -1);
11978 SmallVector<int, 32> BlendMask((unsigned)SplitNumElements, -1);
11979 for (int i = 0; i < SplitNumElements; ++i) {
11980 int M = HalfMask[i];
11981 if (M >= NumElements) {
11982 if (M >= NumElements + SplitNumElements)
11983 UseHiV2 = true;
11984 else
11985 UseLoV2 = true;
11986 V2BlendMask[i] = M - NumElements;
11987 BlendMask[i] = SplitNumElements + i;
11988 } else if (M >= 0) {
11989 if (M >= SplitNumElements)
11990 UseHiV1 = true;
11991 else
11992 UseLoV1 = true;
11993 V1BlendMask[i] = M;
11994 BlendMask[i] = i;
11995 }
11996 }
11998 // Because the lowering happens after all combining takes place, we need to
11999 // manually combine these blend masks as much as possible so that we create
12000 // a minimal number of high-level vector shuffle nodes.
12002 // First try just blending the halves of V1 or V2.
12003 if (!UseLoV1 && !UseHiV1 && !UseLoV2 && !UseHiV2)
12004 return DAG.getUNDEF(SplitVT);
12005 if (!UseLoV2 && !UseHiV2)
12006 return DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12007 if (!UseLoV1 && !UseHiV1)
12008 return DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12010 SDValue V1Blend, V2Blend;
12011 if (UseLoV1 && UseHiV1) {
12012 V1Blend =
12013 DAG.getVectorShuffle(SplitVT, DL, LoV1, HiV1, V1BlendMask);
12014 } else {
12015 // We only use half of V1 so map the usage down into the final blend mask.
12016 V1Blend = UseLoV1 ? LoV1 : HiV1;
12017 for (int i = 0; i < SplitNumElements; ++i)
12018 if (BlendMask[i] >= 0 && BlendMask[i] < SplitNumElements)
12019 BlendMask[i] = V1BlendMask[i] - (UseLoV1 ? 0 : SplitNumElements);
12020 }
12021 if (UseLoV2 && UseHiV2) {
12022 V2Blend =
12023 DAG.getVectorShuffle(SplitVT, DL, LoV2, HiV2, V2BlendMask);
12024 } else {
12025 // We only use half of V2 so map the usage down into the final blend mask.
12026 V2Blend = UseLoV2 ? LoV2 : HiV2;
12027 for (int i = 0; i < SplitNumElements; ++i)
12028 if (BlendMask[i] >= SplitNumElements)
12029 BlendMask[i] = V2BlendMask[i] + (UseLoV2 ? SplitNumElements : 0);
12031 return DAG.getVectorShuffle(SplitVT, DL, V1Blend, V2Blend, BlendMask);
12033 SDValue Lo = HalfBlend(LoMask);
12034 SDValue Hi = HalfBlend(HiMask);
12035 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
12038 /// \brief Either split a vector in halves or decompose the shuffles and the
12039 /// blend.
12041 /// This is provided as a good fallback for many lowerings of non-single-input
12042 /// shuffles with more than one 128-bit lane. In those cases, we want to select
12043 /// between splitting the shuffle into 128-bit components and stitching those
12044 /// back together vs. extracting the single-input shuffles and blending those
12045 /// results.
12046 static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
12047 SDValue V1, SDValue V2,
12048 ArrayRef<int> Mask,
12049 SelectionDAG &DAG) {
12050 assert(!V2.isUndef() && "This routine must not be used to lower single-input "
12051 "shuffles as it could then recurse on itself.");
12052 int Size = Mask.size();
12054 // If this can be modeled as a broadcast of two elements followed by a blend,
12055 // prefer that lowering. This is especially important because broadcasts can
12056 // often fold with memory operands.
12057 auto DoBothBroadcast = [&] {
12058 int V1BroadcastIdx = -1, V2BroadcastIdx = -1;
12059 for (int M : Mask)
12060 if (M >= Size) {
12061 if (V2BroadcastIdx < 0)
12062 V2BroadcastIdx = M - Size;
12063 else if (M - Size != V2BroadcastIdx)
12065 } else if (M >= 0) {
12066 if (V1BroadcastIdx < 0)
12067 V1BroadcastIdx = M;
12068 else if (M != V1BroadcastIdx)
12073 if (DoBothBroadcast())
12074 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask,
12077 // If the inputs all stem from a single 128-bit lane of each input, then we
12078 // split them rather than blending because the split will decompose to
12079 // unusually few instructions.
12080 int LaneCount = VT.getSizeInBits() / 128;
12081 int LaneSize = Size / LaneCount;
12082 SmallBitVector LaneInputs[2];
12083 LaneInputs[0].resize(LaneCount, false);
12084 LaneInputs[1].resize(LaneCount, false);
12085 for (int i = 0; i < Size; ++i)
12086 if (Mask[i] >= 0)
12087 LaneInputs[Mask[i] / Size][(Mask[i] % Size) / LaneSize] = true;
12088 if (LaneInputs[0].count() <= 1 && LaneInputs[1].count() <= 1)
12089 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12091 // Otherwise, just fall back to decomposed shuffles and a blend. This requires
12092 // that the decomposed single-input shuffles don't end up here.
12093 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, VT, V1, V2, Mask, DAG);
12096 /// \brief Lower a vector shuffle crossing multiple 128-bit lanes as
12097 /// a permutation and blend of those lanes.
12099 /// This essentially blends the out-of-lane inputs to each lane into the lane
12100 /// from a permuted copy of the vector. This lowering strategy results in four
12101 /// instructions in the worst case for a single-input cross lane shuffle which
12102 /// is lower than any other fully general cross-lane shuffle strategy I'm aware
12103 /// of. Special cases for each particular shuffle pattern should be handled
12104 /// prior to trying this lowering.
12105 static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
12106 SDValue V1, SDValue V2,
12107 ArrayRef<int> Mask,
12108 SelectionDAG &DAG) {
12109 // FIXME: This should probably be generalized for 512-bit vectors as well.
12110 assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
12111 int Size = Mask.size();
12112 int LaneSize = Size / 2;
12114 // If there are only inputs from one 128-bit lane, splitting will in fact be
12115 // less expensive. The flags track whether the given lane contains an element
12116 // that crosses to another lane.
12117 bool LaneCrossing[2] = {false, false};
12118 for (int i = 0; i < Size; ++i)
12119 if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
12120 LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
12121 if (!LaneCrossing[0] || !LaneCrossing[1])
12122 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
12124 assert(V2.isUndef() &&
12125 "This last part of this routine only works on single input shuffles");
12127 SmallVector<int, 32> FlippedBlendMask(Size);
12128 for (int i = 0; i < Size; ++i)
12129 FlippedBlendMask[i] =
12130 Mask[i] < 0 ? -1 : (((Mask[i] % Size) / LaneSize == i / LaneSize)
12131 ? Mask[i]
12132 : Mask[i] % LaneSize +
12133 (i / LaneSize) * LaneSize + Size);
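// For illustration: for a single-input v8f32 mask <7, 1, 2, 3, 4, 5, 6, 0>,
// Flipped below is <V1[4..7], V1[0..3]> and FlippedBlendMask becomes
// <11, 1, 2, 3, 4, 5, 6, 12>, so the final shuffle stays within each lane.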
12135 // Flip the vector, and blend the results which should now be in-lane. The
12136 // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
12137 // 5 for the high source. The value 3 selects the high half of source 2 and
12138 // the value 2 selects the low half of source 2. We only use source 2 to
12139 // allow folding it into a memory operand.
12140 unsigned PERMMask = 3 | 2 << 4;
12141 SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
12142 V1, DAG.getConstant(PERMMask, DL, MVT::i8));
12143 return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
12146 /// \brief Handle lowering 2-lane 128-bit shuffles.
12147 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
12148 SDValue V2, ArrayRef<int> Mask,
12149 const APInt &Zeroable,
12150 const X86Subtarget &Subtarget,
12151 SelectionDAG &DAG) {
12152 SmallVector<int, 4> WidenedMask;
12153 if (!canWidenShuffleElements(Mask, WidenedMask))
12156 // TODO: If minimizing size and one of the inputs is a zero vector and the
12157 // zero vector has only one use, we could use a VPERM2X128 to save the
12158 // instruction bytes needed to explicitly generate the zero vector.
12160 // Blends are faster and handle all the non-lane-crossing cases.
12161 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, VT, V1, V2, Mask,
12162 Zeroable, Subtarget, DAG))
12165 bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
12166 bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
12168 // If either input operand is a zero vector, use VPERM2X128 because its mask
12169 // allows us to replace the zero input with an implicit zero.
12170 if (!IsV1Zero && !IsV2Zero) {
12171 // Check for patterns which can be matched with a single insert of a 128-bit
12172 // subvector.
12173 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
12174 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
12175 // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
12176 if (Subtarget.hasAVX2() && V2.isUndef())
12179 // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
12180 // this will likely become vinsertf128 which can't fold a 256-bit memop.
12181 if (!isa<LoadSDNode>(peekThroughBitcasts(V1))) {
12182 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(),
12183 VT.getVectorNumElements() / 2);
12184 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
12185 DAG.getIntPtrConstant(0, DL));
12186 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
12187 OnlyUsesV1 ? V1 : V2,
12188 DAG.getIntPtrConstant(0, DL));
12189 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
12194 // Otherwise form a 128-bit permutation. After accounting for undefs,
12195 // convert the 64-bit shuffle mask selection values into 128-bit
12196 // selection bits by dividing the indexes by 2 and shifting into positions
12197 // defined by a vperm2*128 instruction's immediate control byte.
12199 // The immediate permute control byte looks like this:
12200 // [1:0] - select 128 bits from sources for low half of destination
12201 // [2]   - ignore
12202 // [3]   - zero low half of destination
12203 // [5:4] - select 128 bits from sources for high half of destination
12204 // [6]   - ignore
12205 // [7]   - zero high half of destination
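// For illustration: WidenedMask <1, 2> (upper half of V1, lower half of V2)
// produces PermMask = 1 | (2 << 4) = 0x21 below.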
12207 int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
12208 int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
12210 unsigned PermMask = MaskLO | (MaskHI << 4);
12212 // If either input is a zero vector, replace it with an undef input.
12213 // Shuffle mask values < 4 are selecting elements of V1.
12214 // Shuffle mask values >= 4 are selecting elements of V2.
12215 // Adjust each half of the permute mask by clearing the half that was
12216 // selecting the zero vector and setting the zero mask bit.
12217 if (IsV1Zero) {
12218 V1 = DAG.getUNDEF(VT);
12219 if (MaskLO < 2)
12220 PermMask = (PermMask & 0xf0) | 0x08;
12221 if (MaskHI < 2)
12222 PermMask = (PermMask & 0x0f) | 0x80;
12223 }
12224 if (IsV2Zero) {
12225 V2 = DAG.getUNDEF(VT);
12226 if (MaskLO >= 2)
12227 PermMask = (PermMask & 0xf0) | 0x08;
12228 if (MaskHI >= 2)
12229 PermMask = (PermMask & 0x0f) | 0x80;
12230 }
12232 return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
12233 DAG.getConstant(PermMask, DL, MVT::i8));
12236 /// \brief Lower a vector shuffle by first fixing the 128-bit lanes and then
12237 /// shuffling each lane.
12239 /// This will only succeed when the result of fixing the 128-bit lanes results
12240 /// in a single-input non-lane-crossing shuffle with a repeating shuffle mask in
12241 /// each 128-bit lane. This handles many cases where we can quickly blend away
12242 /// the lane crosses early and then use simpler shuffles within each lane.
12244 /// FIXME: It might be worthwhile at some point to support this without
12245 /// requiring the 128-bit lane-relative shuffles to be repeating, but currently
12246 /// in x86 only floating point has interesting non-repeating shuffles, and even
12247 /// those are still *marginally* more expensive.
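/// For example (illustrative), the two-input v8i32 mask <4, 5, 6, 7, 8, 9, 10,
/// 11> first performs a v4i64 lane shuffle with the mask <2, 3, 4, 5>, and the
/// remaining in-lane shuffle is then the identity.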
12248 static SDValue lowerVectorShuffleByMerging128BitLanes(
12249 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12250 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12251 assert(!V2.isUndef() && "This is only useful with multiple inputs.");
12253 int Size = Mask.size();
12254 int LaneSize = 128 / VT.getScalarSizeInBits();
12255 int NumLanes = Size / LaneSize;
12256 assert(NumLanes > 1 && "Only handles 256-bit and wider shuffles.");
12258 // See if we can build a hypothetical 128-bit lane-fixing shuffle mask. Also
12259 // check whether the in-128-bit lane shuffles share a repeating pattern.
12260 SmallVector<int, 4> Lanes((unsigned)NumLanes, -1);
12261 SmallVector<int, 4> InLaneMask((unsigned)LaneSize, -1);
12262 for (int i = 0; i < Size; ++i) {
12263 if (Mask[i] < 0)
12264 continue;
12266 int j = i / LaneSize;
12268 if (Lanes[j] < 0) {
12269 // First entry we've seen for this lane.
12270 Lanes[j] = Mask[i] / LaneSize;
12271 } else if (Lanes[j] != Mask[i] / LaneSize) {
12272 // This doesn't match the lane selected previously!
12273 return SDValue();
12274 }
12276 // Check that within each lane we have a consistent shuffle mask.
12277 int k = i % LaneSize;
12278 if (InLaneMask[k] < 0) {
12279 InLaneMask[k] = Mask[i] % LaneSize;
12280 } else if (InLaneMask[k] != Mask[i] % LaneSize) {
12281 // This doesn't fit a repeating in-lane mask.
12282 return SDValue();
12283 }
12284 }
12286 // First shuffle the lanes into place.
12287 MVT LaneVT = MVT::getVectorVT(VT.isFloatingPoint() ? MVT::f64 : MVT::i64,
12288 VT.getSizeInBits() / 64);
12289 SmallVector<int, 8> LaneMask((unsigned)NumLanes * 2, -1);
12290 for (int i = 0; i < NumLanes; ++i)
12291 if (Lanes[i] >= 0) {
12292 LaneMask[2 * i + 0] = 2*Lanes[i] + 0;
12293 LaneMask[2 * i + 1] = 2*Lanes[i] + 1;
12296 V1 = DAG.getBitcast(LaneVT, V1);
12297 V2 = DAG.getBitcast(LaneVT, V2);
12298 SDValue LaneShuffle = DAG.getVectorShuffle(LaneVT, DL, V1, V2, LaneMask);
12300 // Cast it back to the type we actually want.
12301 LaneShuffle = DAG.getBitcast(VT, LaneShuffle);
12303 // Now do a simple shuffle that isn't lane crossing.
12304 SmallVector<int, 8> NewMask((unsigned)Size, -1);
12305 for (int i = 0; i < Size; ++i)
12307 NewMask[i] = (i / LaneSize) * LaneSize + Mask[i] % LaneSize;
12308 assert(!is128BitLaneCrossingShuffleMask(VT, NewMask) &&
12309 "Must not introduce lane crosses at this point!");
12311 return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
12314 /// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
12315 /// This allows for fast cases such as subvector extraction/insertion
12316 /// or shuffling smaller vector types which can lower more efficiently.
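/// For example, the v8f32 mask <2, 10, 3, 11, u, u, u, u> only references the
/// lower halves of V1 and V2, so it can be performed as the v4f32 shuffle
/// <2, 6, 3, 7> of the two extracted 128-bit halves and then inserted into the
/// lower half of a 256-bit undef vector.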
12317 static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
12318 SDValue V1, SDValue V2,
12319 ArrayRef<int> Mask,
12320 const X86Subtarget &Subtarget,
12321 SelectionDAG &DAG) {
12322 assert(VT.is256BitVector() && "Expected 256-bit vector");
12324 unsigned NumElts = VT.getVectorNumElements();
12325 unsigned HalfNumElts = NumElts / 2;
12326 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(), HalfNumElts);
12328 bool UndefLower = isUndefInRange(Mask, 0, HalfNumElts);
12329 bool UndefUpper = isUndefInRange(Mask, HalfNumElts, HalfNumElts);
12330 if (!UndefLower && !UndefUpper)
12333 // Upper half is undef and lower half is whole upper subvector.
12334 // e.g. vector_shuffle <4, 5, 6, 7, u, u, u, u> or <2, 3, u, u>
12336 isSequentialOrUndefInRange(Mask, 0, HalfNumElts, HalfNumElts)) {
12337 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12338 DAG.getIntPtrConstant(HalfNumElts, DL));
12339 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12340 DAG.getIntPtrConstant(0, DL));
12343 // Lower half is undef and upper half is whole lower subvector.
12344 // e.g. vector_shuffle <u, u, u, u, 0, 1, 2, 3> or <u, u, 0, 1>
12346 isSequentialOrUndefInRange(Mask, HalfNumElts, HalfNumElts, 0)) {
12347 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V1,
12348 DAG.getIntPtrConstant(0, DL));
12349 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), Hi,
12350 DAG.getIntPtrConstant(HalfNumElts, DL));
12353 // If the shuffle only uses two of the four halves of the input operands,
12354 // then extract them and perform the 'half' shuffle at half width.
12355 // e.g. vector_shuffle <X, X, X, X, u, u, u, u> or <X, X, u, u>
12356 int HalfIdx1 = -1, HalfIdx2 = -1;
12357 SmallVector<int, 8> HalfMask(HalfNumElts);
12358 unsigned Offset = UndefLower ? HalfNumElts : 0;
12359 for (unsigned i = 0; i != HalfNumElts; ++i) {
12360 int M = Mask[i + Offset];
12366 // Determine which of the 4 half vectors this element is from.
12367 // i.e. 0 = Lower V1, 1 = Upper V1, 2 = Lower V2, 3 = Upper V2.
12368 int HalfIdx = M / HalfNumElts;
12370 // Determine the element index into its half vector source.
12371 int HalfElt = M % HalfNumElts;
12373 // We can shuffle with up to 2 half vectors, set the new 'half'
12374 // shuffle mask accordingly.
12375 if (HalfIdx1 < 0 || HalfIdx1 == HalfIdx) {
12376 HalfMask[i] = HalfElt;
12377 HalfIdx1 = HalfIdx;
12380 if (HalfIdx2 < 0 || HalfIdx2 == HalfIdx) {
12381 HalfMask[i] = HalfElt + HalfNumElts;
12382 HalfIdx2 = HalfIdx;
12386 // Too many half vectors referenced.
12389 assert(HalfMask.size() == HalfNumElts && "Unexpected shuffle mask length");
12391 // Only shuffle the halves of the inputs when useful.
12392 int NumLowerHalves =
12393 (HalfIdx1 == 0 || HalfIdx1 == 2) + (HalfIdx2 == 0 || HalfIdx2 == 2);
12394 int NumUpperHalves =
12395 (HalfIdx1 == 1 || HalfIdx1 == 3) + (HalfIdx2 == 1 || HalfIdx2 == 3);
12397 // uuuuXXXX - don't extract uppers just to insert again.
12398 if (UndefLower && NumUpperHalves != 0)
12401 // XXXXuuuu - don't extract both uppers, instead shuffle and then extract.
12402 if (UndefUpper && NumUpperHalves == 2)
12405 // AVX2 - XXXXuuuu - always extract lowers.
12406 if (Subtarget.hasAVX2() && !(UndefUpper && NumUpperHalves == 0)) {
12407 // AVX2 supports efficient immediate 64-bit element cross-lane shuffles.
12408 if (VT == MVT::v4f64 || VT == MVT::v4i64)
12410 // AVX2 supports variable 32-bit element cross-lane shuffles.
12411 if (VT == MVT::v8f32 || VT == MVT::v8i32) {
12412 // XXXXuuuu - don't extract lowers and uppers.
12413 if (UndefUpper && NumLowerHalves != 0 && NumUpperHalves != 0)
12418 auto GetHalfVector = [&](int HalfIdx) {
12420 return DAG.getUNDEF(HalfVT);
12421 SDValue V = (HalfIdx < 2 ? V1 : V2);
12422 HalfIdx = (HalfIdx % 2) * HalfNumElts;
12423 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, HalfVT, V,
12424 DAG.getIntPtrConstant(HalfIdx, DL));
12427 SDValue Half1 = GetHalfVector(HalfIdx1);
12428 SDValue Half2 = GetHalfVector(HalfIdx2);
12429 SDValue V = DAG.getVectorShuffle(HalfVT, DL, Half1, Half2, HalfMask);
12430 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V,
12431 DAG.getIntPtrConstant(Offset, DL));
12434 /// \brief Test whether the specified input (0 or 1) is in-place blended by the given mask.
12437 /// This returns true if the elements from a particular input are already in the
12438 /// slots required by the given mask and require no permutation.
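/// For example, with the v4f64 mask <0, 4, 2, 6> input 0 is in place (its
/// elements 0 and 2 already occupy result slots 0 and 2), while input 1 is not
/// (element 0 of V2 lands in result slot 1 rather than slot 0).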
12439 static bool isShuffleMaskInputInPlace(int Input, ArrayRef<int> Mask) {
12440 assert((Input == 0 || Input == 1) && "Only two inputs to shuffles.");
12441 int Size = Mask.size();
12442 for (int i = 0; i < Size; ++i)
12443 if (Mask[i] >= 0 && Mask[i] / Size == Input && Mask[i] % Size != i)
12449 /// Handle case where shuffle sources are coming from the same 128-bit lane and
12450 /// every lane can be represented as the same repeating mask - allowing us to
12451 /// shuffle the sources with the repeating shuffle and then permute the result
12452 /// to the destination lanes.
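/// For example, the single-input v8f32 mask <4, 5, 5, 4, 0, 1, 1, 0> uses the
/// same in-lane pattern <0, 1, 1, 0> in both destination lanes, so we can
/// first perform the repeating shuffle <0, 1, 1, 0, 4, 5, 5, 4> and then swap
/// the two 128-bit lanes into their destination positions.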
12453 static SDValue lowerShuffleAsRepeatedMaskAndLanePermute(
12454 const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
12455 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
12456 int NumElts = VT.getVectorNumElements();
12457 int NumLanes = VT.getSizeInBits() / 128;
12458 int NumLaneElts = NumElts / NumLanes;
12460 // On AVX2 we may be able to just shuffle the lowest elements and then
12461 // broadcast the result.
12462 if (Subtarget.hasAVX2()) {
12463 for (unsigned BroadcastSize : {16, 32, 64}) {
12464 if (BroadcastSize <= VT.getScalarSizeInBits())
12466 int NumBroadcastElts = BroadcastSize / VT.getScalarSizeInBits();
12468 // Attempt to match a repeating pattern every NumBroadcastElts,
12469 // accounting for UNDEFs, that only references the lowest 128-bit
12470 // lane of the inputs.
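// For example, the v8i32 mask <1, 0, 1, 0, 1, 0, 1, 0> repeats every two
// elements and only references lane 0 of V1, so we can shuffle <1, 0> into
// the lowest 64 bits and then broadcast that 64-bit element across the
// vector.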
12471 auto FindRepeatingBroadcastMask = [&](SmallVectorImpl<int> &RepeatMask) {
12472 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12473 for (int j = 0; j != NumBroadcastElts; ++j) {
12474 int M = Mask[i + j];
12477 int &R = RepeatMask[j];
12478 if (0 != ((M % NumElts) / NumLaneElts))
12480 if (0 <= R && R != M)
12487 SmallVector<int, 8> RepeatMask((unsigned)NumElts, -1);
12488 if (!FindRepeatingBroadcastMask(RepeatMask))
12491 // Shuffle the (lowest) repeated elements in place for broadcast.
12492 SDValue RepeatShuf = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatMask);
12494 // Shuffle the actual broadcast.
12495 SmallVector<int, 8> BroadcastMask((unsigned)NumElts, -1);
12496 for (int i = 0; i != NumElts; i += NumBroadcastElts)
12497 for (int j = 0; j != NumBroadcastElts; ++j)
12498 BroadcastMask[i + j] = j;
12499 return DAG.getVectorShuffle(VT, DL, RepeatShuf, DAG.getUNDEF(VT),
12504 // Bail if the shuffle mask doesn't cross 128-bit lanes.
12505 if (!is128BitLaneCrossingShuffleMask(VT, Mask))
12508 // Bail if we already have a repeated lane shuffle mask.
12509 SmallVector<int, 8> RepeatedShuffleMask;
12510 if (is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedShuffleMask))
12513 // On AVX2 targets we can permute 256-bit vectors as 64-bit sub-lanes
12514 // (with PERMQ/PERMPD), otherwise we can only permute whole 128-bit lanes.
12515 int SubLaneScale = Subtarget.hasAVX2() && VT.is256BitVector() ? 2 : 1;
12516 int NumSubLanes = NumLanes * SubLaneScale;
12517 int NumSubLaneElts = NumLaneElts / SubLaneScale;
12519 // Check that all the sources are coming from the same lane and see if we can
12520 // form a repeating shuffle mask (local to each sub-lane). At the same time,
12521 // determine the source sub-lane for each destination sub-lane.
12522 int TopSrcSubLane = -1;
12523 SmallVector<int, 8> Dst2SrcSubLanes((unsigned)NumSubLanes, -1);
12524 SmallVector<int, 8> RepeatedSubLaneMasks[2] = {
12525 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef),
12526 SmallVector<int, 8>((unsigned)NumSubLaneElts, SM_SentinelUndef)};
12528 for (int DstSubLane = 0; DstSubLane != NumSubLanes; ++DstSubLane) {
12529 // Extract the sub-lane mask, check that it all comes from the same lane
12530 // and normalize the mask entries to come from the first lane.
12532 SmallVector<int, 8> SubLaneMask((unsigned)NumSubLaneElts, -1);
12533 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12534 int M = Mask[(DstSubLane * NumSubLaneElts) + Elt];
12537 int Lane = (M % NumElts) / NumLaneElts;
12538 if ((0 <= SrcLane) && (SrcLane != Lane))
12541 int LocalM = (M % NumLaneElts) + (M < NumElts ? 0 : NumElts);
12542 SubLaneMask[Elt] = LocalM;
12545 // Whole sub-lane is UNDEF.
12549 // Attempt to match against the candidate repeated sub-lane masks.
12550 for (int SubLane = 0; SubLane != SubLaneScale; ++SubLane) {
12551 auto MatchMasks = [NumSubLaneElts](ArrayRef<int> M1, ArrayRef<int> M2) {
12552 for (int i = 0; i != NumSubLaneElts; ++i) {
12553 if (M1[i] < 0 || M2[i] < 0)
12555 if (M1[i] != M2[i])
12561 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane];
12562 if (!MatchMasks(SubLaneMask, RepeatedSubLaneMask))
12565 // Merge the sub-lane mask into the matching repeated sub-lane mask.
12566 for (int i = 0; i != NumSubLaneElts; ++i) {
12567 int M = SubLaneMask[i];
12570 assert((RepeatedSubLaneMask[i] < 0 || RepeatedSubLaneMask[i] == M) &&
12571 "Unexpected mask element");
12572 RepeatedSubLaneMask[i] = M;
12575 // Track the top most source sub-lane - by setting the remaining to UNDEF
12576 // we can greatly simplify shuffle matching.
12577 int SrcSubLane = (SrcLane * SubLaneScale) + SubLane;
12578 TopSrcSubLane = std::max(TopSrcSubLane, SrcSubLane);
12579 Dst2SrcSubLanes[DstSubLane] = SrcSubLane;
12583 // Bail if we failed to find a matching repeated sub-lane mask.
12584 if (Dst2SrcSubLanes[DstSubLane] < 0)
12587 assert(0 <= TopSrcSubLane && TopSrcSubLane < NumSubLanes &&
12588 "Unexpected source lane");
12590 // Create a repeating shuffle mask for the entire vector.
12591 SmallVector<int, 8> RepeatedMask((unsigned)NumElts, -1);
12592 for (int SubLane = 0; SubLane <= TopSrcSubLane; ++SubLane) {
12593 int Lane = SubLane / SubLaneScale;
12594 auto &RepeatedSubLaneMask = RepeatedSubLaneMasks[SubLane % SubLaneScale];
12595 for (int Elt = 0; Elt != NumSubLaneElts; ++Elt) {
12596 int M = RepeatedSubLaneMask[Elt];
12599 int Idx = (SubLane * NumSubLaneElts) + Elt;
12600 RepeatedMask[Idx] = M + (Lane * NumLaneElts);
12603 SDValue RepeatedShuffle = DAG.getVectorShuffle(VT, DL, V1, V2, RepeatedMask);
12605 // Shuffle each source sub-lane to its destination.
12606 SmallVector<int, 8> SubLaneMask((unsigned)NumElts, -1);
12607 for (int i = 0; i != NumElts; i += NumSubLaneElts) {
12608 int SrcSubLane = Dst2SrcSubLanes[i / NumSubLaneElts];
12609 if (SrcSubLane < 0)
12611 for (int j = 0; j != NumSubLaneElts; ++j)
12612 SubLaneMask[i + j] = j + (SrcSubLane * NumSubLaneElts);
12615 return DAG.getVectorShuffle(VT, DL, RepeatedShuffle, DAG.getUNDEF(VT),
12619 static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
12620 unsigned &ShuffleImm,
12621 ArrayRef<int> Mask) {
12622 int NumElts = VT.getVectorNumElements();
12623 assert(VT.getScalarSizeInBits() == 64 &&
12624 (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
12625 "Unexpected data type for VSHUFPD");
12627 // Mask for V8F64: 0/1, 8/9, 2/3, 10/11, 4/5, ..
12628 // Mask for V4F64: 0/1, 4/5, 2/3, 6/7, ..
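// For example, the v4f64 mask <1, 5, 2, 6> picks the high double of each
// input in the low 128-bit lane and the low double of each input in the high
// lane, giving ShuffleImm = 0x3.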
12630 bool ShufpdMask = true;
12631 bool CommutableMask = true;
12632 for (int i = 0; i < NumElts; ++i) {
12633 if (Mask[i] == SM_SentinelUndef)
12637 int Val = (i & 6) + NumElts * (i & 1);
12638 int CommutVal = (i & 0xe) + NumElts * ((i & 1) ^ 1);
12639 if (Mask[i] < Val || Mask[i] > Val + 1)
12640 ShufpdMask = false;
12641 if (Mask[i] < CommutVal || Mask[i] > CommutVal + 1)
12642 CommutableMask = false;
12643 ShuffleImm |= (Mask[i] % 2) << i;
12648 if (CommutableMask) {
12656 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
12657 ArrayRef<int> Mask, SDValue V1,
12658 SDValue V2, SelectionDAG &DAG) {
12659 assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64) &&
12660 "Unexpected data type for VSHUFPD");
12662 unsigned Immediate = 0;
12663 if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
12666 return DAG.getNode(X86ISD::SHUFP, DL, VT, V1, V2,
12667 DAG.getConstant(Immediate, DL, MVT::i8));
12670 static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
12671 ArrayRef<int> Mask, SDValue V1,
12672 SDValue V2, SelectionDAG &DAG) {
12673 MVT MaskEltVT = MVT::getIntegerVT(VT.getScalarSizeInBits());
12674 MVT MaskVecVT = MVT::getVectorVT(MaskEltVT, VT.getVectorNumElements());
12676 SDValue MaskNode = getConstVector(Mask, MaskVecVT, DAG, DL, true);
12678 return DAG.getNode(X86ISD::VPERMV, DL, VT, MaskNode, V1);
12680 return DAG.getNode(X86ISD::VPERMV3, DL, VT, V1, MaskNode, V2);
12683 /// \brief Handle lowering of 4-lane 64-bit floating point shuffles.
12685 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
12686 /// isn't available.
12687 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12688 const APInt &Zeroable,
12689 SDValue V1, SDValue V2,
12690 const X86Subtarget &Subtarget,
12691 SelectionDAG &DAG) {
12692 assert(V1.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12693 assert(V2.getSimpleValueType() == MVT::v4f64 && "Bad operand type!");
12694 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12696 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4f64, V1, V2, Mask,
12697 Zeroable, Subtarget, DAG))
12700 if (V2.isUndef()) {
12701 // Check for being able to broadcast a single element.
12702 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(
12703 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12706 // Use low duplicate instructions for masks that match their pattern.
12707 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2}))
12708 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v4f64, V1);
12710 if (!is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask)) {
12711 // Non-half-crossing single input shuffles can be lowered with an
12712 // interleaved permutation.
12713 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
12714 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3);
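// For example, the mask <1, 0, 3, 2> (swap the two doubles within each
// 128-bit lane) yields VPERMILPMask = 0b0101.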
12715 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v4f64, V1,
12716 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
12719 // With AVX2 we have direct support for this permutation.
12720 if (Subtarget.hasAVX2())
12721 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4f64, V1,
12722 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12724 // Try to create an in-lane repeating shuffle mask and then shuffle the
12725 // results into the target lanes.
12726 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12727 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12730 // Otherwise, fall back.
12731 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
12735 // Use dedicated unpack instructions for masks that match their pattern.
12737 lowerVectorShuffleWithUNPCK(DL, MVT::v4f64, Mask, V1, V2, DAG))
12740 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4f64, V1, V2, Mask,
12741 Zeroable, Subtarget, DAG))
12744 // Check if the blend happens to exactly fit that of SHUFPD.
12746 lowerVectorShuffleWithSHUFPD(DL, MVT::v4f64, Mask, V1, V2, DAG))
12749 // Try to create an in-lane repeating shuffle mask and then shuffle the
12750 // results into the target lanes.
12751 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12752 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12755 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12756 // shuffle. However, if we have AVX2 and either input is already in place,
12757 // we will be able to shuffle the other input even across lanes in a single
12758 // instruction, so skip this pattern.
12759 if (!(Subtarget.hasAVX2() && (isShuffleMaskInputInPlace(0, Mask) ||
12760 isShuffleMaskInputInPlace(1, Mask))))
12761 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12762 DL, MVT::v4f64, V1, V2, Mask, Subtarget, DAG))
12764 // If we have VLX support, we can use VEXPAND.
12765 if (Subtarget.hasVLX())
12766 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask,
12767 V1, V2, DAG, Subtarget))
12770 // If we have AVX2 then we always want to lower with a blend because at v4 we
12771 // can fully permute the elements.
12772 if (Subtarget.hasAVX2())
12773 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4f64, V1, V2,
12776 // Otherwise fall back on generic lowering.
12777 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v4f64, V1, V2, Mask, DAG);
12780 /// \brief Handle lowering of 4-lane 64-bit integer shuffles.
12782 /// This routine is only called when we have AVX2 and thus a reasonable
12783 /// instruction set for v4i64 shuffling.
12784 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12785 const APInt &Zeroable,
12786 SDValue V1, SDValue V2,
12787 const X86Subtarget &Subtarget,
12788 SelectionDAG &DAG) {
12789 assert(V1.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12790 assert(V2.getSimpleValueType() == MVT::v4i64 && "Bad operand type!");
12791 assert(Mask.size() == 4 && "Unexpected mask size for v4 shuffle!");
12792 assert(Subtarget.hasAVX2() && "We can only lower v4i64 with AVX2!");
12794 if (SDValue V = lowerV2X128VectorShuffle(DL, MVT::v4i64, V1, V2, Mask,
12795 Zeroable, Subtarget, DAG))
12798 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v4i64, V1, V2, Mask,
12799 Zeroable, Subtarget, DAG))
12802 // Check for being able to broadcast a single element.
12803 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v4i64, V1, V2,
12804 Mask, Subtarget, DAG))
12807 if (V2.isUndef()) {
12808 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
12809 // can use lower latency instructions that will operate on both lanes.
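// For example, the v4i64 mask <1, 0, 3, 2> repeats as <1, 0> in each lane;
// scaling it to 32-bit elements gives the PSHUFD mask <2, 3, 0, 1>.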
12810 SmallVector<int, 2> RepeatedMask;
12811 if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
12812 SmallVector<int, 4> PSHUFDMask;
12813 scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
12814 return DAG.getBitcast(
12816 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
12817 DAG.getBitcast(MVT::v8i32, V1),
12818 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
12821 // AVX2 provides a direct instruction for permuting a single input across lanes.
12823 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v4i64, V1,
12824 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
12827 // Try to use shift instructions.
12828 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v4i64, V1, V2, Mask,
12829 Zeroable, Subtarget, DAG))
12832 // If we have VLX support, we can use VALIGN or VEXPAND.
12833 if (Subtarget.hasVLX()) {
12834 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i64, V1, V2,
12835 Mask, Subtarget, DAG))
12838 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask,
12839 V1, V2, DAG, Subtarget))
12843 // Try to use PALIGNR.
12844 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v4i64, V1, V2,
12845 Mask, Subtarget, DAG))
12848 // Use dedicated unpack instructions for masks that match their pattern.
12850 lowerVectorShuffleWithUNPCK(DL, MVT::v4i64, Mask, V1, V2, DAG))
12853 // Try to simplify this by merging 128-bit lanes to enable a lane-based
12854 // shuffle. However, if we have AVX2 and either input is already in place,
12855 // we will be able to shuffle the other input even across lanes in a single
12856 // instruction, so skip this pattern.
12857 if (!isShuffleMaskInputInPlace(0, Mask) &&
12858 !isShuffleMaskInputInPlace(1, Mask))
12859 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12860 DL, MVT::v4i64, V1, V2, Mask, Subtarget, DAG))
12863 // Otherwise fall back on generic blend lowering.
12864 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v4i64, V1, V2,
12868 /// \brief Handle lowering of 8-lane 32-bit floating point shuffles.
12870 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
12871 /// isn't available.
12872 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12873 const APInt &Zeroable,
12874 SDValue V1, SDValue V2,
12875 const X86Subtarget &Subtarget,
12876 SelectionDAG &DAG) {
12877 assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12878 assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
12879 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12881 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
12882 Zeroable, Subtarget, DAG))
12885 // Check for being able to broadcast a single element.
12886 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8f32, V1, V2,
12887 Mask, Subtarget, DAG))
12890 // If the shuffle mask is repeated in each 128-bit lane, we have many more
12891 // options to efficiently lower the shuffle.
12892 SmallVector<int, 4> RepeatedMask;
12893 if (is128BitLaneRepeatedShuffleMask(MVT::v8f32, Mask, RepeatedMask)) {
12894 assert(RepeatedMask.size() == 4 &&
12895 "Repeated masks must be half the mask width!");
12897 // Use even/odd duplicate instructions for masks that match their pattern.
12898 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
12899 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v8f32, V1);
12900 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
12901 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v8f32, V1);
12904 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f32, V1,
12905 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
12907 // Use dedicated unpack instructions for masks that match their pattern.
12909 lowerVectorShuffleWithUNPCK(DL, MVT::v8f32, Mask, V1, V2, DAG))
12912 // Otherwise, fall back to a SHUFPS sequence. Here it is important that we
12913 // have already handled any direct blends.
12914 return lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask, V1, V2, DAG);
12917 // Try to create an in-lane repeating shuffle mask and then shuffle the
12918 // results into the target lanes.
12919 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
12920 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12923 // If we have a single input shuffle with different shuffle patterns in the
12924 // two 128-bit lanes, use a variable shuffle mask with VPERMILPS.
12925 if (V2.isUndef()) {
12926 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
12927 if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
12928 return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
12930 if (Subtarget.hasAVX2())
12931 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
12933 // Otherwise, fall back.
12934 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
12938 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
12940 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
12941 DL, MVT::v8f32, V1, V2, Mask, Subtarget, DAG))
12943 // If we have VLX support, we can use VEXPAND.
12944 if (Subtarget.hasVLX())
12945 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask,
12946 V1, V2, DAG, Subtarget))
12949 // For non-AVX512 targets, if the mask is of 16-bit elements within each lane, then
12950 // try to split, since after splitting we get more efficient code using vpunpcklwd
12951 // and vpunpckhwd instructions rather than vblend.
12952 if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
12953 if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
12957 // If we have AVX2 then we always want to lower with a blend because at v8 we
12958 // can fully permute the elements.
12959 if (Subtarget.hasAVX2())
12960 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8f32, V1, V2,
12963 // Otherwise fall back on generic lowering.
12964 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2, Mask, DAG);
12967 /// \brief Handle lowering of 8-lane 32-bit integer shuffles.
12969 /// This routine is only called when we have AVX2 and thus a reasonable
12970 /// instruction set for v8i32 shuffling.
12971 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
12972 const APInt &Zeroable,
12973 SDValue V1, SDValue V2,
12974 const X86Subtarget &Subtarget,
12975 SelectionDAG &DAG) {
12976 assert(V1.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12977 assert(V2.getSimpleValueType() == MVT::v8i32 && "Bad operand type!");
12978 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
12979 assert(Subtarget.hasAVX2() && "We can only lower v8i32 with AVX2!");
12981 // Whenever we can lower this as a zext, that instruction is strictly faster
12982 // than any alternative. It also allows us to fold memory operands into the
12983 // shuffle in many cases.
12984 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
12985 DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
12988 // For non-AVX512 targets, if the mask is of 16-bit elements within each lane, then
12989 // try to split, since after splitting we get more efficient code than vblend by
12990 // using vpunpcklwd and vpunpckhwd instructions.
12991 if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
12992 !Subtarget.hasAVX512())
12994 lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
12997 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
12998 Zeroable, Subtarget, DAG))
13001 // Check for being able to broadcast a single element.
13002 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v8i32, V1, V2,
13003 Mask, Subtarget, DAG))
13006 // If the shuffle mask is repeated in each 128-bit lane we can use more
13007 // efficient instructions that mirror the shuffles across the two 128-bit lanes.
13009 SmallVector<int, 4> RepeatedMask;
13010 bool Is128BitLaneRepeatedShuffle =
13011 is128BitLaneRepeatedShuffleMask(MVT::v8i32, Mask, RepeatedMask);
13012 if (Is128BitLaneRepeatedShuffle) {
13013 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13015 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, V1,
13016 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13018 // Use dedicated unpack instructions for masks that match their pattern.
13020 lowerVectorShuffleWithUNPCK(DL, MVT::v8i32, Mask, V1, V2, DAG))
13024 // Try to use shift instructions.
13025 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i32, V1, V2, Mask,
13026 Zeroable, Subtarget, DAG))
13029 // If we have VLX support, we can use VALIGN or EXPAND.
13030 if (Subtarget.hasVLX()) {
13031 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i32, V1, V2,
13032 Mask, Subtarget, DAG))
13035 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask,
13036 V1, V2, DAG, Subtarget))
13040 // Try to use byte rotation instructions.
13041 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13042 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13045 // Try to create an in-lane repeating shuffle mask and then shuffle the
13046 // results into the target lanes.
13047 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13048 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13051 // If the shuffle patterns aren't repeated but it is a single input, directly
13052 // generate a cross-lane VPERMD instruction.
13053 if (V2.isUndef()) {
13054 SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
13055 return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
13058 // Assume that a single SHUFPS is faster than an alternative sequence of
13059 // multiple instructions (even if the CPU has a domain penalty).
13060 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13061 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13062 SDValue CastV1 = DAG.getBitcast(MVT::v8f32, V1);
13063 SDValue CastV2 = DAG.getBitcast(MVT::v8f32, V2);
13064 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v8f32, RepeatedMask,
13065 CastV1, CastV2, DAG);
13066 return DAG.getBitcast(MVT::v8i32, ShufPS);
13069 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
13071 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13072 DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
13075 // Otherwise fall back on generic blend lowering.
13076 return lowerVectorShuffleAsDecomposedShuffleBlend(DL, MVT::v8i32, V1, V2,
13080 /// \brief Handle lowering of 16-lane 16-bit integer shuffles.
13082 /// This routine is only called when we have AVX2 and thus a reasonable
13083 /// instruction set for v16i16 shuffling.
13084 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13085 const APInt &Zeroable,
13086 SDValue V1, SDValue V2,
13087 const X86Subtarget &Subtarget,
13088 SelectionDAG &DAG) {
13089 assert(V1.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13090 assert(V2.getSimpleValueType() == MVT::v16i16 && "Bad operand type!");
13091 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13092 assert(Subtarget.hasAVX2() && "We can only lower v16i16 with AVX2!");
13094 // Whenever we can lower this as a zext, that instruction is strictly faster
13095 // than any alternative. It also allows us to fold memory operands into the
13096 // shuffle in many cases.
13097 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13098 DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13101 // Check for being able to broadcast a single element.
13102 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v16i16, V1, V2,
13103 Mask, Subtarget, DAG))
13106 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i16, V1, V2, Mask,
13107 Zeroable, Subtarget, DAG))
13110 // Use dedicated unpack instructions for masks that match their pattern.
13112 lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
13115 // Try to use shift instructions.
13116 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
13117 Zeroable, Subtarget, DAG))
13120 // Try to use byte rotation instructions.
13121 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13122 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13125 // Try to create an in-lane repeating shuffle mask and then shuffle the
13126 // results into the target lanes.
13127 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13128 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13131 if (V2.isUndef()) {
13132 // There are no generalized cross-lane shuffle operations available on i16 element types.
13134 if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
13135 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
13138 SmallVector<int, 8> RepeatedMask;
13139 if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
13140 // As this is a single-input shuffle, the repeated mask should be
13141 // a strictly valid v8i16 mask that we can pass through to the v8i16
13142 // lowering to handle even the v16 case.
13143 return lowerV8I16GeneralSingleInputVectorShuffle(
13144 DL, MVT::v16i16, V1, RepeatedMask, Subtarget, DAG);
13148 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13149 DL, MVT::v16i16, Mask, V1, V2, Zeroable, Subtarget, DAG))
13152 // AVX512BWVL can lower to VPERMW.
13153 if (Subtarget.hasBWI() && Subtarget.hasVLX())
13154 return lowerVectorShuffleWithPERMV(DL, MVT::v16i16, Mask, V1, V2, DAG);
13156 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
13158 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13159 DL, MVT::v16i16, V1, V2, Mask, Subtarget, DAG))
13162 // Otherwise fall back on generic lowering.
13163 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v16i16, V1, V2, Mask, DAG);
13166 /// \brief Handle lowering of 32-lane 8-bit integer shuffles.
13168 /// This routine is only called when we have AVX2 and thus a reasonable
13169 /// instruction set for v32i8 shuffling.
13170 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13171 const APInt &Zeroable,
13172 SDValue V1, SDValue V2,
13173 const X86Subtarget &Subtarget,
13174 SelectionDAG &DAG) {
13175 assert(V1.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13176 assert(V2.getSimpleValueType() == MVT::v32i8 && "Bad operand type!");
13177 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13178 assert(Subtarget.hasAVX2() && "We can only lower v32i8 with AVX2!");
13180 // Whenever we can lower this as a zext, that instruction is strictly faster
13181 // than any alternative. It also allows us to fold memory operands into the
13182 // shuffle in many cases.
13183 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13184 DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13187 // Check for being able to broadcast a single element.
13188 if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, MVT::v32i8, V1, V2,
13189 Mask, Subtarget, DAG))
13192 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i8, V1, V2, Mask,
13193 Zeroable, Subtarget, DAG))
13196 // Use dedicated unpack instructions for masks that match their pattern.
13198 lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
13201 // Try to use shift instructions.
13202 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
13203 Zeroable, Subtarget, DAG))
13206 // Try to use byte rotation instructions.
13207 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13208 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13211 // Try to create an in-lane repeating shuffle mask and then shuffle the
13212 // results into the target lanes.
13213 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13214 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13217 // There are no generalized cross-lane shuffle operations available on i8 element types.
13219 if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
13220 return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
13223 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13224 DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13227 // Try to simplify this by merging 128-bit lanes to enable a lane-based shuffle.
13229 if (SDValue Result = lowerVectorShuffleByMerging128BitLanes(
13230 DL, MVT::v32i8, V1, V2, Mask, Subtarget, DAG))
13233 // Otherwise fall back on generic lowering.
13234 return lowerVectorShuffleAsSplitOrBlend(DL, MVT::v32i8, V1, V2, Mask, DAG);
13237 /// \brief High-level routine to lower various 256-bit x86 vector shuffles.
13239 /// This routine either breaks down the specific type of a 256-bit x86 vector
13240 /// shuffle or splits it into two 128-bit shuffles and fuses the results back
13241 /// together based on the available instructions.
13242 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13243 MVT VT, SDValue V1, SDValue V2,
13244 const APInt &Zeroable,
13245 const X86Subtarget &Subtarget,
13246 SelectionDAG &DAG) {
13247 // If we have a single input to the zero element, insert that into V1 if we
13248 // can do so cheaply.
13249 int NumElts = VT.getVectorNumElements();
13250 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13252 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13253 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13254 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13257 // Handle special cases where the lower or upper half is UNDEF.
13259 lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
13262 // There is a really nice hard cut-over between AVX1 and AVX2 that means we
13263 // can check for those subtargets here and avoid much of the subtarget
13264 // querying in the per-vector-type lowering routines. With AVX1 we have
13265 // essentially *zero* ability to manipulate a 256-bit vector with integer
13266 // types. Since we'll use floating point types there eventually, just
13267 // immediately cast everything to a float and operate entirely in that domain.
13268 if (VT.isInteger() && !Subtarget.hasAVX2()) {
13269 int ElementBits = VT.getScalarSizeInBits();
13270 if (ElementBits < 32) {
13271 // No floating point type available; if we can't use the bit operations
13272 // for masking/blending then decompose into 128-bit vectors.
13274 lowerVectorShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable, DAG))
13276 if (SDValue V = lowerVectorShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
13278 return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
13281 MVT FpVT = MVT::getVectorVT(MVT::getFloatingPointVT(ElementBits),
13282 VT.getVectorNumElements());
13283 V1 = DAG.getBitcast(FpVT, V1);
13284 V2 = DAG.getBitcast(FpVT, V2);
13285 return DAG.getBitcast(VT, DAG.getVectorShuffle(FpVT, DL, V1, V2, Mask));
13288 switch (VT.SimpleTy) {
13290 return lowerV4F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13292 return lowerV4I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13294 return lowerV8F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13296 return lowerV8I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13298 return lowerV16I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13300 return lowerV32I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13303 llvm_unreachable("Not a valid 256-bit x86 vector type!");
13307 /// \brief Try to lower a vector shuffle as 128-bit shuffles.
13308 static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
13309 ArrayRef<int> Mask, SDValue V1,
13310 SDValue V2, SelectionDAG &DAG) {
13311 assert(VT.getScalarSizeInBits() == 64 &&
13312 "Unexpected element type size for 128bit shuffle.");
13314 // Handling a 256-bit vector requires VLX, and for that case the
13315 // lowerV2X128VectorShuffle() function is most probably a better solution.
13316 assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
13318 SmallVector<int, 4> WidenedMask;
13319 if (!canWidenShuffleElements(Mask, WidenedMask))
13322 // Check for patterns which can be matched with a single insert of a 256-bit subvector.
13324 bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
13325 {0, 1, 2, 3, 0, 1, 2, 3});
13326 if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
13327 {0, 1, 2, 3, 8, 9, 10, 11})) {
13328 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
13329 SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
13330 DAG.getIntPtrConstant(0, DL));
13331 SDValue HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
13332 OnlyUsesV1 ? V1 : V2,
13333 DAG.getIntPtrConstant(0, DL));
13334 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
13337 assert(WidenedMask.size() == 4);
13339 // See if this is an insertion of the lower 128-bits of V2 into V1.
13340 bool IsInsert = true;
13342 for (int i = 0; i < 4; ++i) {
13343 assert(WidenedMask[i] >= -1);
13344 if (WidenedMask[i] < 0)
13347 // Make sure all V1 subvectors are in place.
13348 if (WidenedMask[i] < 4) {
13349 if (WidenedMask[i] != i) {
13354 // Make sure we only have a single V2 index and it's the lowest 128 bits.
13355 if (V2Index >= 0 || WidenedMask[i] != 4) {
13362 if (IsInsert && V2Index >= 0) {
13363 MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 2);
13364 SDValue Subvec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V2,
13365 DAG.getIntPtrConstant(0, DL));
13366 return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
13369 // Try to lower to vshuf64x2/vshuf32x4.
13370 SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
13371 unsigned PermMask = 0;
13372 // Ensure elements came from the same Op.
13373 for (int i = 0; i < 4; ++i) {
13374 assert(WidenedMask[i] >= -1);
13375 if (WidenedMask[i] < 0)
13378 SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
13379 unsigned OpIndex = i / 2;
13380 if (Ops[OpIndex].isUndef())
13382 else if (Ops[OpIndex] != Op)
13385 // Convert the 128-bit shuffle mask selection values into 128-bit selection
13386 // bits defined by a vshuf64x2 instruction's immediate control byte.
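// For example, a widened mask <0, 2, 5, 7> takes 128-bit chunks 0 and 2 of V1
// for the low half and chunks 1 and 3 of V2 for the high half, giving
// PermMask = 0xd8.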
13387 PermMask |= (WidenedMask[i] % 4) << (i * 2);
13390 return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
13391 DAG.getConstant(PermMask, DL, MVT::i8));
13394 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
13395 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13396 const APInt &Zeroable,
13397 SDValue V1, SDValue V2,
13398 const X86Subtarget &Subtarget,
13399 SelectionDAG &DAG) {
13400 assert(V1.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13401 assert(V2.getSimpleValueType() == MVT::v8f64 && "Bad operand type!");
13402 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13404 if (V2.isUndef()) {
13405 // Use low duplicate instructions for masks that match their pattern.
13406 if (isShuffleEquivalent(V1, V2, Mask, {0, 0, 2, 2, 4, 4, 6, 6}))
13407 return DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v8f64, V1);
13409 if (!is128BitLaneCrossingShuffleMask(MVT::v8f64, Mask)) {
13410 // Non-half-crossing single input shuffles can be lowered with an
13411 // interleaved permutation.
13412 unsigned VPERMILPMask = (Mask[0] == 1) | ((Mask[1] == 1) << 1) |
13413 ((Mask[2] == 3) << 2) | ((Mask[3] == 3) << 3) |
13414 ((Mask[4] == 5) << 4) | ((Mask[5] == 5) << 5) |
13415 ((Mask[6] == 7) << 6) | ((Mask[7] == 7) << 7);
13416 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v8f64, V1,
13417 DAG.getConstant(VPERMILPMask, DL, MVT::i8));
13420 SmallVector<int, 4> RepeatedMask;
13421 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask))
13422 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8f64, V1,
13423 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13426 if (SDValue Shuf128 =
13427 lowerV4X128VectorShuffle(DL, MVT::v8f64, Mask, V1, V2, DAG))
13430 if (SDValue Unpck =
13431 lowerVectorShuffleWithUNPCK(DL, MVT::v8f64, Mask, V1, V2, DAG))
13434 // Check if the blend happens to exactly fit that of SHUFPD.
13436 lowerVectorShuffleWithSHUFPD(DL, MVT::v8f64, Mask, V1, V2, DAG))
13439 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1,
13440 V2, DAG, Subtarget))
13443 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
13444 Zeroable, Subtarget, DAG))
13447 return lowerVectorShuffleWithPERMV(DL, MVT::v8f64, Mask, V1, V2, DAG);
13450 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
13451 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13452 const APInt &Zeroable,
13453 SDValue V1, SDValue V2,
13454 const X86Subtarget &Subtarget,
13455 SelectionDAG &DAG) {
13456 assert(V1.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13457 assert(V2.getSimpleValueType() == MVT::v16f32 && "Bad operand type!");
13458 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13460 // If the shuffle mask is repeated in each 128-bit lane, we have many more
13461 // options to efficiently lower the shuffle.
13462 SmallVector<int, 4> RepeatedMask;
13463 if (is128BitLaneRepeatedShuffleMask(MVT::v16f32, Mask, RepeatedMask)) {
13464 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13466 // Use even/odd duplicate instructions for masks that match their pattern.
13467 if (isShuffleEquivalent(V1, V2, RepeatedMask, {0, 0, 2, 2}))
13468 return DAG.getNode(X86ISD::MOVSLDUP, DL, MVT::v16f32, V1);
13469 if (isShuffleEquivalent(V1, V2, RepeatedMask, {1, 1, 3, 3}))
13470 return DAG.getNode(X86ISD::MOVSHDUP, DL, MVT::v16f32, V1);
13473 return DAG.getNode(X86ISD::VPERMILPI, DL, MVT::v16f32, V1,
13474 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13476 // Use dedicated unpack instructions for masks that match their pattern.
13477 if (SDValue Unpck =
13478 lowerVectorShuffleWithUNPCK(DL, MVT::v16f32, Mask, V1, V2, DAG))
13481 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16f32, V1, V2, Mask,
13482 Zeroable, Subtarget, DAG))
13485 // Otherwise, fall back to a SHUFPS sequence.
13486 return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
13488 // If we have AVX512F support, we can use VEXPAND.
13489 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
13490 V1, V2, DAG, Subtarget))
13493 return lowerVectorShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, DAG);
13496 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
13497 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13498 const APInt &Zeroable,
13499 SDValue V1, SDValue V2,
13500 const X86Subtarget &Subtarget,
13501 SelectionDAG &DAG) {
13502 assert(V1.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13503 assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
13504 assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
13506 if (SDValue Shuf128 =
13507 lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
13510 if (V2.isUndef()) {
13511 // When the shuffle is mirrored between the 128-bit lanes of the unit, we
13512 // can use lower latency instructions that will operate on all four lanes.
13514 SmallVector<int, 2> Repeated128Mask;
13515 if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
13516 SmallVector<int, 4> PSHUFDMask;
13517 scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
13518 return DAG.getBitcast(
13520 DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
13521 DAG.getBitcast(MVT::v16i32, V1),
13522 getV4X86ShuffleImm8ForMask(PSHUFDMask, DL, DAG)));
13525 SmallVector<int, 4> Repeated256Mask;
13526 if (is256BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated256Mask))
13527 return DAG.getNode(X86ISD::VPERMI, DL, MVT::v8i64, V1,
13528 getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
13531 // Try to use shift instructions.
13532 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
13533 Zeroable, Subtarget, DAG))
13536 // Try to use VALIGN.
13537 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v8i64, V1, V2,
13538 Mask, Subtarget, DAG))
13541 // Try to use PALIGNR.
13542 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i64, V1, V2,
13543 Mask, Subtarget, DAG))
13546 if (SDValue Unpck =
13547 lowerVectorShuffleWithUNPCK(DL, MVT::v8i64, Mask, V1, V2, DAG))
13549 // If we have AVX512F support, we can use VEXPAND.
13550 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1,
13551 V2, DAG, Subtarget))
13554 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
13555 Zeroable, Subtarget, DAG))
13558 return lowerVectorShuffleWithPERMV(DL, MVT::v8i64, Mask, V1, V2, DAG);
13561 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
13562 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13563 const APInt &Zeroable,
13564 SDValue V1, SDValue V2,
13565 const X86Subtarget &Subtarget,
13566 SelectionDAG &DAG) {
13567 assert(V1.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13568 assert(V2.getSimpleValueType() == MVT::v16i32 && "Bad operand type!");
13569 assert(Mask.size() == 16 && "Unexpected mask size for v16 shuffle!");
13571 // Whenever we can lower this as a zext, that instruction is strictly faster
13572 // than any alternative. It also allows us to fold memory operands into the
13573 // shuffle in many cases.
13574 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13575 DL, MVT::v16i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
13578 // If the shuffle mask is repeated in each 128-bit lane we can use more
13579 // efficient instructions that mirror the shuffles across the four 128-bit lanes.
13581 SmallVector<int, 4> RepeatedMask;
13582 bool Is128BitLaneRepeatedShuffle =
13583 is128BitLaneRepeatedShuffleMask(MVT::v16i32, Mask, RepeatedMask);
13584 if (Is128BitLaneRepeatedShuffle) {
13585 assert(RepeatedMask.size() == 4 && "Unexpected repeated mask size!");
13587 return DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, V1,
13588 getV4X86ShuffleImm8ForMask(RepeatedMask, DL, DAG));
13590 // Use dedicated unpack instructions for masks that match their pattern.
13592 lowerVectorShuffleWithUNPCK(DL, MVT::v16i32, Mask, V1, V2, DAG))
13596 // Try to use shift instructions.
13597 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i32, V1, V2, Mask,
13598 Zeroable, Subtarget, DAG))
13601 // Try to use VALIGN.
13602 if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v16i32, V1, V2,
13603 Mask, Subtarget, DAG))
13606 // Try to use byte rotation instructions.
13607 if (Subtarget.hasBWI())
13608 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13609 DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
13612 // Assume that a single SHUFPS is faster than using a permv shuffle.
13613 // If some CPU is harmed by the domain switch, we can fix it in a later pass.
13614 if (Is128BitLaneRepeatedShuffle && isSingleSHUFPSMask(RepeatedMask)) {
13615 SDValue CastV1 = DAG.getBitcast(MVT::v16f32, V1);
13616 SDValue CastV2 = DAG.getBitcast(MVT::v16f32, V2);
13617 SDValue ShufPS = lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask,
13618 CastV1, CastV2, DAG);
13619 return DAG.getBitcast(MVT::v16i32, ShufPS);
13621 // If we have AVX512F support, we can use VEXPAND.
13622 if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask,
13623 V1, V2, DAG, Subtarget))
13626 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
13627 Zeroable, Subtarget, DAG))
13629 return lowerVectorShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
13632 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
13633 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13634 const APInt &Zeroable,
13635 SDValue V1, SDValue V2,
13636 const X86Subtarget &Subtarget,
13637 SelectionDAG &DAG) {
13638 assert(V1.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13639 assert(V2.getSimpleValueType() == MVT::v32i16 && "Bad operand type!");
13640 assert(Mask.size() == 32 && "Unexpected mask size for v32 shuffle!");
13641 assert(Subtarget.hasBWI() && "We can only lower v32i16 with AVX-512-BWI!");
13643 // Whenever we can lower this as a zext, that instruction is strictly faster
13644 // than any alternative. It also allows us to fold memory operands into the
13645 // shuffle in many cases.
13646 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13647 DL, MVT::v32i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
13650 // Use dedicated unpack instructions for masks that match their pattern.
13652 lowerVectorShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
13655 // Try to use shift instructions.
13656 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
13657 Zeroable, Subtarget, DAG))
13660 // Try to use byte rotation instructions.
13661 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13662 DL, MVT::v32i16, V1, V2, Mask, Subtarget, DAG))
13665 if (V2.isUndef()) {
13666 SmallVector<int, 8> RepeatedMask;
13667 if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
13668 // As this is a single-input shuffle, the repeated mask should be
13669 // a strictly valid v8i16 mask that we can pass through to the v8i16
13670 // lowering to handle even the v32 case.
13671 return lowerV8I16GeneralSingleInputVectorShuffle(
13672 DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
13676 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
13677 Zeroable, Subtarget, DAG))
13680 return lowerVectorShuffleWithPERMV(DL, MVT::v32i16, Mask, V1, V2, DAG);
13683 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
13684 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13685 const APInt &Zeroable,
13686 SDValue V1, SDValue V2,
13687 const X86Subtarget &Subtarget,
13688 SelectionDAG &DAG) {
13689 assert(V1.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13690 assert(V2.getSimpleValueType() == MVT::v64i8 && "Bad operand type!");
13691 assert(Mask.size() == 64 && "Unexpected mask size for v64 shuffle!");
13692 assert(Subtarget.hasBWI() && "We can only lower v64i8 with AVX-512-BWI!");
13694 // Whenever we can lower this as a zext, that instruction is strictly faster
13695 // than any alternative. It also allows us to fold memory operands into the
13696 // shuffle in many cases.
13697 if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
13698 DL, MVT::v64i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
13701 // Use dedicated unpack instructions for masks that match their pattern.
13703 lowerVectorShuffleWithUNPCK(DL, MVT::v64i8, Mask, V1, V2, DAG))
13706 // Try to use shift instructions.
13707 if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v64i8, V1, V2, Mask,
13708 Zeroable, Subtarget, DAG))
13711 // Try to use byte rotation instructions.
13712 if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
13713 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13716 if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
13717 DL, MVT::v64i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
13720 // VBMI can use VPERMV/VPERMV3 byte shuffles.
13721 if (Subtarget.hasVBMI())
13722 return lowerVectorShuffleWithPERMV(DL, MVT::v64i8, Mask, V1, V2, DAG);
13724 // Try to create an in-lane repeating shuffle mask and then shuffle the
13725 // results into the target lanes.
13726 if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
13727 DL, MVT::v64i8, V1, V2, Mask, Subtarget, DAG))
13730 if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v64i8, V1, V2, Mask,
13731 Zeroable, Subtarget, DAG))
13734 // FIXME: Implement direct support for this type!
13735 return splitAndLowerVectorShuffle(DL, MVT::v64i8, V1, V2, Mask, DAG);
13738 /// \brief High-level routine to lower various 512-bit x86 vector shuffles.
13740 /// This routine either breaks down the specific type of a 512-bit x86 vector
13741 /// shuffle or splits it into two 256-bit shuffles and fuses the results back
13742 /// together based on the available instructions.
13743 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13744 MVT VT, SDValue V1, SDValue V2,
13745 const APInt &Zeroable,
13746 const X86Subtarget &Subtarget,
13747 SelectionDAG &DAG) {
13748 assert(Subtarget.hasAVX512() &&
13749 "Cannot lower 512-bit vectors w/ basic ISA!");
13751 // If we have a single input to the zero element, insert that into V1 if we
13752 // can do so cheaply.
13753 int NumElts = Mask.size();
13754 int NumV2Elements = count_if(Mask, [NumElts](int M) { return M >= NumElts; });
13756 if (NumV2Elements == 1 && Mask[0] >= NumElts)
13757 if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
13758 DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
13761 // Check for being able to broadcast a single element.
13762 if (SDValue Broadcast =
13763 lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
13766 // Dispatch to each element type for lowering. If we don't have support for
13767 // specific element type shuffles at 512 bits, immediately split them and
13768 // lower them. Each lowering routine of a given type is allowed to assume that
13769 // the requisite ISA extensions for that element type are available.
13770 switch (VT.SimpleTy) {
13772 return lowerV8F64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13774 return lowerV16F32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13776 return lowerV8I64VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13778 return lowerV16I32VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13780 return lowerV32I16VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13782 return lowerV64I8VectorShuffle(DL, Mask, Zeroable, V1, V2, Subtarget, DAG);
13785 llvm_unreachable("Not a valid 512-bit x86 vector type!");
13789 // Lower vXi1 vector shuffles.
13790 // There is no dedicated instruction on AVX-512 that shuffles the masks.
13791 // The only way to shuffle bits is to sign-extend the mask vector to a SIMD
13792 // vector, shuffle and then truncate it back.
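// For example, a v16i1 shuffle is sign-extended to v16i32, shuffled as a
// normal integer vector, and then truncated (or converted with CVT2MASK when
// the subtarget supports it) back to v16i1.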
13793 static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
13794 MVT VT, SDValue V1, SDValue V2,
13795 const X86Subtarget &Subtarget,
13796 SelectionDAG &DAG) {
13797 assert(Subtarget.hasAVX512() &&
13798 "Cannot lower 512-bit vectors w/o basic ISA!");
13800 switch (VT.SimpleTy) {
13802 llvm_unreachable("Expected a vector of i1 elements");
13804 ExtVT = MVT::v2i64;
13807 ExtVT = MVT::v4i32;
13810 ExtVT = MVT::v8i64; // Take 512-bit type, more shuffles on KNL
13813 ExtVT = MVT::v16i32;
13816 ExtVT = MVT::v32i16;
13819 ExtVT = MVT::v64i8;
13823 if (ISD::isBuildVectorAllZeros(V1.getNode()))
13824 V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13825 else if (ISD::isBuildVectorAllOnes(V1.getNode()))
13826 V1 = getOnesVector(ExtVT, DAG, DL);
13828 V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
13831 V2 = DAG.getUNDEF(ExtVT);
13832 else if (ISD::isBuildVectorAllZeros(V2.getNode()))
13833 V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
13834 else if (ISD::isBuildVectorAllOnes(V2.getNode()))
13835 V2 = getOnesVector(ExtVT, DAG, DL);
13837 V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
13839 SDValue Shuffle = DAG.getVectorShuffle(ExtVT, DL, V1, V2, Mask);
13840 // Since i1 was sign-extended, we can use X86ISD::CVT2MASK.
13841 int NumElems = VT.getVectorNumElements();
13842 if ((Subtarget.hasBWI() && (NumElems >= 32)) ||
13843 (Subtarget.hasDQI() && (NumElems < 32)))
13844 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, Shuffle);
13846 return DAG.getNode(ISD::TRUNCATE, DL, VT, Shuffle);
13849 /// Helper function that returns true if the shuffle mask should be
13850 /// commuted to improve canonicalization.
13851 static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
13852 int NumElements = Mask.size();
13854 int NumV1Elements = 0, NumV2Elements = 0;
13858 else if (M < NumElements)
13863 // Commute the shuffle as needed such that more elements come from V1 than
13864 // V2. This allows us to match the shuffle pattern strictly on how many
13865 // elements come from V1 without handling the symmetric cases.
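// For example (illustrative), a v4 mask <4, 5, 6, 3> takes three elements from
// V2 and only one from V1, so the shuffle is commuted to mask <0, 1, 2, 7>
// with the two inputs swapped.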
13866 if (NumV2Elements > NumV1Elements)
13869 assert(NumV1Elements > 0 && "No V1 indices");
13871 if (NumV2Elements == 0)
13874 // When the number of V1 and V2 elements is the same, try to minimize the
13875 // number of uses of V2 in the low half of the vector. When that is tied,
13876 // ensure that the sum of indices for V1 is equal to or lower than the sum of
13877 // indices for V2. When those are equal, try to ensure that the number of odd
13878 // indices for V1 is lower than the number of odd indices for V2.
13879 if (NumV1Elements == NumV2Elements) {
13880 int LowV1Elements = 0, LowV2Elements = 0;
13881 for (int M : Mask.slice(0, NumElements / 2))
13882 if (M >= NumElements)
13886 if (LowV2Elements > LowV1Elements)
13888 if (LowV2Elements == LowV1Elements) {
13889 int SumV1Indices = 0, SumV2Indices = 0;
13890 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13891 if (Mask[i] >= NumElements)
13893 else if (Mask[i] >= 0)
13895 if (SumV2Indices < SumV1Indices)
13897 if (SumV2Indices == SumV1Indices) {
13898 int NumV1OddIndices = 0, NumV2OddIndices = 0;
13899 for (int i = 0, Size = Mask.size(); i < Size; ++i)
13900 if (Mask[i] >= NumElements)
13901 NumV2OddIndices += i % 2;
13902 else if (Mask[i] >= 0)
13903 NumV1OddIndices += i % 2;
13904 if (NumV2OddIndices < NumV1OddIndices)
13913 /// \brief Top-level lowering for x86 vector shuffles.
13915 /// This handles decomposition, canonicalization, and lowering of all x86
13916 /// vector shuffles. Most of the specific lowering strategies are encapsulated
13917 /// above in helper routines. The canonicalization attempts to widen shuffles
13918 /// to involve fewer lanes of wider elements, consolidate symmetric patterns
13919 /// s.t. only one of the two inputs needs to be tested, etc.
13920 static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
13921 SelectionDAG &DAG) {
13922 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
13923 ArrayRef<int> Mask = SVOp->getMask();
13924 SDValue V1 = Op.getOperand(0);
13925 SDValue V2 = Op.getOperand(1);
13926 MVT VT = Op.getSimpleValueType();
13927 int NumElements = VT.getVectorNumElements();
13929 bool Is1BitVector = (VT.getVectorElementType() == MVT::i1);
13931 assert((VT.getSizeInBits() != 64 || Is1BitVector) &&
13932 "Can't lower MMX shuffles");
13934 bool V1IsUndef = V1.isUndef();
13935 bool V2IsUndef = V2.isUndef();
13936 if (V1IsUndef && V2IsUndef)
13937 return DAG.getUNDEF(VT);
13939 // When we create a shuffle node we put the UNDEF node as the second operand,
13940 // but in some cases the first operand may be transformed to UNDEF.
13941 // In this case we should just commute the node.
13943 return DAG.getCommutedVectorShuffle(*SVOp);
13945 // Check for non-undef masks pointing at an undef vector and make the masks
13946 // undef as well. This makes it easier to match the shuffle based solely on
13950 if (M >= NumElements) {
13951 SmallVector<int, 8> NewMask(Mask.begin(), Mask.end());
13952 for (int &M : NewMask)
13953 if (M >= NumElements)
13955 return DAG.getVectorShuffle(VT, DL, V1, V2, NewMask);
13958 // Check for illegal shuffle mask element index values.
13959 int MaskUpperLimit = Mask.size() * (V2IsUndef ? 1 : 2); (void)MaskUpperLimit;
13960 assert(llvm::all_of(Mask,
13961 [&](int M) { return -1 <= M && M < MaskUpperLimit; }) &&
13962 "Out of bounds shuffle index");
13964 // We actually see shuffles that are entirely re-arrangements of a set of
13965 // zero inputs. This mostly happens while decomposing complex shuffles into
13966 // simple ones. Directly lower these as a buildvector of zeros.
13967 APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
13968 if (Zeroable.isAllOnesValue())
13969 return getZeroVector(VT, Subtarget, DAG, DL);
13971 // Try to collapse shuffles into using a vector type with fewer elements but
13972 // wider element types. We cap this to not form integers or floating point
13973 // elements wider than 64 bits, but it might be interesting to form i128
13974 // integers to handle flipping the low and high halves of AVX 256-bit vectors.
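// For example (illustrative), a v4i32 shuffle with mask <0, 1, 6, 7> can be
// widened to a v2i64 shuffle with mask <0, 3>, since each consecutive pair of
// elements moves together.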
13975 SmallVector<int, 16> WidenedMask;
13976 if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
13977 canWidenShuffleElements(Mask, WidenedMask)) {
13978 MVT NewEltVT = VT.isFloatingPoint()
13979 ? MVT::getFloatingPointVT(VT.getScalarSizeInBits() * 2)
13980 : MVT::getIntegerVT(VT.getScalarSizeInBits() * 2);
13981 MVT NewVT = MVT::getVectorVT(NewEltVT, VT.getVectorNumElements() / 2);
13982 // Make sure that the new vector type is legal. For example, v2f64 isn't
13984 if (DAG.getTargetLoweringInfo().isTypeLegal(NewVT)) {
13985 V1 = DAG.getBitcast(NewVT, V1);
13986 V2 = DAG.getBitcast(NewVT, V2);
13987 return DAG.getBitcast(
13988 VT, DAG.getVectorShuffle(NewVT, DL, V1, V2, WidenedMask));
13992 // Commute the shuffle if it will improve canonicalization.
13993 if (canonicalizeShuffleMaskWithCommute(Mask))
13994 return DAG.getCommutedVectorShuffle(*SVOp);
13996 // For each vector width, delegate to a specialized lowering routine.
13997 if (VT.is128BitVector())
13998 return lower128BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14001 if (VT.is256BitVector())
14002 return lower256BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14005 if (VT.is512BitVector())
14006 return lower512BitVectorShuffle(DL, Mask, VT, V1, V2, Zeroable, Subtarget,
14010 return lower1BitVectorShuffle(DL, Mask, VT, V1, V2, Subtarget, DAG);
14012 llvm_unreachable("Unimplemented!");
14015 /// \brief Try to lower a VSELECT instruction to a vector shuffle.
14016 static SDValue lowerVSELECTtoVectorShuffle(SDValue Op,
14017 const X86Subtarget &Subtarget,
14018 SelectionDAG &DAG) {
14019 SDValue Cond = Op.getOperand(0);
14020 SDValue LHS = Op.getOperand(1);
14021 SDValue RHS = Op.getOperand(2);
14023 MVT VT = Op.getSimpleValueType();
14025 if (!ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()))
14027 auto *CondBV = cast<BuildVectorSDNode>(Cond);
14029 // Only non-legal VSELECTs reach this lowering; convert those into generic
14030 // shuffles and re-use the shuffle lowering path for blends.
14031 SmallVector<int, 32> Mask;
14032 for (int i = 0, Size = VT.getVectorNumElements(); i < Size; ++i) {
14033 SDValue CondElt = CondBV->getOperand(i);
14035 isa<ConstantSDNode>(CondElt) ? i + (isNullConstant(CondElt) ? Size : 0)
14038 return DAG.getVectorShuffle(VT, dl, LHS, RHS, Mask);
14041 SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
14042 // A vselect where all conditions and data are constants can be optimized into
14043 // a single vector load by SelectionDAGLegalize::ExpandBUILD_VECTOR().
14044 if (ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(0).getNode()) &&
14045 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(1).getNode()) &&
14046 ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
14049 // If this VSELECT has a vector of i1 as a mask, it will be directly matched
14050 // with patterns on the mask registers on AVX-512.
14051 if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
14054 // Try to lower this to a blend-style vector shuffle. This can handle all
14055 // constant condition cases.
14056 if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
14059 // Variable blends are only legal from SSE4.1 onward.
14060 if (!Subtarget.hasSSE41())
14064 MVT VT = Op.getSimpleValueType();
14066 // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
14067 // into an i1 condition so that we can use the mask-based 512-bit blend
14069 if (VT.getSizeInBits() == 512) {
14070 SDValue Cond = Op.getOperand(0);
14071 // The vNi1 condition case should be handled above as it can be trivially
14073 assert(Cond.getValueType().getScalarSizeInBits() ==
14074 VT.getScalarSizeInBits() &&
14075 "Should have a size-matched integer condition!");
14076 // Build a mask by testing the condition against itself (tests for zero).
14077 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
14078 SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
14079 // Now return a new VSELECT using the mask.
14080 return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
14083 // Only some types will be legal on some subtargets. If we can emit a legal
14084 // VSELECT-matching blend, return Op, but if we need to expand, return
14086 switch (VT.SimpleTy) {
14088 // Most of the vector types have blends past SSE4.1.
14092 // The byte blends for AVX vectors were introduced only in AVX2.
14093 if (Subtarget.hasAVX2())
14100 // AVX-512 BWI and VLX features support VSELECT with i16 elements.
14101 if (Subtarget.hasBWI() && Subtarget.hasVLX())
14104 // FIXME: We should custom lower this by fixing the condition and using i8
14110 static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
14111 MVT VT = Op.getSimpleValueType();
14114 if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
14117 if (VT.getSizeInBits() == 8) {
14118 SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
14119 Op.getOperand(0), Op.getOperand(1));
14120 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14121 DAG.getValueType(VT));
14122 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14125 if (VT == MVT::f32) {
14126 // EXTRACTPS outputs to a GPR32 register which will require a movd to copy
14127 // the result back to FR32 register. It's only worth matching if the
14128 // result has a single use which is a store or a bitcast to i32. And in
14129 // the case of a store, it's not worth it if the index is a constant 0,
14130 // because a MOVSSmr can be used instead, which is smaller and faster.
14131 if (!Op.hasOneUse())
14133 SDNode *User = *Op.getNode()->use_begin();
14134 if ((User->getOpcode() != ISD::STORE ||
14135 isNullConstant(Op.getOperand(1))) &&
14136 (User->getOpcode() != ISD::BITCAST ||
14137 User->getValueType(0) != MVT::i32))
14139 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14140 DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
14142 return DAG.getBitcast(MVT::f32, Extract);
14145 if (VT == MVT::i32 || VT == MVT::i64) {
14146 // ExtractPS/pextrq works with constant index.
14147 if (isa<ConstantSDNode>(Op.getOperand(1)))
14154 /// Extract one bit from a mask vector, like v16i1 or v8i1.
14155 /// AVX-512 feature.
14157 X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
14158 SDValue Vec = Op.getOperand(0);
14160 MVT VecVT = Vec.getSimpleValueType();
14161 SDValue Idx = Op.getOperand(1);
14162 MVT EltVT = Op.getSimpleValueType();
14164 assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
14165 "Unexpected vector type in ExtractBitFromMaskVector");
14167 // A variable index can't be handled in mask registers;
14168 // extend the vector to VR512/128.
14169 if (!isa<ConstantSDNode>(Idx)) {
14170 unsigned NumElts = VecVT.getVectorNumElements();
14171 // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
14172 // than extending to 128/256-bit.
14173 unsigned VecSize = (NumElts <= 4 ? 128 : 512);
14174 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
14175 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
14176 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
14177 ExtVT.getVectorElementType(), Ext, Idx);
14178 return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
14181 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14182 if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
14183 (VecVT.getVectorNumElements() < 8)) {
14184 // Use kshiftlw/rw instruction.
14185 VecVT = MVT::v16i1;
14186 Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
14187 DAG.getUNDEF(VecVT),
14189 DAG.getIntPtrConstant(0, dl));
14191 unsigned MaxShift = VecVT.getVectorNumElements() - 1;
14192 if (MaxShift != IdxVal)
14193 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14194 DAG.getConstant(MaxShift - IdxVal, dl, MVT::i8));
14195 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14196 DAG.getConstant(MaxShift, dl, MVT::i8));
14197 return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
14198 DAG.getIntPtrConstant(0, dl));
14202 X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
14203 SelectionDAG &DAG) const {
14205 SDValue Vec = Op.getOperand(0);
14206 MVT VecVT = Vec.getSimpleValueType();
14207 SDValue Idx = Op.getOperand(1);
14209 if (VecVT.getVectorElementType() == MVT::i1)
14210 return ExtractBitFromMaskVector(Op, DAG);
14212 if (!isa<ConstantSDNode>(Idx)) {
14213 // It's more profitable to go through memory (1 cycle throughput)
14214 // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
14215 // The IACA tool was used to get the performance estimates
14216 // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
14218 // example : extractelement <16 x i8> %a, i32 %i
14220 // Block Throughput: 3.00 Cycles
14221 // Throughput Bottleneck: Port5
14223 // | Num Of | Ports pressure in cycles | |
14224 // | Uops | 0 - DV | 5 | 6 | 7 | |
14225 // ---------------------------------------------
14226 // | 1 | | 1.0 | | | CP | vmovd xmm1, edi
14227 // | 1 | | 1.0 | | | CP | vpshufb xmm0, xmm0, xmm1
14228 // | 2 | 1.0 | 1.0 | | | CP | vpextrb eax, xmm0, 0x0
14229 // Total Num Of Uops: 4
14232 // Block Throughput: 1.00 Cycles
14233 // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
14235 // | | Ports pressure in cycles | |
14236 // |Uops| 1 | 2 - D |3 - D | 4 | 5 | |
14237 // ---------------------------------------------------------
14238 // |2^ | | 0.5 | 0.5 |1.0| |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
14239 // |1 |0.5| | | |0.5| | lea rax, ptr [rsp-0x18]
14240 // |1 | |0.5, 0.5|0.5, 0.5| | |CP| mov al, byte ptr [rdi+rax*1]
14241 // Total Num Of Uops: 4
14246 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14248 // If this is a 256-bit vector result, first extract the 128-bit vector and
14249 // then extract the element from the 128-bit vector.
14250 if (VecVT.is256BitVector() || VecVT.is512BitVector()) {
14251 // Get the 128-bit vector.
14252 Vec = extract128BitVector(Vec, IdxVal, DAG, dl);
14253 MVT EltVT = VecVT.getVectorElementType();
14255 unsigned ElemsPerChunk = 128 / EltVT.getSizeInBits();
14256 assert(isPowerOf2_32(ElemsPerChunk) && "Elements per chunk not power of 2");
14258 // Find IdxVal modulo ElemsPerChunk. Since ElemsPerChunk is a power of 2
14259 // this can be done with a mask.
14260 IdxVal &= ElemsPerChunk - 1;
14261 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Vec,
14262 DAG.getConstant(IdxVal, dl, MVT::i32));
14265 assert(VecVT.is128BitVector() && "Unexpected vector length");
14267 MVT VT = Op.getSimpleValueType();
14269 if (VT.getSizeInBits() == 16) {
14270 // If IdxVal is 0, it's cheaper to do a move instead of a pextrw, unless
14271 // we're going to zero extend the register or fold the store (SSE41 only).
14272 if (IdxVal == 0 && !MayFoldIntoZeroExtend(Op) &&
14273 !(Subtarget.hasSSE41() && MayFoldIntoStore(Op)))
14274 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i16,
14275 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14276 DAG.getBitcast(MVT::v4i32, Vec), Idx));
14278 // Transform it so it matches pextrw, which produces a 32-bit result.
14279 SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
14280 Op.getOperand(0), Op.getOperand(1));
14281 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
14282 DAG.getValueType(VT));
14283 return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
14286 if (Subtarget.hasSSE41())
14287 if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
14290 // TODO: We only extract a single element from v16i8; we can probably afford
14291 // to be more aggressive here before using the default approach of spilling to
14293 if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
14294 // Extract either the lowest i32 or any i16, and extract the sub-byte.
14295 int DWordIdx = IdxVal / 4;
14296 if (DWordIdx == 0) {
14297 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
14298 DAG.getBitcast(MVT::v4i32, Vec),
14299 DAG.getIntPtrConstant(DWordIdx, dl));
14300 int ShiftVal = (IdxVal % 4) * 8;
14302 Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
14303 DAG.getConstant(ShiftVal, dl, MVT::i32));
14304 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14307 int WordIdx = IdxVal / 2;
14308 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
14309 DAG.getBitcast(MVT::v8i16, Vec),
14310 DAG.getIntPtrConstant(WordIdx, dl));
14311 int ShiftVal = (IdxVal % 2) * 8;
14313 Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
14314 DAG.getConstant(ShiftVal, dl, MVT::i16));
14315 return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
14318 if (VT.getSizeInBits() == 32) {
14322 // SHUFPS the element to the lowest double word, then movss.
14323 int Mask[4] = { static_cast<int>(IdxVal), -1, -1, -1 };
14324 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14325 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14326 DAG.getIntPtrConstant(0, dl));
14329 if (VT.getSizeInBits() == 64) {
14330 // FIXME: .td only matches this for <2 x f64>, not <2 x i64> on 32b
14331 // FIXME: seems like this should be unnecessary if mov{h,l}pd were taught
14332 // to match extract_elt for f64.
14336 // UNPCKHPD the element to the lowest double word, then movsd.
14337 // Note that if the lower 64 bits of the result of the UNPCKHPD are then
14338 // stored to an f64mem, the whole operation is folded into a single MOVHPDmr.
14339 int Mask[2] = { 1, -1 };
14340 Vec = DAG.getVectorShuffle(VecVT, dl, Vec, DAG.getUNDEF(VecVT), Mask);
14341 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Vec,
14342 DAG.getIntPtrConstant(0, dl));
14348 /// Insert one bit into a mask vector, like v16i1 or v8i1.
14349 /// AVX-512 feature.
14351 X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
14353 SDValue Vec = Op.getOperand(0);
14354 SDValue Elt = Op.getOperand(1);
14355 SDValue Idx = Op.getOperand(2);
14356 MVT VecVT = Vec.getSimpleValueType();
14358 if (!isa<ConstantSDNode>(Idx)) {
14359 // Non-constant index. Extend the source and destination,
14360 // insert the element, and then truncate the result.
14361 MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
14362 MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
14363 SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
14364 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
14365 DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
14366 return DAG.getNode(ISD::TRUNCATE, dl, VecVT, ExtOp);
14369 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14370 SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
14371 unsigned NumElems = VecVT.getVectorNumElements();
14373 if (Vec.isUndef()) {
14375 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14376 DAG.getConstant(IdxVal, dl, MVT::i8));
14380 // Insertion of one bit into first position
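// Sketch for v8i1 (illustrative): compute ((EltInVec << 7) >> 7) to keep only
// bit 0 of the new element, ((Vec >> 1) << 1) to clear bit 0 of the old
// vector, and OR the two together.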
14381 if (IdxVal == 0) {
14382 // Clean top bits of vector.
14383 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14384 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14385 EltInVec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, EltInVec,
14386 DAG.getConstant(NumElems - 1, dl, MVT::i8));
14387 // Clean the first bit in source vector.
14388 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14389 DAG.getConstant(1, dl, MVT::i8));
14390 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14391 DAG.getConstant(1, dl, MVT::i8));
14393 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14395 // Insertion of one bit into last position
14396 if (IdxVal == NumElems - 1) {
14397 // Move the bit to the last position inside the vector.
14398 EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
14399 DAG.getConstant(IdxVal, dl, MVT::i8));
14400 // Clean the last bit in the source vector.
14401 Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
14402 DAG.getConstant(1, dl, MVT::i8));
14403 Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
14404 DAG.getConstant(1, dl, MVT::i8));
14406 return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
14409 // Use shuffle to insert element.
14410 SmallVector<int, 64> MaskVec(NumElems);
14411 for (unsigned i = 0; i != NumElems; ++i)
14412 MaskVec[i] = (i == IdxVal) ? NumElems : i;
14414 return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
14417 SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
14418 SelectionDAG &DAG) const {
14419 MVT VT = Op.getSimpleValueType();
14420 MVT EltVT = VT.getVectorElementType();
14421 unsigned NumElts = VT.getVectorNumElements();
14423 if (EltVT == MVT::i1)
14424 return InsertBitToMaskVector(Op, DAG);
14427 SDValue N0 = Op.getOperand(0);
14428 SDValue N1 = Op.getOperand(1);
14429 SDValue N2 = Op.getOperand(2);
14430 if (!isa<ConstantSDNode>(N2))
14432 auto *N2C = cast<ConstantSDNode>(N2);
14433 unsigned IdxVal = N2C->getZExtValue();
14435 bool IsZeroElt = X86::isZeroNode(N1);
14436 bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
14438 // If we are inserting an element, see if we can do this more efficiently with
14439 // a blend shuffle with a rematerializable vector than a costly integer
14441 if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() &&
14442 16 <= EltVT.getSizeInBits()) {
14443 SmallVector<int, 8> BlendMask;
14444 for (unsigned i = 0; i != NumElts; ++i)
14445 BlendMask.push_back(i == IdxVal ? i + NumElts : i);
14446 SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
14447 : DAG.getConstant(-1, dl, VT);
14448 return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
14451 // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
14452 // into that, and then insert the subvector back into the result.
14453 if (VT.is256BitVector() || VT.is512BitVector()) {
14454 // With a 256-bit vector, we can insert into the zero element efficiently
14455 // using a blend if we have AVX or AVX2 and the right data type.
14456 if (VT.is256BitVector() && IdxVal == 0) {
14457 // TODO: It is worthwhile to cast integer to floating point and back
14458 // and incur a domain crossing penalty if that's what we'll end up
14459 // doing anyway after extracting to a 128-bit vector.
14460 if ((Subtarget.hasAVX() && (EltVT == MVT::f64 || EltVT == MVT::f32)) ||
14461 (Subtarget.hasAVX2() && EltVT == MVT::i32)) {
14462 SDValue N1Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
14463 N2 = DAG.getIntPtrConstant(1, dl);
14464 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1Vec, N2);
14468 // Get the desired 128-bit vector chunk.
14469 SDValue V = extract128BitVector(N0, IdxVal, DAG, dl);
14471 // Insert the element into the desired chunk.
14472 unsigned NumEltsIn128 = 128 / EltVT.getSizeInBits();
14473 assert(isPowerOf2_32(NumEltsIn128));
14474 // Since NumEltsIn128 is a power of 2 we can use mask instead of modulo.
14475 unsigned IdxIn128 = IdxVal & (NumEltsIn128 - 1);
14477 V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, V.getValueType(), V, N1,
14478 DAG.getConstant(IdxIn128, dl, MVT::i32));
14480 // Insert the changed part back into the bigger vector
14481 return insert128BitVector(N0, V, IdxVal, DAG, dl);
14483 assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
14485 // Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
14486 // argument. SSE41 is required for pinsrb.
14487 if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
14489 if (VT == MVT::v8i16) {
14490 assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
14491 Opc = X86ISD::PINSRW;
14493 assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
14494 assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
14495 Opc = X86ISD::PINSRB;
14498 if (N1.getValueType() != MVT::i32)
14499 N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
14500 if (N2.getValueType() != MVT::i32)
14501 N2 = DAG.getIntPtrConstant(IdxVal, dl);
14502 return DAG.getNode(Opc, dl, VT, N0, N1, N2);
14505 if (Subtarget.hasSSE41()) {
14506 if (EltVT == MVT::f32) {
14507 // Bits [7:6] of the constant are the source select. This will always be
14508 // zero here. The DAG Combiner may combine an extract_elt index into
14509 // these bits. For example (insert (extract, 3), 2) could be matched by
14510 // putting the '3' into bits [7:6] of X86ISD::INSERTPS.
14511 // Bits [5:4] of the constant are the destination select. This is the
14512 // value of the incoming immediate.
14513 // Bits [3:0] of the constant are the zero mask. The DAG Combiner may
14514 // combine either bitwise AND or insert of float 0.0 to set these bits.
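// For example (illustrative), (insert V, (extract S, 3), 2) can be matched to
// INSERTPS with immediate 0xE0: source lane 3 in bits [7:6], destination
// lane 2 in bits [5:4], and an empty zero mask in bits [3:0].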
14516 bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
14517 if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
14518 // If this is an insertion of 32-bits into the low 32-bits of
14519 // a vector, we prefer to generate a blend with immediate rather
14520 // than an insertps. Blends are simpler operations in hardware and so
14521 // will always have equal or better performance than insertps.
14522 // But if optimizing for size and there's a load folding opportunity,
14523 // generate insertps because blendps does not have a 32-bit memory
14525 N2 = DAG.getIntPtrConstant(1, dl);
14526 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14527 return DAG.getNode(X86ISD::BLENDI, dl, VT, N0, N1, N2);
14529 N2 = DAG.getIntPtrConstant(IdxVal << 4, dl);
14530 // Create this as a scalar to vector.
14531 N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, N1);
14532 return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
14535 // PINSR* works with constant index.
14536 if (EltVT == MVT::i32 || EltVT == MVT::i64)
14543 static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
14544 SelectionDAG &DAG) {
14546 MVT OpVT = Op.getSimpleValueType();
14548 // It's always cheaper to replace an xor+movd with xorps, and it simplifies further
14550 if (X86::isZeroNode(Op.getOperand(0)))
14551 return getZeroVector(OpVT, Subtarget, DAG, dl);
14553 // If this is a 256-bit vector result, first insert into a 128-bit
14554 // vector and then insert into the 256-bit vector.
14555 if (!OpVT.is128BitVector()) {
14556 // Insert into a 128-bit vector.
14557 unsigned SizeFactor = OpVT.getSizeInBits() / 128;
14558 MVT VT128 = MVT::getVectorVT(OpVT.getVectorElementType(),
14559 OpVT.getVectorNumElements() / SizeFactor);
14561 Op = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT128, Op.getOperand(0));
14563 // Insert the 128-bit vector.
14564 return insert128BitVector(DAG.getUNDEF(OpVT), Op, 0, DAG, dl);
14566 assert(OpVT.is128BitVector() && "Expected an SSE type!");
14568 // Pass through a v4i32 SCALAR_TO_VECTOR as that's what we use in tblgen.
14569 if (OpVT == MVT::v4i32)
14572 SDValue AnyExt = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Op.getOperand(0));
14573 return DAG.getBitcast(
14574 OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
14577 // Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
14578 // a simple subregister reference or explicit instructions to grab
14579 // upper bits of a vector.
14580 static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14581 SelectionDAG &DAG) {
14582 assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
14585 SDValue In = Op.getOperand(0);
14586 SDValue Idx = Op.getOperand(1);
14587 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
14588 MVT ResVT = Op.getSimpleValueType();
14590 // When v1i1 is legal, a scalarization of a vselect with a vXi1 Cond
14591 // would result in: v1i1 = extract_subvector(vXi1, idx).
14592 // Lower these into extract_vector_elt, which is already selectable.
14593 if (ResVT == MVT::v1i1) {
14594 assert(Subtarget.hasAVX512() &&
14595 "Boolean EXTRACT_SUBVECTOR requires AVX512");
14597 MVT EltVT = ResVT.getVectorElementType();
14598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
14600 (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
14601 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
14602 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
14605 assert((In.getSimpleValueType().is256BitVector() ||
14606 In.getSimpleValueType().is512BitVector()) &&
14607 "Can only extract from 256-bit or 512-bit vectors");
14609 // If the input is a buildvector just emit a smaller one.
14610 unsigned ElemsPerChunk = ResVT.getVectorNumElements();
14611 if (In.getOpcode() == ISD::BUILD_VECTOR)
14612 return DAG.getBuildVector(
14613 ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
14615 // Everything else is legal.
14619 // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
14620 // simple superregister reference or explicit instructions to insert
14621 // the upper bits of a vector.
14622 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
14623 SelectionDAG &DAG) {
14624 assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
14626 return insert1BitVector(Op, DAG, Subtarget);
14629 // Returns the appropriate wrapper opcode for a global reference.
14630 unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
14631 // References to absolute symbols are never PC-relative.
14632 if (GV && GV->isAbsoluteSymbolRef())
14633 return X86ISD::Wrapper;
14635 CodeModel::Model M = getTargetMachine().getCodeModel();
14636 if (Subtarget.isPICStyleRIPRel() &&
14637 (M == CodeModel::Small || M == CodeModel::Kernel))
14638 return X86ISD::WrapperRIP;
14640 return X86ISD::Wrapper;
14643 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
14644 // their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
14645 // one of the above-mentioned nodes. It has to be wrapped because otherwise
14646 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
14647 // be used to form an addressing mode. These wrapped nodes will be selected
14650 X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
14651 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
14653 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14654 // global base reg.
14655 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14657 auto PtrVT = getPointerTy(DAG.getDataLayout());
14658 SDValue Result = DAG.getTargetConstantPool(
14659 CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
14661 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14662 // With PIC, the address is actually $g + Offset.
14665 DAG.getNode(ISD::ADD, DL, PtrVT,
14666 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14672 SDValue X86TargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
14673 JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
14675 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14676 // global base reg.
14677 unsigned char OpFlag = Subtarget.classifyLocalReference(nullptr);
14679 auto PtrVT = getPointerTy(DAG.getDataLayout());
14680 SDValue Result = DAG.getTargetJumpTable(JT->getIndex(), PtrVT, OpFlag);
14682 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14684 // With PIC, the address is actually $g + Offset.
14687 DAG.getNode(ISD::ADD, DL, PtrVT,
14688 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14694 X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
14695 const char *Sym = cast<ExternalSymbolSDNode>(Op)->getSymbol();
14697 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14698 // global base reg.
14699 const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
14700 unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
14702 auto PtrVT = getPointerTy(DAG.getDataLayout());
14703 SDValue Result = DAG.getTargetExternalSymbol(Sym, PtrVT, OpFlag);
14706 Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
14708 // With PIC, the address is actually $g + Offset.
14709 if (isPositionIndependent() && !Subtarget.is64Bit()) {
14711 DAG.getNode(ISD::ADD, DL, PtrVT,
14712 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), Result);
14715 // For symbols that require a load from a stub to get the address, emit the
14717 if (isGlobalStubReference(OpFlag))
14718 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
14719 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14725 X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
14726 // Create the TargetBlockAddressAddress node.
14727 unsigned char OpFlags =
14728 Subtarget.classifyBlockAddressReference();
14729 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
14730 int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
14732 auto PtrVT = getPointerTy(DAG.getDataLayout());
14733 SDValue Result = DAG.getTargetBlockAddress(BA, PtrVT, Offset, OpFlags);
14734 Result = DAG.getNode(getGlobalWrapperKind(), dl, PtrVT, Result);
14736 // With PIC, the address is actually $g + Offset.
14737 if (isGlobalRelativeToPICBase(OpFlags)) {
14738 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14739 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14745 SDValue X86TargetLowering::LowerGlobalAddress(const GlobalValue *GV,
14746 const SDLoc &dl, int64_t Offset,
14747 SelectionDAG &DAG) const {
14748 // Create the TargetGlobalAddress node, folding in the constant
14749 // offset if it is legal.
14750 unsigned char OpFlags = Subtarget.classifyGlobalReference(GV);
14751 CodeModel::Model M = DAG.getTarget().getCodeModel();
14752 auto PtrVT = getPointerTy(DAG.getDataLayout());
14754 if (OpFlags == X86II::MO_NO_FLAG &&
14755 X86::isOffsetSuitableForCodeModel(Offset, M)) {
14756 // A direct static reference to a global.
14757 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, Offset);
14760 Result = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, OpFlags);
14763 Result = DAG.getNode(getGlobalWrapperKind(GV), dl, PtrVT, Result);
14765 // With PIC, the address is actually $g + Offset.
14766 if (isGlobalRelativeToPICBase(OpFlags)) {
14767 Result = DAG.getNode(ISD::ADD, dl, PtrVT,
14768 DAG.getNode(X86ISD::GlobalBaseReg, dl, PtrVT), Result);
14771 // For globals that require a load from a stub to get the address, emit the
14773 if (isGlobalStubReference(OpFlags))
14774 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
14775 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14777 // If there was a non-zero offset that we didn't fold, create an explicit
14778 // addition for it.
14780 Result = DAG.getNode(ISD::ADD, dl, PtrVT, Result,
14781 DAG.getConstant(Offset, dl, PtrVT));
14787 X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
14788 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
14789 int64_t Offset = cast<GlobalAddressSDNode>(Op)->getOffset();
14790 return LowerGlobalAddress(GV, SDLoc(Op), Offset, DAG);
14794 GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA,
14795 SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg,
14796 unsigned char OperandFlags, bool LocalDynamic = false) {
14797 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
14798 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14800 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14801 GA->getValueType(0),
14805 X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR
14809 SDValue Ops[] = { Chain, TGA, *InFlag };
14810 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14812 SDValue Ops[] = { Chain, TGA };
14813 Chain = DAG.getNode(CallType, dl, NodeTys, Ops);
14816 // TLSADDR will be codegen'ed as a call. Inform MFI that the function has calls.
14817 MFI.setAdjustsStack(true);
14818 MFI.setHasCalls(true);
14820 SDValue Flag = Chain.getValue(1);
14821 return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
14824 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 32 bit
14826 LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14829 SDLoc dl(GA); // ? function entry point might be better
14830 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14831 DAG.getNode(X86ISD::GlobalBaseReg,
14832 SDLoc(), PtrVT), InFlag);
14833 InFlag = Chain.getValue(1);
14835 return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
14838 // Lower ISD::GlobalTLSAddress using the "general dynamic" model, 64 bit
14840 LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14842 return GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT,
14843 X86::RAX, X86II::MO_TLSGD);
14846 static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA,
14852 // Get the start address of the TLS block for this module.
14853 X86MachineFunctionInfo *MFI = DAG.getMachineFunction()
14854 .getInfo<X86MachineFunctionInfo>();
14855 MFI->incNumLocalDynamicTLSAccesses();
14859 Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, nullptr, PtrVT, X86::RAX,
14860 X86II::MO_TLSLD, /*LocalDynamic=*/true);
14863 SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
14864 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT), InFlag);
14865 InFlag = Chain.getValue(1);
14866 Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX,
14867 X86II::MO_TLSLDM, /*LocalDynamic=*/true);
14870 // Note: the CleanupLocalDynamicTLSPass will remove redundant computations
14874 unsigned char OperandFlags = X86II::MO_DTPOFF;
14875 unsigned WrapperKind = X86ISD::Wrapper;
14876 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
14877 GA->getValueType(0),
14878 GA->getOffset(), OperandFlags);
14879 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14881 // Add x@dtpoff with the base.
14882 return DAG.getNode(ISD::ADD, dl, PtrVT, Offset, Base);
14885 // Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model.
14886 static SDValue LowerToTLSExecModel(GlobalAddressSDNode *GA, SelectionDAG &DAG,
14887 const EVT PtrVT, TLSModel::Model model,
14888 bool is64Bit, bool isPIC) {
14891 // Get the Thread Pointer, which is %gs:0 (32-bit) or %fs:0 (64-bit).
14892 Value *Ptr = Constant::getNullValue(Type::getInt8PtrTy(*DAG.getContext(),
14893 is64Bit ? 257 : 256));
14895 SDValue ThreadPointer =
14896 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), DAG.getIntPtrConstant(0, dl),
14897 MachinePointerInfo(Ptr));
14899 unsigned char OperandFlags = 0;
14900 // Most TLS accesses are not RIP relative, even on x86-64. One exception is
14902 unsigned WrapperKind = X86ISD::Wrapper;
14903 if (model == TLSModel::LocalExec) {
14904 OperandFlags = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF;
14905 } else if (model == TLSModel::InitialExec) {
14907 OperandFlags = X86II::MO_GOTTPOFF;
14908 WrapperKind = X86ISD::WrapperRIP;
14910 OperandFlags = isPIC ? X86II::MO_GOTNTPOFF : X86II::MO_INDNTPOFF;
14913 llvm_unreachable("Unexpected model");
14916 // emit "addl x@ntpoff,%eax" (local exec)
14917 // or "addl x@indntpoff,%eax" (initial exec)
14918 // or "addl x@gotntpoff(%ebx) ,%eax" (initial exec, 32-bit pic)
14920 DAG.getTargetGlobalAddress(GA->getGlobal(), dl, GA->getValueType(0),
14921 GA->getOffset(), OperandFlags);
14922 SDValue Offset = DAG.getNode(WrapperKind, dl, PtrVT, TGA);
14924 if (model == TLSModel::InitialExec) {
14925 if (isPIC && !is64Bit) {
14926 Offset = DAG.getNode(ISD::ADD, dl, PtrVT,
14927 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14931 Offset = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Offset,
14932 MachinePointerInfo::getGOT(DAG.getMachineFunction()));
14935 // The address of the thread local variable is the sum of the thread
14936 // pointer and the offset of the variable.
14937 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
14941 X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
14943 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
14945 if (DAG.getTarget().Options.EmulatedTLS)
14946 return LowerToTLSEmulatedModel(GA, DAG);
14948 const GlobalValue *GV = GA->getGlobal();
14949 auto PtrVT = getPointerTy(DAG.getDataLayout());
14950 bool PositionIndependent = isPositionIndependent();
14952 if (Subtarget.isTargetELF()) {
14953 TLSModel::Model model = DAG.getTarget().getTLSModel(GV);
14955 case TLSModel::GeneralDynamic:
14956 if (Subtarget.is64Bit())
14957 return LowerToTLSGeneralDynamicModel64(GA, DAG, PtrVT);
14958 return LowerToTLSGeneralDynamicModel32(GA, DAG, PtrVT);
14959 case TLSModel::LocalDynamic:
14960 return LowerToTLSLocalDynamicModel(GA, DAG, PtrVT,
14961 Subtarget.is64Bit());
14962 case TLSModel::InitialExec:
14963 case TLSModel::LocalExec:
14964 return LowerToTLSExecModel(GA, DAG, PtrVT, model, Subtarget.is64Bit(),
14965 PositionIndependent);
14967 llvm_unreachable("Unknown TLS model.");
14970 if (Subtarget.isTargetDarwin()) {
14971 // Darwin only has one model of TLS. Lower to that.
14972 unsigned char OpFlag = 0;
14973 unsigned WrapperKind = Subtarget.isPICStyleRIPRel() ?
14974 X86ISD::WrapperRIP : X86ISD::Wrapper;
14976 // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
14977 // global base reg.
14978 bool PIC32 = PositionIndependent && !Subtarget.is64Bit();
14980 OpFlag = X86II::MO_TLVP_PIC_BASE;
14982 OpFlag = X86II::MO_TLVP;
14984 SDValue Result = DAG.getTargetGlobalAddress(GA->getGlobal(), DL,
14985 GA->getValueType(0),
14986 GA->getOffset(), OpFlag);
14987 SDValue Offset = DAG.getNode(WrapperKind, DL, PtrVT, Result);
14989 // With PIC32, the address is actually $g + Offset.
14991 Offset = DAG.getNode(ISD::ADD, DL, PtrVT,
14992 DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(), PtrVT),
14995 // Lowering the machine isd will make sure everything is in the right
14997 SDValue Chain = DAG.getEntryNode();
14998 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
14999 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
15000 SDValue Args[] = { Chain, Offset };
15001 Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
15002 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
15003 DAG.getIntPtrConstant(0, DL, true),
15004 Chain.getValue(1), DL);
15006 // TLSCALL will be codegen'ed as a call. Inform MFI that the function has calls.
15007 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
15008 MFI.setAdjustsStack(true);
15010 // And our return value (tls address) is in the standard call return value
15012 unsigned Reg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
15013 return DAG.getCopyFromReg(Chain, DL, Reg, PtrVT, Chain.getValue(1));
15016 if (Subtarget.isTargetKnownWindowsMSVC() ||
15017 Subtarget.isTargetWindowsItanium() ||
15018 Subtarget.isTargetWindowsGNU()) {
15019 // Just use the implicit TLS architecture
15020 // Need to generate something similar to:
15021 // mov rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
15023 // mov ecx, dword [rel _tls_index]: Load index (from C runtime)
15024 // mov rcx, qword [rdx+rcx*8]
15025 // mov eax, .tls$:tlsvar
15026 // [rax+rcx] contains the address
15027 // Windows 64bit: gs:0x58
15028 // Windows 32bit: fs:__tls_array
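// Put differently (illustrative), in the general case the address is
//   ThreadLocalStoragePointer[_tls_index] + <offset of the variable in .tls>
// where ThreadLocalStoragePointer is loaded from gs:0x58 (64-bit) or
// fs:__tls_array (32-bit).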
15031 SDValue Chain = DAG.getEntryNode();
15033 // Get the Thread Pointer, which is %fs:__tls_array (32-bit) or
15034 // %gs:0x58 (64-bit). On MinGW, __tls_array is not available, so directly
15035 // use its literal value of 0x2C.
15036 Value *Ptr = Constant::getNullValue(Subtarget.is64Bit()
15037 ? Type::getInt8PtrTy(*DAG.getContext(),
15039 : Type::getInt32PtrTy(*DAG.getContext(),
15042 SDValue TlsArray = Subtarget.is64Bit()
15043 ? DAG.getIntPtrConstant(0x58, dl)
15044 : (Subtarget.isTargetWindowsGNU()
15045 ? DAG.getIntPtrConstant(0x2C, dl)
15046 : DAG.getExternalSymbol("_tls_array", PtrVT));
15048 SDValue ThreadPointer =
15049 DAG.getLoad(PtrVT, dl, Chain, TlsArray, MachinePointerInfo(Ptr));
15052 if (GV->getThreadLocalMode() == GlobalVariable::LocalExecTLSModel) {
15053 res = ThreadPointer;
15055 // Load the _tls_index variable
15056 SDValue IDX = DAG.getExternalSymbol("_tls_index", PtrVT);
15057 if (Subtarget.is64Bit())
15058 IDX = DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain, IDX,
15059 MachinePointerInfo(), MVT::i32);
15061 IDX = DAG.getLoad(PtrVT, dl, Chain, IDX, MachinePointerInfo());
15063 auto &DL = DAG.getDataLayout();
15065 DAG.getConstant(Log2_64_Ceil(DL.getPointerSize()), dl, PtrVT);
15066 IDX = DAG.getNode(ISD::SHL, dl, PtrVT, IDX, Scale);
15068 res = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, IDX);
15071 res = DAG.getLoad(PtrVT, dl, Chain, res, MachinePointerInfo());
15073 // Get the offset of start of .tls section
15074 SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
15075 GA->getValueType(0),
15076 GA->getOffset(), X86II::MO_SECREL);
15077 SDValue Offset = DAG.getNode(X86ISD::Wrapper, dl, PtrVT, TGA);
15079 // The address of the thread local variable is the sum of the thread
15080 // pointer and the offset of the variable.
15081 return DAG.getNode(ISD::ADD, dl, PtrVT, res, Offset);
15084 llvm_unreachable("TLS not implemented for this target.");
15087 /// Lower SRA_PARTS and friends, which return two i32 values
15088 /// and take a 2 x i32 value to shift plus a shift amount.
15089 static SDValue LowerShiftParts(SDValue Op, SelectionDAG &DAG) {
15090 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
15091 MVT VT = Op.getSimpleValueType();
15092 unsigned VTBits = VT.getSizeInBits();
15094 bool isSRA = Op.getOpcode() == ISD::SRA_PARTS;
15095 SDValue ShOpLo = Op.getOperand(0);
15096 SDValue ShOpHi = Op.getOperand(1);
15097 SDValue ShAmt = Op.getOperand(2);
15098 // X86ISD::SHLD and X86ISD::SHRD have defined overflow behavior but the
15099 // generic ISD nodes haven't. Insert an AND to be safe; it's optimized away
15101 SDValue SafeShAmt = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15102 DAG.getConstant(VTBits - 1, dl, MVT::i8));
15103 SDValue Tmp1 = isSRA ? DAG.getNode(ISD::SRA, dl, VT, ShOpHi,
15104 DAG.getConstant(VTBits - 1, dl, MVT::i8))
15105 : DAG.getConstant(0, dl, VT);
15107 SDValue Tmp2, Tmp3;
15108 if (Op.getOpcode() == ISD::SHL_PARTS) {
15109 Tmp2 = DAG.getNode(X86ISD::SHLD, dl, VT, ShOpHi, ShOpLo, ShAmt);
15110 Tmp3 = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, SafeShAmt);
15112 Tmp2 = DAG.getNode(X86ISD::SHRD, dl, VT, ShOpLo, ShOpHi, ShAmt);
15113 Tmp3 = DAG.getNode(isSRA ? ISD::SRA : ISD::SRL, dl, VT, ShOpHi, SafeShAmt);
15116 // If the shift amount is larger than or equal to the width of a part, we can't
15117 // rely on the results of shld/shrd. Insert a test and select the appropriate
15118 // values for large shift amounts.
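// For example (illustrative), a 64-bit SHL by 40 on a 32-bit target sets bit 5
// of the shift amount, so the CMOVs below select Lo = 0 and
// Hi = ShOpLo << (40 & 31) = ShOpLo << 8 rather than the SHLD-based results.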
15119 SDValue AndNode = DAG.getNode(ISD::AND, dl, MVT::i8, ShAmt,
15120 DAG.getConstant(VTBits, dl, MVT::i8));
15121 SDValue Cond = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
15122 AndNode, DAG.getConstant(0, dl, MVT::i8));
15125 SDValue CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
15126 SDValue Ops0[4] = { Tmp2, Tmp3, CC, Cond };
15127 SDValue Ops1[4] = { Tmp3, Tmp1, CC, Cond };
15129 if (Op.getOpcode() == ISD::SHL_PARTS) {
15130 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15131 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15133 Lo = DAG.getNode(X86ISD::CMOV, dl, VT, Ops0);
15134 Hi = DAG.getNode(X86ISD::CMOV, dl, VT, Ops1);
15137 SDValue Ops[2] = { Lo, Hi };
15138 return DAG.getMergeValues(Ops, dl);
15141 SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
15142 SelectionDAG &DAG) const {
15143 SDValue Src = Op.getOperand(0);
15144 MVT SrcVT = Src.getSimpleValueType();
15145 MVT VT = Op.getSimpleValueType();
15148 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15149 if (SrcVT.isVector()) {
15150 if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
15151 return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
15152 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
15153 DAG.getUNDEF(SrcVT)));
15155 if (SrcVT.getVectorElementType() == MVT::i1) {
15156 if (SrcVT == MVT::v2i1 && TLI.isTypeLegal(SrcVT))
15157 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15158 DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v2i64, Src));
15159 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15160 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15161 DAG.getNode(ISD::SIGN_EXTEND, dl, IntegerVT, Src));
15166 assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
15167 "Unknown SINT_TO_FP to lower!");
15169 // These are really Legal; return the operand so the caller accepts it as
15171 if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(Op.getValueType()))
15173 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15174 Subtarget.is64Bit()) {
15178 SDValue ValueToStore = Op.getOperand(0);
15179 if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(Op.getValueType()) &&
15180 !Subtarget.is64Bit())
15181 // Bitcasting to f64 here allows us to do a single 64-bit store from
15182 // an SSE register, avoiding the store forwarding penalty that would come
15183 // with two 32-bit stores.
15184 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15186 unsigned Size = SrcVT.getSizeInBits()/8;
15187 MachineFunction &MF = DAG.getMachineFunction();
15188 auto PtrVT = getPointerTy(MF.getDataLayout());
15189 int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
15190 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15191 SDValue Chain = DAG.getStore(
15192 DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15193 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15194 return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
15197 SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
15199 SelectionDAG &DAG) const {
15203 bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
15205 Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
15207 Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
15209 unsigned ByteSize = SrcVT.getSizeInBits()/8;
15211 FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
15212 MachineMemOperand *MMO;
15214 int SSFI = FI->getIndex();
15215 MMO = DAG.getMachineFunction().getMachineMemOperand(
15216 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15217 MachineMemOperand::MOLoad, ByteSize, ByteSize);
15219 MMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
15220 StackSlot = StackSlot.getOperand(1);
15222 SDValue Ops[] = { Chain, StackSlot, DAG.getValueType(SrcVT) };
15223 SDValue Result = DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG :
15225 Tys, Ops, SrcVT, MMO);
15228 Chain = Result.getValue(1);
15229 SDValue InFlag = Result.getValue(2);
15231 // FIXME: Currently the FST is flagged to the FILD_FLAG. This
15232 // shouldn't be necessary except that RFP cannot be live across
15233 // multiple blocks. When stackifier is fixed, they can be uncoupled.
15234 MachineFunction &MF = DAG.getMachineFunction();
15235 unsigned SSFISize = Op.getValueSizeInBits()/8;
15236 int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
15237 auto PtrVT = getPointerTy(MF.getDataLayout());
15238 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15239 Tys = DAG.getVTList(MVT::Other);
15241 Chain, Result, StackSlot, DAG.getValueType(Op.getValueType()), InFlag
15243 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15244 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15245 MachineMemOperand::MOStore, SSFISize, SSFISize);
15247 Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys,
15248 Ops, Op.getValueType(), MMO);
15249 Result = DAG.getLoad(
15250 Op.getValueType(), DL, Chain, StackSlot,
15251 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
15257 /// 64-bit unsigned integer to double expansion.
15258 SDValue X86TargetLowering::LowerUINT_TO_FP_i64(SDValue Op,
15259 SelectionDAG &DAG) const {
15260 // This algorithm is not obvious. Here is what we're trying to output:
15263 punpckldq (c0), %xmm0 // c0: (uint4){ 0x43300000U, 0x45300000U, 0U, 0U }
15264 subpd (c1), %xmm0 // c1: (double2){ 0x1.0p52, 0x1.0p52 * 0x1.0p32 }
15266 haddpd %xmm0, %xmm0
15268 pshufd $0x4e, %xmm0, %xmm1
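// Why the magic constants work (illustrative sketch): pairing the low and high
// 32-bit halves of the input with the high words 0x43300000 (= 2^52) and
// 0x45300000 (= 2^84) yields the doubles (2^52 + lo) and (2^84 + hi * 2^32).
// Subtracting c1 = { 2^52, 2^84 } removes both biases, and the horizontal add
// produces hi * 2^32 + lo, i.e. the original u64 value as a double.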
15274 LLVMContext *Context = DAG.getContext();
15276 // Build some magic constants.
15277 static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
15278 Constant *C0 = ConstantDataVector::get(*Context, CV0);
15279 auto PtrVT = getPointerTy(DAG.getDataLayout());
15280 SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
15282 SmallVector<Constant*,2> CV1;
15284 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15285 APInt(64, 0x4330000000000000ULL))));
15287 ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
15288 APInt(64, 0x4530000000000000ULL))));
15289 Constant *C1 = ConstantVector::get(CV1);
15290 SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
15292 // Load the 64-bit value into an XMM register.
15293 SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
15296 DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
15297 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15298 /* Alignment = */ 16);
15300 getUnpackl(DAG, dl, MVT::v4i32, DAG.getBitcast(MVT::v4i32, XR1), CLod0);
15303 DAG.getLoad(MVT::v2f64, dl, CLod0.getValue(1), CPIdx1,
15304 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
15305 /* Alignment = */ 16);
15306 SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
15307 // TODO: Are there any fast-math-flags to propagate here?
15308 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
15311 if (Subtarget.hasSSE3()) {
15312 // FIXME: The 'haddpd' instruction may be slower than 'movhlps + addsd'.
15313 Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
15315 SDValue S2F = DAG.getBitcast(MVT::v4i32, Sub);
15316 SDValue Shuffle = DAG.getVectorShuffle(MVT::v4i32, dl, S2F, S2F, {2,3,0,1});
15317 Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64,
15318 DAG.getBitcast(MVT::v2f64, Shuffle), Sub);
15321 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
15322 DAG.getIntPtrConstant(0, dl));
15325 /// 32-bit unsigned integer to float expansion.
15326 SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op,
15327 SelectionDAG &DAG) const {
15329 // FP constant to bias correct the final result.
15330 SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
15333 // Load the 32-bit value into an XMM register.
15334 SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
15337 // Zero out the upper parts of the register.
15338 Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
15340 Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15341 DAG.getBitcast(MVT::v2f64, Load),
15342 DAG.getIntPtrConstant(0, dl));
15344 // Or the load with the bias.
15345 SDValue Or = DAG.getNode(
15346 ISD::OR, dl, MVT::v2i64,
15347 DAG.getBitcast(MVT::v2i64,
15348 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
15349 DAG.getBitcast(MVT::v2i64,
15350 DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
15352 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15353 DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
15355 // Subtract the bias.
15356 // TODO: Are there any fast-math-flags to propagate here?
15357 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
15359 // Handle final rounding.
15360 MVT DestVT = Op.getSimpleValueType();
15362 if (DestVT.bitsLT(MVT::f64))
15363 return DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
15364 DAG.getIntPtrConstant(0, dl));
15365 if (DestVT.bitsGT(MVT::f64))
15366 return DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
15368 // Otherwise the destination is exactly f64; no rounding is needed.
15369 return Sub;
15370 }
15372 static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
15373 const X86Subtarget &Subtarget, SDLoc &DL) {
15374 if (Op.getSimpleValueType() != MVT::v2f64)
15377 SDValue N0 = Op.getOperand(0);
15378 assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
15380 // Legalize to v4i32 type.
15381 N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
15382 DAG.getUNDEF(MVT::v2i32));
15384 if (Subtarget.hasAVX512())
15385 return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
15387 // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
15388 // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
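// Each 32-bit lane v is rebuilt as (v >> 16) * 2^16 + (v & 0xffff). Both
// halves fit in 16 bits, so the signed conversions below are exact, and the
// multiply-and-add reconstructs the unsigned value exactly in double precision.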
15389 SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
15390 SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
15392 // Two to the power of half-word-size.
15393 SDValue TWOHW = DAG.getConstantFP(1 << 16, DL, MVT::v2f64);
15395 // Clear upper part of LO, lower HI.
15396 SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
15397 SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
15399 SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
15400 fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
15401 SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
15403 // Add the two halves.
15404 return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
15407 static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
15408 const X86Subtarget &Subtarget) {
15409 // The algorithm is the following:
15410 // #ifdef __SSE4_1__
15411 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15412 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15413 //                                 (uint4) 0x53000000, 0xaa);
15414 // #else
15415 //     uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15416 //     uint4 hi = (v >> 16) | (uint4) 0x53000000;
15417 // #endif
15418 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15419 // return (float4) lo + fhi;
15421 // We shouldn't use it when unsafe-fp-math is enabled though: we might later
15422 // reassociate the two FADDs, and if we do that, the algorithm fails
15423 // spectacularly (PR24512).
15424 // FIXME: If we ever have some kind of Machine FMF, this should be marked
15425 // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
15426 // there's also the MachineCombiner reassociations happening on Machine IR.
15427 if (DAG.getTarget().Options.UnsafeFPMath)
15431 SDValue V = Op->getOperand(0);
15432 MVT VecIntVT = V.getSimpleValueType();
15433 bool Is128 = VecIntVT == MVT::v4i32;
15434 MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
15435 // If we convert to something other than the supported type, e.g., to v4f64,
15436 // bail out.
15437 if (VecFloatVT != Op->getSimpleValueType(0))
15438 return SDValue();
15440 assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
15441 "Unsupported custom type");
15443 // In the #ifdef/#else code above, we have in common:
15444 // - the vectors of constants 0x4b000000 and 0x53000000, and
15445 // - the right shift of v by 16.
15450 // Create the splat vector for 0x4b000000.
15451 SDValue VecCstLow = DAG.getConstant(0x4b000000, DL, VecIntVT);
15452 // Create the splat vector for 0x53000000.
15453 SDValue VecCstHigh = DAG.getConstant(0x53000000, DL, VecIntVT);
15455 // Create the right shift.
15456 SDValue VecCstShift = DAG.getConstant(16, DL, VecIntVT);
15457 SDValue HighShift = DAG.getNode(ISD::SRL, DL, VecIntVT, V, VecCstShift);
15459 SDValue Low, High;
15460 if (Subtarget.hasSSE41()) {
15461 MVT VecI16VT = Is128 ? MVT::v8i16 : MVT::v16i16;
15462 // uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
15463 SDValue VecCstLowBitcast = DAG.getBitcast(VecI16VT, VecCstLow);
15464 SDValue VecBitcast = DAG.getBitcast(VecI16VT, V);
15465 // Low will be bitcasted right away, so do not bother bitcasting back to its
15466 // original type.
15467 Low = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecBitcast,
15468 VecCstLowBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15469 // uint4 hi = _mm_blend_epi16( _mm_srli_epi32(v,16),
15470 // (uint4) 0x53000000, 0xaa);
15471 SDValue VecCstHighBitcast = DAG.getBitcast(VecI16VT, VecCstHigh);
15472 SDValue VecShiftBitcast = DAG.getBitcast(VecI16VT, HighShift);
15473 // High will be bitcasted right away, so do not bother bitcasting back to
15474 // its original type.
15475 High = DAG.getNode(X86ISD::BLENDI, DL, VecI16VT, VecShiftBitcast,
15476 VecCstHighBitcast, DAG.getConstant(0xaa, DL, MVT::i32));
15477 } else {
15478 SDValue VecCstMask = DAG.getConstant(0xffff, DL, VecIntVT);
15479 // uint4 lo = (v & (uint4) 0xffff) | (uint4) 0x4b000000;
15480 SDValue LowAnd = DAG.getNode(ISD::AND, DL, VecIntVT, V, VecCstMask);
15481 Low = DAG.getNode(ISD::OR, DL, VecIntVT, LowAnd, VecCstLow);
15483 // uint4 hi = (v >> 16) | (uint4) 0x53000000;
15484 High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
15485 }
15487 // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
15488 SDValue VecCstFAdd = DAG.getConstantFP(
15489 APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
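// 0xD3000080 is the single-precision encoding of -(0x1.0p39f + 0x1.0p23f).
// As floats, hi == 2^39 + (v >> 16) * 2^16 and lo == 2^23 + (v & 0xffff), so
// lo + (hi - (2^39 + 2^23)) reconstructs v, with only the final add rounding.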
15491 // float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
15492 SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
15493 // TODO: Are there any fast-math-flags to propagate here?
15494 SDValue FHigh =
15495 DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
15496 // return (float4) lo + fhi;
15497 SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
15498 return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
15501 SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
15502 SelectionDAG &DAG) const {
15503 SDValue N0 = Op.getOperand(0);
15504 MVT SrcVT = N0.getSimpleValueType();
15507 if (SrcVT.getVectorElementType() == MVT::i1) {
15508 if (SrcVT == MVT::v2i1)
15509 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15510 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, N0));
15511 MVT IntegerVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15512 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15513 DAG.getNode(ISD::ZERO_EXTEND, dl, IntegerVT, N0));
15516 switch (SrcVT.SimpleTy) {
15517 default:
15518 llvm_unreachable("Custom UINT_TO_FP is not supported!");
15519 case MVT::v4i8:
15520 case MVT::v4i16:
15521 case MVT::v8i8:
15522 case MVT::v8i16: {
15523 MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
15524 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
15525 DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
15526 }
15527 case MVT::v2i32:
15528 return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
15529 case MVT::v4i32:
15530 case MVT::v8i32:
15531 return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
15532 case MVT::v16i8:
15533 case MVT::v16i16:
15534 assert(Subtarget.hasAVX512());
15535 return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
15536 DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
15537 }
15538 }
15540 SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
15541 SelectionDAG &DAG) const {
15542 SDValue N0 = Op.getOperand(0);
15544 auto PtrVT = getPointerTy(DAG.getDataLayout());
15546 // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
15547 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
15548 // the optimization here.
15549 if (DAG.SignBitIsZero(N0))
15550 return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
15552 if (Op.getSimpleValueType().isVector())
15553 return lowerUINT_TO_FP_vec(Op, DAG);
15555 MVT SrcVT = N0.getSimpleValueType();
15556 MVT DstVT = Op.getSimpleValueType();
15558 if (Subtarget.hasAVX512() && isScalarFPTypeInSSEReg(DstVT) &&
15559 (SrcVT == MVT::i32 || (SrcVT == MVT::i64 && Subtarget.is64Bit()))) {
15560 // Conversions from unsigned i32 to f32/f64 are legal,
15561 // using VCVTUSI2SS/SD. Same for i64 in 64-bit mode.
15565 if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
15566 return LowerUINT_TO_FP_i64(Op, DAG);
15567 if (SrcVT == MVT::i32 && X86ScalarSSEf64)
15568 return LowerUINT_TO_FP_i32(Op, DAG);
15569 if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
15572 // Make a 64-bit buffer, and use it to build an FILD.
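// A u32 stored into the low half of a zeroed 64-bit slot is always
// non-negative when read back as a signed i64, so the (signed) FILD below
// produces the exact unsigned value.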
15573 SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
15574 if (SrcVT == MVT::i32) {
15575 SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
15576 SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
15577 StackSlot, MachinePointerInfo());
15578 SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
15579 OffsetSlot, MachinePointerInfo());
15580 SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
15581 return Fild;
15582 }
15584 assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
15585 SDValue ValueToStore = Op.getOperand(0);
15586 if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
15587 // Bitcasting to f64 here allows us to do a single 64-bit store from
15588 // an SSE register, avoiding the store forwarding penalty that would come
15589 // with two 32-bit stores.
15590 ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
15591 SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
15592 MachinePointerInfo());
15593 // For i64 source, we need to add the appropriate power of 2 if the input
15594 // was negative. This is the same as the optimization in
15595 // DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
15596 // we must be careful to do the computation in x87 extended precision, not
15597 // in SSE. (The generic code can't know it's OK to do this, or how to.)
15598 int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
15599 MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
15600 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
15601 MachineMemOperand::MOLoad, 8, 8);
15603 SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
15604 SDValue Ops[] = { Store, StackSlot, DAG.getValueType(MVT::i64) };
15605 SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
15606 MVT::i64, MMO);
15608 APInt FF(32, 0x5F800000ULL);
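// 0x5F800000 is the IEEE-754 single-precision encoding of 2^64. If the sign
// bit of the i64 input was set, FILD read it as (value - 2^64), so adding this
// fudge factor below restores the intended unsigned value.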
15610 // Check whether the sign bit is set.
15611 SDValue SignSet = DAG.getSetCC(
15612 dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
15613 Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
15615 // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
15616 SDValue FudgePtr = DAG.getConstantPool(
15617 ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
15619 // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
15620 SDValue Zero = DAG.getIntPtrConstant(0, dl);
15621 SDValue Four = DAG.getIntPtrConstant(4, dl);
15622 SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
15623 FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
15625 // Load the value out, extending it from f32 to f80.
15626 // FIXME: Avoid the extend by constructing the right constant pool?
15627 SDValue Fudge = DAG.getExtLoad(
15628 ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
15629 MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
15630 /* Alignment = */ 4);
15631 // Extend everything to 80 bits to force it to be done on x87.
15632 // TODO: Are there any fast-math-flags to propagate here?
15633 SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
15634 return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
15635 DAG.getIntPtrConstant(0, dl));
15638 // If the given FP_TO_SINT (IsSigned) or FP_TO_UINT (!IsSigned) operation
15639 // is legal, or has an fp128 or f16 source (which needs to be promoted to f32),
15640 // just return an <SDValue(), SDValue()> pair.
15641 // Otherwise it is assumed to be a conversion from one of f32, f64 or f80
15642 // to i16, i32 or i64, and we lower it to a legal sequence.
15643 // If lowered to the final integer result we return a <result, SDValue()> pair.
15644 // Otherwise we lower it to a sequence ending with a FIST, return a
15645 // <FIST, StackSlot> pair, and the caller is responsible for loading
15646 // the final integer result from StackSlot.
15647 std::pair<SDValue,SDValue>
15648 X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
15649 bool IsSigned, bool IsReplace) const {
15652 EVT DstTy = Op.getValueType();
15653 EVT TheVT = Op.getOperand(0).getValueType();
15654 auto PtrVT = getPointerTy(DAG.getDataLayout());
15656 if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
15657 // f16 must be promoted before using the lowering in this routine.
15658 // fp128 does not use this lowering.
15659 return std::make_pair(SDValue(), SDValue());
15662 // If using FIST to compute an unsigned i64, we'll need some fixup
15663 // to handle values above the maximum signed i64. A FIST is always
15664 // used for the 32-bit subtarget, but also for f80 on a 64-bit target.
15665 bool UnsignedFixup = !IsSigned &&
15666 DstTy == MVT::i64 &&
15667 (!Subtarget.is64Bit() ||
15668 !isScalarFPTypeInSSEReg(TheVT));
15670 if (!IsSigned && DstTy != MVT::i64 && !Subtarget.hasAVX512()) {
15671 // Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
15672 // The low 32 bits of the fist result will have the correct uint32 result.
15673 assert(DstTy == MVT::i32 && "Unexpected FP_TO_UINT");
15674 DstTy = MVT::i64;
15675 }
15677 assert(DstTy.getSimpleVT() <= MVT::i64 &&
15678 DstTy.getSimpleVT() >= MVT::i16 &&
15679 "Unknown FP_TO_INT to lower!");
15681 // These are really Legal.
15682 if (DstTy == MVT::i32 &&
15683 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15684 return std::make_pair(SDValue(), SDValue());
15685 if (Subtarget.is64Bit() &&
15686 DstTy == MVT::i64 &&
15687 isScalarFPTypeInSSEReg(Op.getOperand(0).getValueType()))
15688 return std::make_pair(SDValue(), SDValue());
15690 // We lower FP->int64 into FISTP64 followed by a load from a temporary
15691 // stack slot.
15692 MachineFunction &MF = DAG.getMachineFunction();
15693 unsigned MemSize = DstTy.getSizeInBits()/8;
15694 int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15695 SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15698 switch (DstTy.getSimpleVT().SimpleTy) {
15699 default: llvm_unreachable("Invalid FP_TO_SINT to lower!");
15700 case MVT::i16: Opc = X86ISD::FP_TO_INT16_IN_MEM; break;
15701 case MVT::i32: Opc = X86ISD::FP_TO_INT32_IN_MEM; break;
15702 case MVT::i64: Opc = X86ISD::FP_TO_INT64_IN_MEM; break;
15705 SDValue Chain = DAG.getEntryNode();
15706 SDValue Value = Op.getOperand(0);
15707 SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
15709 if (UnsignedFixup) {
15711 // Conversion to unsigned i64 is implemented with a select,
15712 // depending on whether the source value fits in the range
15713 // of a signed i64. Let Thresh be the FP equivalent of
15714 // 0x8000000000000000ULL.
15716 // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
15717 // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
15718 // Fist-to-mem64 FistSrc
15719 // Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
15720 // to XOR'ing the high 32 bits with Adjust.
15722 // Being a power of 2, Thresh is exactly representable in all FP formats.
15723 // For X87 we'd like to use the smallest FP type for this constant, but
15724 // for DAG type consistency we have to match the FP operand type.
15726 APFloat Thresh(APFloat::IEEEsingle(), APInt(32, 0x5f000000));
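// 0x5f000000 encodes 2^63 in single precision: inputs below this threshold fit
// in a signed i64 and need no adjustment, and as a power of two it converts
// exactly to f64/f80 below.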
15727 LLVM_ATTRIBUTE_UNUSED APFloat::opStatus Status = APFloat::opOK;
15728 bool LosesInfo = false;
15729 if (TheVT == MVT::f64)
15730 // The rounding mode is irrelevant as the conversion should be exact.
15731 Status = Thresh.convert(APFloat::IEEEdouble(), APFloat::rmNearestTiesToEven,
15732 &LosesInfo);
15733 else if (TheVT == MVT::f80)
15734 Status = Thresh.convert(APFloat::x87DoubleExtended(),
15735 APFloat::rmNearestTiesToEven, &LosesInfo);
15737 assert(Status == APFloat::opOK && !LosesInfo &&
15738 "FP conversion should have been exact");
15740 SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
15742 SDValue Cmp = DAG.getSetCC(DL,
15743 getSetCCResultType(DAG.getDataLayout(),
15744 *DAG.getContext(), TheVT),
15745 Value, ThreshVal, ISD::SETLT);
15746 Adjust = DAG.getSelect(DL, MVT::i32, Cmp,
15747 DAG.getConstant(0, DL, MVT::i32),
15748 DAG.getConstant(0x80000000, DL, MVT::i32));
15749 SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
15750 Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
15751 *DAG.getContext(), TheVT),
15752 Value, ThreshVal, ISD::SETLT);
15753 Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
15756 // FIXME: This causes a redundant load/store if the SSE-class value is already
15757 // in memory, such as if it is on the call stack.
15758 if (isScalarFPTypeInSSEReg(TheVT)) {
15759 assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
15760 Chain = DAG.getStore(Chain, DL, Value, StackSlot,
15761 MachinePointerInfo::getFixedStack(MF, SSFI));
15762 SDVTList Tys = DAG.getVTList(Op.getOperand(0).getValueType(), MVT::Other);
15763 SDValue Ops[] = {
15764 Chain, StackSlot, DAG.getValueType(TheVT)
15765 };
15767 MachineMemOperand *MMO =
15768 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15769 MachineMemOperand::MOLoad, MemSize, MemSize);
15770 Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, DstTy, MMO);
15771 Chain = Value.getValue(1);
15772 SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
15773 StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
15776 MachineMemOperand *MMO =
15777 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
15778 MachineMemOperand::MOStore, MemSize, MemSize);
15780 if (UnsignedFixup) {
15782 // Insert the FIST, load its result as two i32's,
15783 // and XOR the high i32 with Adjust.
15785 SDValue FistOps[] = { Chain, Value, StackSlot };
15786 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15787 FistOps, DstTy, MMO);
15789 SDValue Low32 =
15790 DAG.getLoad(MVT::i32, DL, FIST, StackSlot, MachinePointerInfo());
15791 SDValue HighAddr = DAG.getMemBasePlusOffset(StackSlot, 4, DL);
15793 SDValue High32 =
15794 DAG.getLoad(MVT::i32, DL, FIST, HighAddr, MachinePointerInfo());
15795 High32 = DAG.getNode(ISD::XOR, DL, MVT::i32, High32, Adjust);
15797 if (Subtarget.is64Bit()) {
15798 // Join High32 and Low32 into a 64-bit result.
15799 // (High32 << 32) | Low32
15800 Low32 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, Low32);
15801 High32 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64, High32);
15802 High32 = DAG.getNode(ISD::SHL, DL, MVT::i64, High32,
15803 DAG.getConstant(32, DL, MVT::i8));
15804 SDValue Result = DAG.getNode(ISD::OR, DL, MVT::i64, High32, Low32);
15805 return std::make_pair(Result, SDValue());
15808 SDValue ResultOps[] = { Low32, High32 };
15810 SDValue pair = IsReplace
15811 ? DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResultOps)
15812 : DAG.getMergeValues(ResultOps, DL);
15813 return std::make_pair(pair, SDValue());
15815 // Build the FP_TO_INT*_IN_MEM
15816 SDValue Ops[] = { Chain, Value, StackSlot };
15817 SDValue FIST = DAG.getMemIntrinsicNode(Opc, DL, DAG.getVTList(MVT::Other),
15818 Ops, DstTy, MMO);
15819 return std::make_pair(FIST, StackSlot);
15823 static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
15824 const X86Subtarget &Subtarget) {
15825 MVT VT = Op->getSimpleValueType(0);
15826 SDValue In = Op->getOperand(0);
15827 MVT InVT = In.getSimpleValueType();
15830 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
15831 return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
15833 // Optimize vectors in AVX mode:
15836 // Use vpunpcklwd for 4 lower elements v8i16 -> v4i32.
15837 // Use vpunpckhwd for 4 upper elements v8i16 -> v4i32.
15838 // Concat upper and lower parts.
15841 // Use vpunpckldq for 4 lower elements v4i32 -> v2i64.
15842 // Use vpunpckhdq for 4 upper elements v4i32 -> v2i64.
15843 // Concat upper and lower parts.
15846 if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
15847 ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
15848 ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
15851 if (Subtarget.hasInt256())
15852 return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
15854 SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
15855 SDValue Undef = DAG.getUNDEF(InVT);
15856 bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
15857 SDValue OpLo = getUnpackl(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15858 SDValue OpHi = getUnpackh(DAG, dl, InVT, In, NeedZero ? ZeroVec : Undef);
15860 MVT HVT = MVT::getVectorVT(VT.getVectorElementType(),
15861 VT.getVectorNumElements()/2);
15863 OpLo = DAG.getBitcast(HVT, OpLo);
15864 OpHi = DAG.getBitcast(HVT, OpHi);
15866 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
15869 static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
15870 const X86Subtarget &Subtarget, SelectionDAG &DAG) {
15871 MVT VT = Op->getSimpleValueType(0);
15872 SDValue In = Op->getOperand(0);
15873 MVT InVT = In.getSimpleValueType();
15875 unsigned NumElts = VT.getVectorNumElements();
15877 if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
15878 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
15879 return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
15881 if (InVT.getVectorElementType() != MVT::i1)
15884 // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
15885 MVT ExtVT = VT;
15886 if (!VT.is512BitVector() && !Subtarget.hasVLX())
15887 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
15889 SDValue One =
15890 DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
15891 SDValue Zero =
15892 DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
15894 SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
15895 if (VT == ExtVT)
15896 return SelectedVal;
15897 return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
15900 static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15901 SelectionDAG &DAG) {
15902 if (Subtarget.hasFp256())
15903 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15909 static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
15910 SelectionDAG &DAG) {
15912 MVT VT = Op.getSimpleValueType();
15913 SDValue In = Op.getOperand(0);
15914 MVT SVT = In.getSimpleValueType();
15916 if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
15917 return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
15919 if (Subtarget.hasFp256())
15920 if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
15923 assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
15924 VT.getVectorNumElements() != SVT.getVectorNumElements());
15928 /// Helper to recursively truncate vector elements in half with PACKSS.
15929 /// It makes use of the fact that vector comparison results will be all-zeros
15930 /// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
15931 /// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
15932 /// within each 128-bit lane.
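/// For example, a v8i16 lane holding 0xFFFF (all-ones, i.e. -1) saturates to
/// the i8 value 0xFF and 0x0000 saturates to 0x00, so PACKSS narrows
/// compare-result vectors without changing their all-zeros/all-ones contents.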
15933 static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
15934 const SDLoc &DL,
15935 SelectionDAG &DAG,
15936 const X86Subtarget &Subtarget) {
15937 // Requires SSE2 but AVX512 has fast truncate.
15938 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
15939 return SDValue();
15941 EVT SrcVT = In.getValueType();
15943 // No truncation required, we might get here due to recursive calls.
15944 if (SrcVT == DstVT)
15945 return In;
15947 // We only support vector truncation to 128bits or greater from a
15948 // 256bits or greater source.
15949 if ((DstVT.getSizeInBits() % 128) != 0)
15950 return SDValue();
15951 if ((SrcVT.getSizeInBits() % 256) != 0)
15952 return SDValue();
15954 unsigned NumElems = SrcVT.getVectorNumElements();
15955 assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
15956 assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
15958 EVT PackedSVT =
15959 EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
15961 // Extract lower/upper subvectors.
15962 unsigned NumSubElts = NumElems / 2;
15963 unsigned SrcSizeInBits = SrcVT.getSizeInBits();
15964 SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15965 SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
15967 // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
15968 if (SrcVT.is256BitVector()) {
15969 Lo = DAG.getBitcast(MVT::v8i16, Lo);
15970 Hi = DAG.getBitcast(MVT::v8i16, Hi);
15971 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
15972 return DAG.getBitcast(DstVT, Res);
15975 // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
15976 // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
15977 if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
15978 Lo = DAG.getBitcast(MVT::v16i16, Lo);
15979 Hi = DAG.getBitcast(MVT::v16i16, Hi);
15980 SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
15982 // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
15983 // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
15984 Res = DAG.getBitcast(MVT::v4i64, Res);
15985 Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
15987 if (DstVT.is256BitVector())
15988 return DAG.getBitcast(DstVT, Res);
15990 // If 512bit -> 128bit truncate another stage.
15991 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
15992 Res = DAG.getBitcast(PackedVT, Res);
15993 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
15996 // Recursively pack lower/upper subvectors, concat result and pack again.
15997 assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
15998 EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
15999 Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
16000 Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
16002 PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
16003 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
16004 return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
16007 static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
16008 const X86Subtarget &Subtarget) {
16011 MVT VT = Op.getSimpleValueType();
16012 SDValue In = Op.getOperand(0);
16013 MVT InVT = In.getSimpleValueType();
16015 assert(VT.getVectorElementType() == MVT::i1 && "Unexpected vector type.");
16017 // Shift LSB to MSB and use VPMOVB/W2M or TESTD/Q.
16018 unsigned ShiftInx = InVT.getScalarSizeInBits() - 1;
16019 if (InVT.getScalarSizeInBits() <= 16) {
16020 if (Subtarget.hasBWI()) {
16021 // legal, will go to VPMOVB2M, VPMOVW2M
16022 // Shift packed bytes not supported natively, bitcast to word
16023 MVT ExtVT = MVT::getVectorVT(MVT::i16, InVT.getSizeInBits()/16);
16024 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, ExtVT,
16025 DAG.getBitcast(ExtVT, In),
16026 DAG.getConstant(ShiftInx, DL, ExtVT));
16027 ShiftNode = DAG.getBitcast(InVT, ShiftNode);
16028 return DAG.getNode(X86ISD::CVT2MASK, DL, VT, ShiftNode);
16030 // Use TESTD/Q, extended vector to packed dword/qword.
16031 assert((InVT.is256BitVector() || InVT.is128BitVector()) &&
16032 "Unexpected vector type.");
16033 unsigned NumElts = InVT.getVectorNumElements();
16034 MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
16035 In = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, In);
16036 InVT = ExtVT;
16037 ShiftInx = InVT.getScalarSizeInBits() - 1;
16038 }
16040 SDValue ShiftNode = DAG.getNode(ISD::SHL, DL, InVT, In,
16041 DAG.getConstant(ShiftInx, DL, InVT));
16042 return DAG.getNode(X86ISD::TESTM, DL, VT, ShiftNode, ShiftNode);
16045 SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
16047 MVT VT = Op.getSimpleValueType();
16048 SDValue In = Op.getOperand(0);
16049 MVT InVT = In.getSimpleValueType();
16051 if (VT == MVT::i1) {
16052 assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
16053 "Invalid scalar TRUNCATE operation");
16054 if (InVT.getSizeInBits() >= 32)
16056 In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
16057 return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
16059 assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
16060 "Invalid TRUNCATE operation");
16062 if (VT.getVectorElementType() == MVT::i1)
16063 return LowerTruncateVecI1(Op, DAG, Subtarget);
16065 // vpmovqb/w/d, vpmovdb/w, vpmovwb
16066 if (Subtarget.hasAVX512()) {
16067 // word to byte only under BWI
16068 if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
16069 return DAG.getNode(X86ISD::VTRUNC, DL, VT,
16070 getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
16071 return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
16074 // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
16075 if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
16076 if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
16077 return V;
16079 if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
16080 // On AVX2, v4i64 -> v4i32 becomes VPERMD.
16081 if (Subtarget.hasInt256()) {
16082 static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
16083 In = DAG.getBitcast(MVT::v8i32, In);
16084 In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
16085 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
16086 DAG.getIntPtrConstant(0, DL));
16089 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16090 DAG.getIntPtrConstant(0, DL));
16091 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16092 DAG.getIntPtrConstant(2, DL));
16093 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16094 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16095 static const int ShufMask[] = {0, 2, 4, 6};
16096 return DAG.getVectorShuffle(VT, DL, OpLo, OpHi, ShufMask);
16099 if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
16100 // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
16101 if (Subtarget.hasInt256()) {
16102 In = DAG.getBitcast(MVT::v32i8, In);
16104 // The PSHUFB mask:
16105 static const int ShufMask1[] = { 0, 1, 4, 5, 8, 9, 12, 13,
16106 -1, -1, -1, -1, -1, -1, -1, -1,
16107 16, 17, 20, 21, 24, 25, 28, 29,
16108 -1, -1, -1, -1, -1, -1, -1, -1 };
16109 In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
16110 In = DAG.getBitcast(MVT::v4i64, In);
16112 static const int ShufMask2[] = {0, 2, -1, -1};
16113 In = DAG.getVectorShuffle(MVT::v4i64, DL, In, In, ShufMask2);
16114 In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
16115 DAG.getIntPtrConstant(0, DL));
16116 return DAG.getBitcast(VT, In);
16119 SDValue OpLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16120 DAG.getIntPtrConstant(0, DL));
16122 SDValue OpHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v4i32, In,
16123 DAG.getIntPtrConstant(4, DL));
16125 OpLo = DAG.getBitcast(MVT::v16i8, OpLo);
16126 OpHi = DAG.getBitcast(MVT::v16i8, OpHi);
16128 // The PSHUFB mask:
16129 static const int ShufMask1[] = {0, 1, 4, 5, 8, 9, 12, 13,
16130 -1, -1, -1, -1, -1, -1, -1, -1};
16132 OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
16133 OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
16135 OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
16136 OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
16138 // The MOVLHPS Mask:
16139 static const int ShufMask2[] = {0, 1, 4, 5};
16140 SDValue res = DAG.getVectorShuffle(MVT::v4i32, DL, OpLo, OpHi, ShufMask2);
16141 return DAG.getBitcast(MVT::v8i16, res);
16144 // Handle truncation of V256 to V128 using shuffles.
16145 if (!VT.is128BitVector() || !InVT.is256BitVector())
16148 assert(Subtarget.hasFp256() && "256-bit vector without AVX!");
16150 unsigned NumElems = VT.getVectorNumElements();
16151 MVT NVT = MVT::getVectorVT(VT.getVectorElementType(), NumElems * 2);
16153 SmallVector<int, 16> MaskVec(NumElems * 2, -1);
16154 // Prepare truncation shuffle mask
16155 for (unsigned i = 0; i != NumElems; ++i)
16156 MaskVec[i] = i * 2;
16157 In = DAG.getBitcast(NVT, In);
16158 SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
16159 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
16160 DAG.getIntPtrConstant(0, DL));
16163 SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
16164 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
16165 MVT VT = Op.getSimpleValueType();
16167 if (VT.isVector()) {
16168 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
16169 SDValue Src = Op.getOperand(0);
16171 if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
16172 return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
16173 DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
16174 DAG.getUNDEF(MVT::v2f32)));
16180 assert(!VT.isVector());
16182 std::pair<SDValue,SDValue> Vals = FP_TO_INTHelper(Op, DAG,
16183 IsSigned, /*IsReplace=*/ false);
16184 SDValue FIST = Vals.first, StackSlot = Vals.second;
16185 // If FP_TO_INTHelper failed, the node is actually supposed to be Legal.
16186 if (!FIST.getNode())
16189 if (StackSlot.getNode())
16190 // Load the result.
16191 return DAG.getLoad(VT, SDLoc(Op), FIST, StackSlot, MachinePointerInfo());
16193 // The node is the result.
16197 static SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) {
16199 MVT VT = Op.getSimpleValueType();
16200 SDValue In = Op.getOperand(0);
16201 MVT SVT = In.getSimpleValueType();
16203 assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
16205 return DAG.getNode(X86ISD::VFPEXT, DL, VT,
16206 DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
16207 In, DAG.getUNDEF(SVT)));
16210 /// The only differences between FABS and FNEG are the mask and the logic op.
16211 /// FNEG also has a folding opportunity for FNEG(FABS(x)).
16212 static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
16213 assert((Op.getOpcode() == ISD::FABS || Op.getOpcode() == ISD::FNEG) &&
16214 "Wrong opcode for lowering FABS or FNEG.");
16216 bool IsFABS = (Op.getOpcode() == ISD::FABS);
16218 // If this is a FABS and it has an FNEG user, bail out to fold the combination
16219 // into an FNABS. We'll lower the FABS after that if it is still in use.
16221 for (SDNode *User : Op->uses())
16222 if (User->getOpcode() == ISD::FNEG)
16226 MVT VT = Op.getSimpleValueType();
16228 bool IsF128 = (VT == MVT::f128);
16230 // FIXME: Use function attribute "OptimizeForSize" and/or CodeGenOpt::Level to
16231 // decide if we should generate a 16-byte constant mask when we only need 4 or
16232 // 8 bytes for the scalar case.
16235 MVT LogicVT;
16236 MVT EltVT;
16237 if (VT.isVector()) {
16238 LogicVT = VT;
16239 EltVT = VT.getVectorElementType();
16240 } else if (IsF128) {
16241 // SSE instructions are used for optimized f128 logical operations.
16242 LogicVT = MVT::f128;
16243 EltVT = MVT::f64;
16244 } else {
16245 // There are no scalar bitwise logical SSE/AVX instructions, so we
16246 // generate a 16-byte vector constant and logic op even for the scalar case.
16247 // Using a 16-byte mask allows folding the load of the mask with
16248 // the logic op, so it can save (~4 bytes) on code size.
16249 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16250 EltVT = VT;
16251 }
16253 unsigned EltBits = EltVT.getSizeInBits();
16254 // For FABS, mask is 0x7f...; for FNEG, mask is 0x80...
16256 IsFABS ? APInt::getSignedMaxValue(EltBits) : APInt::getSignMask(EltBits);
16257 const fltSemantics &Sem =
16258 EltVT == MVT::f64 ? APFloat::IEEEdouble() :
16259 (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16260 SDValue Mask = DAG.getConstantFP(APFloat(Sem, MaskElt), dl, LogicVT);
16262 SDValue Op0 = Op.getOperand(0);
16263 bool IsFNABS = !IsFABS && (Op0.getOpcode() == ISD::FABS);
16265 IsFABS ? X86ISD::FAND : IsFNABS ? X86ISD::FOR : X86ISD::FXOR;
16266 SDValue Operand = IsFNABS ? Op0.getOperand(0) : Op0;
16268 if (VT.isVector() || IsF128)
16269 return DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16271 // For the scalar case extend to a 128-bit vector, perform the logic op,
16272 // and extract the scalar result back out.
16273 Operand = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Operand);
16274 SDValue LogicNode = DAG.getNode(LogicOp, dl, LogicVT, Operand, Mask);
16275 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, LogicNode,
16276 DAG.getIntPtrConstant(0, dl));
16279 static SDValue LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) {
16280 SDValue Mag = Op.getOperand(0);
16281 SDValue Sign = Op.getOperand(1);
16284 // If the sign operand is smaller, extend it first.
16285 MVT VT = Op.getSimpleValueType();
16286 if (Sign.getSimpleValueType().bitsLT(VT))
16287 Sign = DAG.getNode(ISD::FP_EXTEND, dl, VT, Sign);
16289 // And if it is bigger, shrink it first.
16290 if (Sign.getSimpleValueType().bitsGT(VT))
16291 Sign = DAG.getNode(ISD::FP_ROUND, dl, VT, Sign, DAG.getIntPtrConstant(1, dl));
16293 // At this point the operands and the result should have the same
16294 // type, and that won't be f80 since that is not custom lowered.
16295 bool IsF128 = (VT == MVT::f128);
16296 assert((VT == MVT::f64 || VT == MVT::f32 || VT == MVT::f128 ||
16297 VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v4f32 ||
16298 VT == MVT::v8f32 || VT == MVT::v8f64 || VT == MVT::v16f32) &&
16299 "Unexpected type in LowerFCOPYSIGN");
16301 MVT EltVT = VT.getScalarType();
16302 const fltSemantics &Sem =
16303 EltVT == MVT::f64 ? APFloat::IEEEdouble()
16304 : (IsF128 ? APFloat::IEEEquad() : APFloat::IEEEsingle());
16306 // Perform all scalar logic operations as 16-byte vectors because there are no
16307 // scalar FP logic instructions in SSE.
16308 // TODO: This isn't necessary. If we used scalar types, we might avoid some
16309 // unnecessary splats, but we might miss load folding opportunities. Should
16310 // this decision be based on OptimizeForSize?
16311 bool IsFakeVector = !VT.isVector() && !IsF128;
16312 MVT LogicVT = VT;
16313 if (IsFakeVector)
16314 LogicVT = (VT == MVT::f64) ? MVT::v2f64 : MVT::v4f32;
16316 // The mask constants are automatically splatted for vector types.
16317 unsigned EltSizeInBits = VT.getScalarSizeInBits();
16318 SDValue SignMask = DAG.getConstantFP(
16319 APFloat(Sem, APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16320 SDValue MagMask = DAG.getConstantFP(
16321 APFloat(Sem, ~APInt::getSignMask(EltSizeInBits)), dl, LogicVT);
16323 // First, clear all bits but the sign bit from the second operand (sign).
16324 if (IsFakeVector)
16325 Sign = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Sign);
16326 SDValue SignBit = DAG.getNode(X86ISD::FAND, dl, LogicVT, Sign, SignMask);
16328 // Next, clear the sign bit from the first operand (magnitude).
16329 // TODO: If we had general constant folding for FP logic ops, this check
16330 // wouldn't be necessary.
16331 SDValue MagBits;
16332 if (ConstantFPSDNode *Op0CN = dyn_cast<ConstantFPSDNode>(Mag)) {
16333 APFloat APF = Op0CN->getValueAPF();
16334 APF.clearSign();
16335 MagBits = DAG.getConstantFP(APF, dl, LogicVT);
16336 } else {
16337 // If the magnitude operand wasn't a constant, we need to AND out the sign.
16338 if (IsFakeVector)
16339 Mag = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LogicVT, Mag);
16340 MagBits = DAG.getNode(X86ISD::FAND, dl, LogicVT, Mag, MagMask);
16341 }
16343 // OR the magnitude value with the sign bit.
16344 SDValue Or = DAG.getNode(X86ISD::FOR, dl, LogicVT, MagBits, SignBit);
16345 return !IsFakeVector ? Or : DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Or,
16346 DAG.getIntPtrConstant(0, dl));
16349 static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
16350 SDValue N0 = Op.getOperand(0);
16352 MVT VT = Op.getSimpleValueType();
16354 MVT OpVT = N0.getSimpleValueType();
16355 assert((OpVT == MVT::f32 || OpVT == MVT::f64) &&
16356 "Unexpected type for FGETSIGN");
16358 // Lower ISD::FGETSIGN to (AND (X86ISD::MOVMSK ...) 1).
16359 MVT VecVT = (OpVT == MVT::f32 ? MVT::v4f32 : MVT::v2f64);
16360 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, N0);
16361 Res = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32, Res);
16362 Res = DAG.getZExtOrTrunc(Res, dl, VT);
16363 Res = DAG.getNode(ISD::AND, dl, VT, Res, DAG.getConstant(1, dl, VT));
16367 // Check whether an OR'd tree is PTEST-able.
16368 static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
16369 SelectionDAG &DAG) {
16370 assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
16372 if (!Subtarget.hasSSE41())
16375 if (!Op->hasOneUse())
16378 SDNode *N = Op.getNode();
16381 SmallVector<SDValue, 8> Opnds;
16382 DenseMap<SDValue, unsigned> VecInMap;
16383 SmallVector<SDValue, 8> VecIns;
16384 EVT VT = MVT::Other;
16386 // Recognize a special case where a vector is casted into a wide integer to
16387 // test all 0s.
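// For example, (or (or (extractelt v, 0), (extractelt v, 1)),
//                  (or (extractelt v, 2), (extractelt v, 3))) compared against
// zero on a v4i32 can be lowered to a single ptest of v against itself.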
16388 Opnds.push_back(N->getOperand(0));
16389 Opnds.push_back(N->getOperand(1));
16391 for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
16392 SmallVectorImpl<SDValue>::const_iterator I = Opnds.begin() + Slot;
16393 // BFS traverse all OR'd operands.
16394 if (I->getOpcode() == ISD::OR) {
16395 Opnds.push_back(I->getOperand(0));
16396 Opnds.push_back(I->getOperand(1));
16397 // Re-evaluate the number of nodes to be traversed.
16398 e += 2; // 2 more nodes (LHS and RHS) are pushed.
16402 // Quit if this is not an EXTRACT_VECTOR_ELT.
16403 if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16404 return SDValue();
16406 // Quit if without a constant index.
16407 SDValue Idx = I->getOperand(1);
16408 if (!isa<ConstantSDNode>(Idx))
16411 SDValue ExtractedFromVec = I->getOperand(0);
16412 DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
16413 if (M == VecInMap.end()) {
16414 VT = ExtractedFromVec.getValueType();
16415 // Quit if not 128/256-bit vector.
16416 if (!VT.is128BitVector() && !VT.is256BitVector())
16418 // Quit if not the same type.
16419 if (VecInMap.begin() != VecInMap.end() &&
16420 VT != VecInMap.begin()->first.getValueType())
16422 M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
16423 VecIns.push_back(ExtractedFromVec);
16425 M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
16428 assert((VT.is128BitVector() || VT.is256BitVector()) &&
16429 "Not extracted from 128-/256-bit vector.");
16431 unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
16433 for (DenseMap<SDValue, unsigned>::const_iterator
16434 I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
16435 // Quit if not all elements are used.
16436 if (I->second != FullMask)
16440 MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
16442 // Cast all vectors into TestVT for PTEST.
16443 for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
16444 VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
16446 // If more than one full vector is evaluated, OR them first before PTEST.
16447 for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
16448 // Each iteration will OR 2 nodes and append the result until there is only
16449 // 1 node left, i.e. the final OR'd value of all vectors.
16450 SDValue LHS = VecIns[Slot];
16451 SDValue RHS = VecIns[Slot + 1];
16452 VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
16455 return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
16458 /// \brief return true if \c Op has a use that doesn't just read flags.
16459 static bool hasNonFlagsUse(SDValue Op) {
16460 for (SDNode::use_iterator UI = Op->use_begin(), UE = Op->use_end(); UI != UE;
16461 ++UI) {
16462 SDNode *User = *UI;
16463 unsigned UOpNo = UI.getOperandNo();
16464 if (User->getOpcode() == ISD::TRUNCATE && User->hasOneUse()) {
16465 // Look past the truncate.
16466 UOpNo = User->use_begin().getOperandNo();
16467 User = *User->use_begin();
16470 if (User->getOpcode() != ISD::BRCOND && User->getOpcode() != ISD::SETCC &&
16471 !(User->getOpcode() == ISD::SELECT && UOpNo == 0))
16472 return true;
16473 }
16474 return false;
16475 }
16477 // Emit KTEST instruction for bit vectors on AVX-512
16478 static SDValue EmitKTEST(SDValue Op, SelectionDAG &DAG,
16479 const X86Subtarget &Subtarget) {
16480 if (Op.getOpcode() == ISD::BITCAST) {
16481 auto hasKTEST = [&](MVT VT) {
16482 unsigned SizeInBits = VT.getSizeInBits();
16483 return (Subtarget.hasDQI() && (SizeInBits == 8 || SizeInBits == 16)) ||
16484 (Subtarget.hasBWI() && (SizeInBits == 32 || SizeInBits == 64));
16486 SDValue Op0 = Op.getOperand(0);
16487 MVT Op0VT = Op0.getValueType().getSimpleVT();
16488 if (Op0VT.isVector() && Op0VT.getVectorElementType() == MVT::i1 &&
16489 hasKTEST(Op0VT))
16490 return DAG.getNode(X86ISD::KTEST, SDLoc(Op), Op0VT, Op0, Op0);
16491 }
16492 return SDValue();
16493 }
16495 /// Emit nodes that will be selected as "test Op0,Op0", or something
16496 /// equivalent.
16497 SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
16498 SelectionDAG &DAG) const {
16499 if (Op.getValueType() == MVT::i1) {
16500 SDValue ExtOp = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, Op);
16501 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, ExtOp,
16502 DAG.getConstant(0, dl, MVT::i8));
16504 // CF and OF aren't always set the way we want. Determine which
16505 // of these we need.
16506 bool NeedCF = false;
16507 bool NeedOF = false;
16508 switch (X86CC) {
16509 default: break;
16510 case X86::COND_A: case X86::COND_AE:
16511 case X86::COND_B: case X86::COND_BE:
16512 NeedCF = true;
16513 break;
16514 case X86::COND_G: case X86::COND_GE:
16515 case X86::COND_L: case X86::COND_LE:
16516 case X86::COND_O: case X86::COND_NO: {
16517 // Check if we really need to set the
16518 // Overflow flag. If NoSignedWrap is present
16519 // that is not actually needed.
16520 switch (Op->getOpcode()) {
16525 if (Op.getNode()->getFlags().hasNoSignedWrap())
16535 // See if we can use the EFLAGS value from the operand instead of
16536 // doing a separate TEST. TEST always sets OF and CF to 0, so unless
16537 // we prove that the arithmetic won't overflow, we can't use OF or CF.
16538 if (Op.getResNo() != 0 || NeedOF || NeedCF) {
16539 // Emit KTEST for bit vectors
16540 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16541 return Node;
16542 // Emit a CMP with 0, which is the TEST pattern.
16543 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16544 DAG.getConstant(0, dl, Op.getValueType()));
16546 unsigned Opcode = 0;
16547 unsigned NumOperands = 0;
16549 // Truncate operations may prevent the merge of the SETCC instruction
16550 // and the arithmetic instruction before it. Attempt to truncate the operands
16551 // of the arithmetic instruction and use a reduced bit-width instruction.
16552 bool NeedTruncation = false;
16553 SDValue ArithOp = Op;
16554 if (Op->getOpcode() == ISD::TRUNCATE && Op->hasOneUse()) {
16555 SDValue Arith = Op->getOperand(0);
16556 // Both the trunc and the arithmetic op need to have one user each.
16557 if (Arith->hasOneUse())
16558 switch (Arith.getOpcode()) {
16559 default: break;
16560 case ISD::ADD:
16561 case ISD::SUB:
16562 case ISD::AND:
16563 case ISD::OR:
16564 case ISD::XOR:
16565 NeedTruncation = true;
16566 ArithOp = Arith;
16567 }
16568 }
16571 // Sometimes flags can be set either with an AND or with an SRL/SHL
16572 // instruction. The SRL/SHL variant should be preferred for masks longer than this
16573 // number of bits.
16574 const int ShiftToAndMaxMaskWidth = 32;
16575 const bool ZeroCheck = (X86CC == X86::COND_E || X86CC == X86::COND_NE);
16577 // NOTICE: In the code below we use ArithOp to hold the arithmetic operation
16578 // which may be the result of a CAST. We use the variable 'Op', which is the
16579 // non-casted variable when we check for possible users.
16580 switch (ArithOp.getOpcode()) {
16581 case ISD::ADD:
16582 // Due to an isel shortcoming, be conservative if this add is likely to be
16583 // selected as part of a load-modify-store instruction. When the root node
16584 // in a match is a store, isel doesn't know how to remap non-chain non-flag
16585 // uses of other nodes in the match, such as the ADD in this case. This
16586 // leads to the ADD being left around and reselected, with the result being
16587 // two adds in the output. Alas, even if none of our users are stores, that
16588 // doesn't prove we're O.K. Ergo, if we have any parents that aren't
16589 // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
16590 // climbing the DAG back to the root, and it doesn't seem to be worth the
16591 // effort.
16592 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16593 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16594 if (UI->getOpcode() != ISD::CopyToReg &&
16595 UI->getOpcode() != ISD::SETCC &&
16596 UI->getOpcode() != ISD::STORE)
16597 goto default_case;
16599 if (ConstantSDNode *C =
16600 dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
16601 // An add of one will be selected as an INC.
16602 if (C->isOne() && !Subtarget.slowIncDec()) {
16603 Opcode = X86ISD::INC;
16604 NumOperands = 1;
16605 break;
16606 }
16608 // An add of negative one (subtract of one) will be selected as a DEC.
16609 if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
16610 Opcode = X86ISD::DEC;
16611 NumOperands = 1;
16612 break;
16613 }
16614 }
16616 // Otherwise use a regular EFLAGS-setting add.
16617 Opcode = X86ISD::ADD;
16618 NumOperands = 2;
16619 break;
16620 case ISD::SHL:
16621 case ISD::SRL:
16622 // If we have a constant logical shift that's only used in a comparison
16623 // against zero turn it into an equivalent AND. This allows turning it into
16624 // a TEST instruction later.
16625 if (ZeroCheck && Op->hasOneUse() &&
16626 isa<ConstantSDNode>(Op->getOperand(1)) && !hasNonFlagsUse(Op)) {
16627 EVT VT = Op.getValueType();
16628 unsigned BitWidth = VT.getSizeInBits();
16629 unsigned ShAmt = Op->getConstantOperandVal(1);
16630 if (ShAmt >= BitWidth) // Avoid undefined shifts.
16632 APInt Mask = ArithOp.getOpcode() == ISD::SRL
16633 ? APInt::getHighBitsSet(BitWidth, BitWidth - ShAmt)
16634 : APInt::getLowBitsSet(BitWidth, BitWidth - ShAmt);
16635 if (!Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16637 Op = DAG.getNode(ISD::AND, dl, VT, Op->getOperand(0),
16638 DAG.getConstant(Mask, dl, VT));
16639 break;
16640 }
16642 case ISD::AND:
16643 // If the primary 'and' result isn't used, don't bother using X86ISD::AND,
16644 // because a TEST instruction will be better. However, AND should be
16645 // preferred if the instruction can be combined into ANDN.
16646 if (!hasNonFlagsUse(Op)) {
16647 SDValue Op0 = ArithOp->getOperand(0);
16648 SDValue Op1 = ArithOp->getOperand(1);
16649 EVT VT = ArithOp.getValueType();
16650 bool isAndn = isBitwiseNot(Op0) || isBitwiseNot(Op1);
16651 bool isLegalAndnType = VT == MVT::i32 || VT == MVT::i64;
16652 bool isProperAndn = isAndn && isLegalAndnType && Subtarget.hasBMI();
16654 // If we cannot select an ANDN instruction, check if we can replace
16655 // AND+IMM64 with a shift before giving up. This is possible for masks
16656 // like 0xFF000000 or 0x00FFFFFF and if we care only about the zero flag.
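// For example, testing (x & 0xFF000000) == 0 is equivalent to testing
// (x >> 24) == 0, which avoids materializing the wide immediate mask.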
16657 if (!isProperAndn) {
16661 assert(!isa<ConstantSDNode>(Op0) && "AND node isn't canonicalized");
16662 auto *CN = dyn_cast<ConstantSDNode>(Op1);
16666 const APInt &Mask = CN->getAPIntValue();
16667 if (Mask.isSignedIntN(ShiftToAndMaxMaskWidth))
16668 break; // Prefer TEST instruction.
16670 unsigned BitWidth = Mask.getBitWidth();
16671 unsigned LeadingOnes = Mask.countLeadingOnes();
16672 unsigned TrailingZeros = Mask.countTrailingZeros();
16674 if (LeadingOnes + TrailingZeros == BitWidth) {
16675 assert(TrailingZeros < VT.getSizeInBits() &&
16676 "Shift amount should be less than the type width");
16677 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16678 SDValue ShAmt = DAG.getConstant(TrailingZeros, dl, ShTy);
16679 Op = DAG.getNode(ISD::SRL, dl, VT, Op0, ShAmt);
16683 unsigned LeadingZeros = Mask.countLeadingZeros();
16684 unsigned TrailingOnes = Mask.countTrailingOnes();
16686 if (LeadingZeros + TrailingOnes == BitWidth) {
16687 assert(LeadingZeros < VT.getSizeInBits() &&
16688 "Shift amount should be less than the type width");
16689 MVT ShTy = getScalarShiftAmountTy(DAG.getDataLayout(), VT);
16690 SDValue ShAmt = DAG.getConstant(LeadingZeros, dl, ShTy);
16691 Op = DAG.getNode(ISD::SHL, dl, VT, Op0, ShAmt);
16702 // Due to the ISEL shortcoming noted above, be conservative if this op is
16703 // likely to be selected as part of a load-modify-store instruction.
16704 for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
16705 UE = Op.getNode()->use_end(); UI != UE; ++UI)
16706 if (UI->getOpcode() == ISD::STORE)
16709 // Otherwise use a regular EFLAGS-setting instruction.
16710 switch (ArithOp.getOpcode()) {
16711 default: llvm_unreachable("unexpected operator!");
16712 case ISD::SUB: Opcode = X86ISD::SUB; break;
16713 case ISD::XOR: Opcode = X86ISD::XOR; break;
16714 case ISD::AND: Opcode = X86ISD::AND; break;
16715 case ISD::OR: {
16716 if (!NeedTruncation && ZeroCheck) {
16717 if (SDValue EFLAGS = LowerVectorAllZeroTest(Op, Subtarget, DAG))
16718 return EFLAGS;
16719 }
16720 Opcode = X86ISD::OR;
16721 break;
16722 }
16734 return SDValue(Op.getNode(), 1);
16740 // If we found that truncation is beneficial, perform the truncation and
16741 // update the expression.
16742 if (NeedTruncation) {
16743 EVT VT = Op.getValueType();
16744 SDValue WideVal = Op->getOperand(0);
16745 EVT WideVT = WideVal.getValueType();
16746 unsigned ConvertedOp = 0;
16747 // Use a target machine opcode to prevent further DAGCombine
16748 // optimizations that may separate the arithmetic operations
16749 // from the setcc node.
16750 switch (WideVal.getOpcode()) {
16752 case ISD::ADD: ConvertedOp = X86ISD::ADD; break;
16753 case ISD::SUB: ConvertedOp = X86ISD::SUB; break;
16754 case ISD::AND: ConvertedOp = X86ISD::AND; break;
16755 case ISD::OR: ConvertedOp = X86ISD::OR; break;
16756 case ISD::XOR: ConvertedOp = X86ISD::XOR; break;
16760 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16761 if (TLI.isOperationLegal(WideVal.getOpcode(), WideVT)) {
16762 SDValue V0 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(0));
16763 SDValue V1 = DAG.getNode(ISD::TRUNCATE, dl, VT, WideVal.getOperand(1));
16764 Op = DAG.getNode(ConvertedOp, dl, VT, V0, V1);
16769 if (Opcode == 0) {
16770 // Emit KTEST for bit vectors
16771 if (auto Node = EmitKTEST(Op, DAG, Subtarget))
16772 return Node;
16774 // Emit a CMP with 0, which is the TEST pattern.
16775 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op,
16776 DAG.getConstant(0, dl, Op.getValueType()));
16777 }
16778 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
16779 SmallVector<SDValue, 4> Ops(Op->op_begin(), Op->op_begin() + NumOperands);
16781 SDValue New = DAG.getNode(Opcode, dl, VTs, Ops);
16782 DAG.ReplaceAllUsesWith(Op, New);
16783 return SDValue(New.getNode(), 1);
16786 /// Emit nodes that will be selected as "cmp Op0,Op1", or something
16788 SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
16789 const SDLoc &dl, SelectionDAG &DAG) const {
16790 if (isNullConstant(Op1))
16791 return EmitTest(Op0, X86CC, dl, DAG);
16793 assert(!(isa<ConstantSDNode>(Op1) && Op0.getValueType() == MVT::i1) &&
16794 "Unexpected comparison operation for MVT::i1 operands");
16796 if ((Op0.getValueType() == MVT::i8 || Op0.getValueType() == MVT::i16 ||
16797 Op0.getValueType() == MVT::i32 || Op0.getValueType() == MVT::i64)) {
16798 // Only promote the compare up to I32 if it is a 16 bit operation
16799 // with an immediate. 16 bit immediates are to be avoided.
16800 if ((Op0.getValueType() == MVT::i16 &&
16801 (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
16802 !DAG.getMachineFunction().getFunction()->optForMinSize() &&
16803 !Subtarget.isAtom()) {
16804 unsigned ExtendOp =
16805 isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
16806 Op0 = DAG.getNode(ExtendOp, dl, MVT::i32, Op0);
16807 Op1 = DAG.getNode(ExtendOp, dl, MVT::i32, Op1);
16809 // Use SUB instead of CMP to enable CSE between SUB and CMP.
16810 SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
16811 SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
16812 Op0, Op1);
16813 return SDValue(Sub.getNode(), 1);
16814 }
16815 return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
16818 /// Convert a comparison if required by the subtarget.
16819 SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
16820 SelectionDAG &DAG) const {
16821 // If the subtarget does not support the FUCOMI instruction, floating-point
16822 // comparisons have to be converted.
16823 if (Subtarget.hasCMov() ||
16824 Cmp.getOpcode() != X86ISD::CMP ||
16825 !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
16826 !Cmp.getOperand(1).getValueType().isFloatingPoint())
16827 return Cmp;
16829 // The instruction selector will select an FUCOM instruction instead of
16830 // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
16831 // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
16832 // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
16834 SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
16835 SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
16836 SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
16837 DAG.getConstant(8, dl, MVT::i8));
16838 SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
16840 // Some 64-bit targets lack SAHF support, but they do support FCOMI.
16841 assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
16842 return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
16845 /// Check if replacement of SQRT with RSQRT should be disabled.
16846 bool X86TargetLowering::isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const {
16847 EVT VT = Op.getValueType();
16849 // We never want to use both SQRT and RSQRT instructions for the same input.
16850 if (DAG.getNodeIfExists(X86ISD::FRSQRT, DAG.getVTList(VT), Op))
16851 return false;
16853 if (VT.isVector())
16854 return Subtarget.hasFastVectorFSQRT();
16855 return Subtarget.hasFastScalarFSQRT();
16858 /// The minimum architected relative accuracy is 2^-12. We need one
16859 /// Newton-Raphson step to have a good float result (24 bits of precision).
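/// One such refinement step for 1/sqrt(a) is est' = est * (1.5 - 0.5 * a * est * est),
/// which roughly doubles the number of correct mantissa bits per iteration.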
16860 SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
16861 SelectionDAG &DAG, int Enabled,
16862 int &RefinementSteps,
16863 bool &UseOneConstNR,
16864 bool Reciprocal) const {
16865 EVT VT = Op.getValueType();
16867 // SSE1 has rsqrtss and rsqrtps. AVX adds a 256-bit variant for rsqrtps.
16868 // TODO: Add support for AVX512 (v16f32).
16869 // It is likely not profitable to do this for f64 because a double-precision
16870 // rsqrt estimate with refinement on x86 prior to FMA requires at least 16
16871 // instructions: convert to single, rsqrtss, convert back to double, refine
16872 // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
16873 // along with FMA, this could be a throughput win.
16874 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16875 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16876 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16877 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16878 RefinementSteps = 1;
16880 UseOneConstNR = false;
16881 return DAG.getNode(X86ISD::FRSQRT, SDLoc(Op), VT, Op);
16882 }
16883 return SDValue();
16884 }
16886 /// The minimum architected relative accuracy is 2^-12. We need one
16887 /// Newton-Raphson step to have a good float result (24 bits of precision).
16888 SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
16889 int Enabled,
16890 int &RefinementSteps) const {
16891 EVT VT = Op.getValueType();
16893 // SSE1 has rcpss and rcpps. AVX adds a 256-bit variant for rcpps.
16894 // TODO: Add support for AVX512 (v16f32).
16895 // It is likely not profitable to do this for f64 because a double-precision
16896 // reciprocal estimate with refinement on x86 prior to FMA requires
16897 // 15 instructions: convert to single, rcpss, convert back to double, refine
16898 // (3 steps = 12 insts). If an 'rcpsd' variant was added to the ISA
16899 // along with FMA, this could be a throughput win.
16901 if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
16902 (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
16903 (VT == MVT::v8f32 && Subtarget.hasAVX())) {
16904 // Enable estimate codegen with 1 refinement step for vector division.
16905 // Scalar division estimates are disabled because they break too much
16906 // real-world code. These defaults are intended to match GCC behavior.
16907 if (VT == MVT::f32 && Enabled == ReciprocalEstimate::Unspecified)
16908 return SDValue();
16910 if (RefinementSteps == ReciprocalEstimate::Unspecified)
16911 RefinementSteps = 1;
16913 return DAG.getNode(X86ISD::FRCP, SDLoc(Op), VT, Op);
16914 }
16915 return SDValue();
16916 }
16918 /// If we have at least two divisions that use the same divisor, convert to
16919 /// multiplication by a reciprocal. This may need to be adjusted for a given
16920 /// CPU if a division's cost is not at least twice the cost of a multiplication.
16921 /// This is because we still need one division to calculate the reciprocal and
16922 /// then we need two multiplies by that reciprocal as replacements for the
16923 /// original divisions.
16924 unsigned X86TargetLowering::combineRepeatedFPDivisors() const {
16925 return 2;
16926 }
16928 /// Helper for creating a X86ISD::SETCC node.
16929 static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
16930 SelectionDAG &DAG) {
16931 return DAG.getNode(X86ISD::SETCC, dl, MVT::i8,
16932 DAG.getConstant(Cond, dl, MVT::i8), EFLAGS);
16935 /// Create a BT (Bit Test) node - Test bit \p BitNo in \p Src and set condition
16936 /// according to equal/not-equal condition code \p CC.
16937 static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
16938 const SDLoc &dl, SelectionDAG &DAG) {
16939 // If Src is i8, promote it to i32 with any_extend. There is no i8 BT
16940 // instruction. Since the shift amount is in-range-or-undefined, we know
16941 // that doing a bittest on the i32 value is ok. We extend to i32 because
16942 // the encoding for the i16 version is larger than the i32 version.
16943 // Also promote i16 to i32 for performance / code size reason.
16944 if (Src.getValueType() == MVT::i8 || Src.getValueType() == MVT::i16)
16945 Src = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Src);
16947 // See if we can use the 32-bit instruction instead of the 64-bit one for a
16948 // shorter encoding. Since the former takes the modulo 32 of BitNo and the
16949 // latter takes the modulo 64, this is only valid if the 5th bit of BitNo is
16950 // known to be zero.
16951 if (Src.getValueType() == MVT::i64 &&
16952 DAG.MaskedValueIsZero(BitNo, APInt(BitNo.getValueSizeInBits(), 32)))
16953 Src = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Src);
16955 // If the operand types disagree, extend the shift amount to match. Since
16956 // BT ignores high bits (like shifts) we can use anyextend.
16957 if (Src.getValueType() != BitNo.getValueType())
16958 BitNo = DAG.getNode(ISD::ANY_EXTEND, dl, Src.getValueType(), BitNo);
16960 SDValue BT = DAG.getNode(X86ISD::BT, dl, MVT::i32, Src, BitNo);
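// BT places the tested bit in CF, so an equal-to-zero test maps to AE
// (CF == 0) and a not-equal test maps to B (CF == 1).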
16961 X86::CondCode Cond = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
16962 return getSETCC(Cond, BT, dl, DAG);
16963 }
16965 /// Result of 'and' is compared against zero. Change to a BT node if possible.
16966 static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
16967 const SDLoc &dl, SelectionDAG &DAG) {
16968 SDValue Op0 = And.getOperand(0);
16969 SDValue Op1 = And.getOperand(1);
16970 if (Op0.getOpcode() == ISD::TRUNCATE)
16971 Op0 = Op0.getOperand(0);
16972 if (Op1.getOpcode() == ISD::TRUNCATE)
16973 Op1 = Op1.getOperand(0);
16975 SDValue LHS, RHS;
16976 if (Op1.getOpcode() == ISD::SHL)
16977 std::swap(Op0, Op1);
16978 if (Op0.getOpcode() == ISD::SHL) {
16979 if (isOneConstant(Op0.getOperand(0))) {
16980 // If we looked past a truncate, check that it's only truncating away
16981 // known zeros.
16982 unsigned BitWidth = Op0.getValueSizeInBits();
16983 unsigned AndBitWidth = And.getValueSizeInBits();
16984 if (BitWidth > AndBitWidth) {
16985 KnownBits Known;
16986 DAG.computeKnownBits(Op0, Known);
16987 if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
16988 return SDValue();
16989 }
16990 LHS = Op1;
16991 RHS = Op0.getOperand(1);
16993 } else if (Op1.getOpcode() == ISD::Constant) {
16994 ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1);
16995 uint64_t AndRHSVal = AndRHS->getZExtValue();
16996 SDValue AndLHS = Op0;
16998 if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) {
16999 LHS = AndLHS.getOperand(0);
17000 RHS = AndLHS.getOperand(1);
17003 // Use BT if the immediate can't be encoded in a TEST instruction.
17004 if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) {
17005 LHS = AndLHS;
17006 RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), dl, LHS.getValueType());
17007 }
17008 }
17010 if (LHS.getNode())
17011 return getBitTestCondition(LHS, RHS, CC, dl, DAG);
17013 return SDValue();
17014 }
17016 // Convert (truncate (srl X, N) to i1) to (bt X, N)
17017 static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
17018 const SDLoc &dl, SelectionDAG &DAG) {
17020 assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
17021 "Expected TRUNCATE to i1 node");
17023 if (Op.getOperand(0).getOpcode() != ISD::SRL)
17024 return SDValue();
17026 SDValue ShiftRight = Op.getOperand(0);
17027 return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
17028 CC, dl, DAG);
17029 }
17031 /// Result of 'and' or 'trunc to i1' is compared against zero.
17032 /// Change to a BT node if possible.
17033 SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
17034 const SDLoc &dl, SelectionDAG &DAG) const {
17035 if (Op.getOpcode() == ISD::AND)
17036 return LowerAndToBT(Op, CC, dl, DAG);
17037 if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
17038 return LowerTruncateToBT(Op, CC, dl, DAG);
17040 return SDValue();
17041 }
17042 /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
17044 static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
17045 SDValue &Op1) {
17046 unsigned SSECC;
17047 bool Swap = false;
17049 // SSE Condition code mapping:
17050 //  0 - EQ
17051 //  1 - LT
17052 //  2 - LE
17053 //  3 - UNORD
17054 //  4 - NEQ
17055 //  5 - NLT
17056 //  6 - NLE
17057 //  7 - ORD
17058 switch (SetCCOpcode) {
17059 default: llvm_unreachable("Unexpected SETCC condition");
17061 case ISD::SETEQ: SSECC = 0; break;
17063 case ISD::SETGT: Swap = true; LLVM_FALLTHROUGH;
17065 case ISD::SETOLT: SSECC = 1; break;
17067 case ISD::SETGE: Swap = true; LLVM_FALLTHROUGH;
17069 case ISD::SETOLE: SSECC = 2; break;
17070 case ISD::SETUO: SSECC = 3; break;
17072 case ISD::SETNE: SSECC = 4; break;
17073 case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH;
17074 case ISD::SETUGE: SSECC = 5; break;
17075 case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
17076 case ISD::SETUGT: SSECC = 6; break;
17077 case ISD::SETO: SSECC = 7; break;
17079 case ISD::SETONE: SSECC = 8; break;
17080 }
17081 if (Swap)
17082 std::swap(Op0, Op1);
17084 return SSECC;
17085 }
17087 /// Break a 256-bit integer VSETCC into two new 128-bit ones and then
17088 /// concatenate the result back.
17089 static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
17090 MVT VT = Op.getSimpleValueType();
17092 assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
17093 "Unsupported value type for operation");
17095 unsigned NumElems = VT.getVectorNumElements();
17097 SDValue CC = Op.getOperand(2);
17098 SDLoc dl(Op);
17099 // Extract the LHS vectors
17100 SDValue LHS = Op.getOperand(0);
17101 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
17102 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
17104 // Extract the RHS vectors
17105 SDValue RHS = Op.getOperand(1);
17106 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
17107 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
17109 // Issue the operation on the smaller types and concatenate the result back
17110 MVT EltVT = VT.getVectorElementType();
17111 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
17112 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
17113 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
17114 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
17117 static SDValue LowerBoolVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17118 SDValue Op0 = Op.getOperand(0);
17119 SDValue Op1 = Op.getOperand(1);
17120 SDValue CC = Op.getOperand(2);
17121 MVT VT = Op.getSimpleValueType();
17123 SDLoc dl(Op);
17124 assert(Op0.getSimpleValueType().getVectorElementType() == MVT::i1 &&
17125 "Unexpected type for boolean compare operation");
17126 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17127 SDValue NotOp0 = DAG.getNode(ISD::XOR, dl, VT, Op0,
17128 DAG.getConstant(-1, dl, VT));
17129 SDValue NotOp1 = DAG.getNode(ISD::XOR, dl, VT, Op1,
17130 DAG.getConstant(-1, dl, VT));
17131 switch (SetCCOpcode) {
17132 default: llvm_unreachable("Unexpected SETCC condition");
17133 case ISD::SETEQ:
17134 // (x == y) -> ~(x ^ y)
17135 return DAG.getNode(ISD::XOR, dl, VT,
17136 DAG.getNode(ISD::XOR, dl, VT, Op0, Op1),
17137 DAG.getConstant(-1, dl, VT));
17138 case ISD::SETNE:
17139 // (x != y) -> (x ^ y)
17140 return DAG.getNode(ISD::XOR, dl, VT, Op0, Op1);
17141 case ISD::SETUGT:
17142 case ISD::SETGT:
17143 // (x > y) -> (x & ~y)
17144 return DAG.getNode(ISD::AND, dl, VT, Op0, NotOp1);
17145 case ISD::SETULT:
17146 case ISD::SETLT:
17147 // (x < y) -> (~x & y)
17148 return DAG.getNode(ISD::AND, dl, VT, NotOp0, Op1);
17149 case ISD::SETULE:
17150 case ISD::SETLE:
17151 // (x <= y) -> (~x | y)
17152 return DAG.getNode(ISD::OR, dl, VT, NotOp0, Op1);
17153 case ISD::SETUGE:
17154 case ISD::SETGE:
17155 // (x >= y) -> (x | ~y)
17156 return DAG.getNode(ISD::OR, dl, VT, Op0, NotOp1);
17157 }
17158 }
17160 static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
17162 SDValue Op0 = Op.getOperand(0);
17163 SDValue Op1 = Op.getOperand(1);
17164 SDValue CC = Op.getOperand(2);
17165 MVT VT = Op.getSimpleValueType();
17166 SDLoc dl(Op);
17168 assert(VT.getVectorElementType() == MVT::i1 &&
17169 "Cannot set masked compare for this operation");
17171 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
17172 unsigned Opc = 0;
17173 bool Unsigned = false;
17174 bool Swap = false;
17175 unsigned SSECC = 0;
17176 switch (SetCCOpcode) {
17177 default: llvm_unreachable("Unexpected SETCC condition");
17178 case ISD::SETNE: SSECC = 4; break;
17179 case ISD::SETEQ: Opc = X86ISD::PCMPEQM; break;
17180 case ISD::SETUGT: SSECC = 6; Unsigned = true; break;
17181 case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH;
17182 case ISD::SETGT: Opc = X86ISD::PCMPGTM; break;
17183 case ISD::SETULT: SSECC = 1; Unsigned = true; break;
17184 case ISD::SETUGE: SSECC = 5; Unsigned = true; break; //NLT
17185 case ISD::SETGE: Swap = true; SSECC = 2; break; // LE + swap
17186 case ISD::SETULE: Unsigned = true; LLVM_FALLTHROUGH;
17187 case ISD::SETLE: SSECC = 2; break;
17188 }
17190 if (Swap)
17191 std::swap(Op0, Op1);
17192 if (Opc)
17193 return DAG.getNode(Opc, dl, VT, Op0, Op1);
17194 Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
17195 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17196 DAG.getConstant(SSECC, dl, MVT::i8));
17199 /// \brief Try to turn a VSETULT into a VSETULE by modifying its second
17200 /// operand \p Op1. If non-trivial (for example because it's not constant)
17201 /// return an empty value.
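/// For constant vectors this rewrites x u< C as x u<= C-1, which lets the
/// PSUBUS-based lowering below avoid swapping operands.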
17202 static SDValue ChangeVSETULTtoVSETULE(const SDLoc &dl, SDValue Op1,
17203 SelectionDAG &DAG) {
17204 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1.getNode());
17205 if (!BV)
17206 return SDValue();
17208 MVT VT = Op1.getSimpleValueType();
17209 MVT EVT = VT.getVectorElementType();
17210 unsigned n = VT.getVectorNumElements();
17211 SmallVector<SDValue, 8> ULTOp1;
17213 for (unsigned i = 0; i < n; ++i) {
17214 ConstantSDNode *Elt = dyn_cast<ConstantSDNode>(BV->getOperand(i));
17215 if (!Elt || Elt->isOpaque() || Elt->getSimpleValueType(0) != EVT)
17216 return SDValue();
17218 // Avoid underflow.
17219 APInt Val = Elt->getAPIntValue();
17220 if (Val == 0)
17221 return SDValue();
17223 ULTOp1.push_back(DAG.getConstant(Val - 1, dl, EVT));
17226 return DAG.getBuildVector(VT, dl, ULTOp1);
17229 static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
17230 SelectionDAG &DAG) {
17231 SDValue Op0 = Op.getOperand(0);
17232 SDValue Op1 = Op.getOperand(1);
17233 SDValue CC = Op.getOperand(2);
17234 MVT VT = Op.getSimpleValueType();
17235 ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
17236 bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
17237 SDLoc dl(Op);
17239 if (isFP) {
17240 #ifndef NDEBUG
17241 MVT EltVT = Op0.getSimpleValueType().getVectorElementType();
17242 assert(EltVT == MVT::f32 || EltVT == MVT::f64);
17243 #endif
17245 unsigned Opc;
17246 if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
17247 assert(VT.getVectorNumElements() <= 16);
17248 Opc = X86ISD::CMPM;
17249 } else {
17250 Opc = X86ISD::CMPP;
17251 // The SSE/AVX packed FP comparison nodes are defined with a
17252 // floating-point vector result that matches the operand type. This allows
17253 // them to work with an SSE1 target (integer vector types are not legal).
17254 VT = Op0.getSimpleValueType();
17255 }
17257 // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
17258 // emit two comparisons and a logic op to tie them together.
17259 // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
17260 // available.
17261 SDValue Cmp;
17262 unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
17263 if (SSECC >= 8 && !Subtarget.hasAVX()) {
17264 // LLVM predicate is SETUEQ or SETONE.
17265 unsigned CC0, CC1;
17266 unsigned CombineOpc;
17267 if (Cond == ISD::SETUEQ) {
17268 CC0 = 3; // UNORD
17269 CC1 = 0; // EQ
17270 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
17271 static_cast<unsigned>(ISD::OR);
17272 } else {
17273 assert(Cond == ISD::SETONE);
17274 CC0 = 7; // ORD
17275 CC1 = 4; // NEQ
17276 CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
17277 static_cast<unsigned>(ISD::AND);
17280 SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17281 DAG.getConstant(CC0, dl, MVT::i8));
17282 SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
17283 DAG.getConstant(CC1, dl, MVT::i8));
17284 Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
17285 } else {
17286 // Handle all other FP comparisons here.
17287 Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
17288 DAG.getConstant(SSECC, dl, MVT::i8));
17291 // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
17292 // result type of SETCC. The bitcast is expected to be optimized away
17293 // during combining/isel.
17294 if (Opc == X86ISD::CMPP)
17295 Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
17297 return Cmp;
17298 }
17300 MVT VTOp0 = Op0.getSimpleValueType();
17301 assert(VTOp0 == Op1.getSimpleValueType() &&
17302 "Expected operands with same type!");
17303 assert(VT.getVectorNumElements() == VTOp0.getVectorNumElements() &&
17304 "Invalid number of packed elements for source and destination!");
17306 if (VT.is128BitVector() && VTOp0.is256BitVector()) {
17307 // On non-AVX512 targets, a vector of MVT::i1 is promoted by the type
17308 // legalizer to a wider vector type. In the case of 'vsetcc' nodes, the
17309 // legalizer firstly checks if the first operand in input to the setcc has
17310 // a legal type. If so, then it promotes the return type to that same type.
17311 // Otherwise, the return type is promoted to the 'next legal type' which,
17312 // for a vector of MVT::i1 is always a 128-bit integer vector type.
17314 // We reach this code only if the following two conditions are met:
17315 // 1. Both return type and operand type have been promoted to wider types
17316 // by the type legalizer.
17317 // 2. The original operand type has been promoted to a 256-bit vector.
17319 // Note that condition 2. only applies for AVX targets.
17320 SDValue NewOp = DAG.getSetCC(dl, VTOp0, Op0, Op1, Cond);
17321 return DAG.getZExtOrTrunc(NewOp, dl, VT);
17324 // The non-AVX512 code below works under the assumption that source and
17325 // destination types are the same.
17326 assert((Subtarget.hasAVX512() || (VT == VTOp0)) &&
17327 "Value types for source and destination must be the same!");
17329 // Break 256-bit integer vector compare into smaller ones.
17330 if (VT.is256BitVector() && !Subtarget.hasInt256())
17331 return Lower256IntVSETCC(Op, DAG);
17333 // Operands are boolean (vectors of i1)
17334 MVT OpVT = Op1.getSimpleValueType();
17335 if (OpVT.getVectorElementType() == MVT::i1)
17336 return LowerBoolVSETCC_AVX512(Op, DAG);
17338 // The result is boolean, but operands are int/float
17339 if (VT.getVectorElementType() == MVT::i1) {
17340 // In AVX-512 architecture setcc returns mask with i1 elements,
17341 // But there is no compare instruction for i8 and i16 elements in KNL.
17342 // In this case use SSE compare
17343 bool UseAVX512Inst =
17344 (OpVT.is512BitVector() ||
17345 OpVT.getScalarSizeInBits() >= 32 ||
17346 (Subtarget.hasBWI() && Subtarget.hasVLX()));
17348 if (UseAVX512Inst)
17349 return LowerIntVSETCC_AVX512(Op, DAG);
17351 return DAG.getNode(ISD::TRUNCATE, dl, VT,
17352 DAG.getNode(ISD::SETCC, dl, OpVT, Op0, Op1, CC));
17353 }
17355 // Lower using XOP integer comparisons.
17356 if ((VT == MVT::v16i8 || VT == MVT::v8i16 ||
17357 VT == MVT::v4i32 || VT == MVT::v2i64) && Subtarget.hasXOP()) {
17358 // Translate compare code to XOP PCOM compare mode.
17359 unsigned CmpMode = 0;
17360 switch (Cond) {
17361 default: llvm_unreachable("Unexpected SETCC condition");
17363 case ISD::SETLT: CmpMode = 0x00; break;
17365 case ISD::SETLE: CmpMode = 0x01; break;
17367 case ISD::SETGT: CmpMode = 0x02; break;
17369 case ISD::SETGE: CmpMode = 0x03; break;
17370 case ISD::SETEQ: CmpMode = 0x04; break;
17371 case ISD::SETNE: CmpMode = 0x05; break;
17372 }
17374 // Are we comparing unsigned or signed integers?
17375 unsigned Opc =
17376 ISD::isUnsignedIntSetCC(Cond) ? X86ISD::VPCOMU : X86ISD::VPCOM;
17378 return DAG.getNode(Opc, dl, VT, Op0, Op1,
17379 DAG.getConstant(CmpMode, dl, MVT::i8));
17380 }
17382 // We are handling one of the integer comparisons here. Since SSE only has
17383 // GT and EQ comparisons for integer, swapping operands and multiple
17384 // operations may be required for some comparisons.
17385 unsigned Opc = (Cond == ISD::SETEQ || Cond == ISD::SETNE) ? X86ISD::PCMPEQ
17386 : X86ISD::PCMPGT;
17387 bool Swap = Cond == ISD::SETLT || Cond == ISD::SETULT ||
17388 Cond == ISD::SETGE || Cond == ISD::SETUGE;
17389 bool Invert = Cond == ISD::SETNE ||
17390 (Cond != ISD::SETEQ && ISD::isTrueWhenEqual(Cond));
17392 // If both operands are known non-negative, then an unsigned compare is the
17393 // same as a signed compare and there's no need to flip signbits.
17394 // TODO: We could check for more general simplifications here since we're
17395 // computing known bits.
17396 bool FlipSigns = ISD::isUnsignedIntSetCC(Cond) &&
17397 !(DAG.SignBitIsZero(Op0) && DAG.SignBitIsZero(Op1));
17399 // Special case: Use min/max operations for SETULE/SETUGE
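// a u<= b is equivalent to umin(a, b) == a and a u>= b to umax(a, b) == a,
// so the compare becomes a min/max followed by a PCMPEQ against Op0.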
17400 MVT VET = VT.getVectorElementType();
17401 bool HasMinMax =
17402 (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
17403 (Subtarget.hasSSE2() && (VET == MVT::i8));
17404 bool MinMax = false;
17405 if (HasMinMax) {
17406 switch (Cond) {
17407 default: break;
17408 case ISD::SETULE: Opc = ISD::UMIN; MinMax = true; break;
17409 case ISD::SETUGE: Opc = ISD::UMAX; MinMax = true; break;
17410 }
17412 if (MinMax)
17413 Swap = Invert = FlipSigns = false;
17414 }
17416 bool HasSubus = Subtarget.hasSSE2() && (VET == MVT::i8 || VET == MVT::i16);
17417 bool Subus = false;
17418 if (!MinMax && HasSubus) {
17419 // As another special case, use PSUBUS[BW] when it's profitable. E.g. for
17421 // t = psubus Op0, Op1
17422 // pcmpeq t, <0..0>
17423 switch (Cond) {
17424 default: break;
17425 case ISD::SETULT: {
17426 // If the comparison is against a constant we can turn this into a
17427 // setule. With psubus, setule does not require a swap. This is
17428 // beneficial because the constant in the register is no longer
17429 // destructed as the destination so it can be hoisted out of a loop.
17430 // Only do this pre-AVX since vpcmp* is no longer destructive.
17431 if (Subtarget.hasAVX())
17432 break;
17433 if (SDValue ULEOp1 = ChangeVSETULTtoVSETULE(dl, Op1, DAG)) {
17434 Op1 = ULEOp1;
17435 Subus = true; Invert = false; Swap = false;
17436 }
17437 break;
17438 }
17439 // Psubus is better than flip-sign because it requires no inversion.
17440 case ISD::SETUGE: Subus = true; Invert = false; Swap = true; break;
17441 case ISD::SETULE: Subus = true; Invert = false; Swap = false; break;
17442 }
17444 if (Subus) {
17445 Opc = X86ISD::SUBUS;
17446 FlipSigns = false;
17447 Invert = false;
17448 }
17449 }
17450 if (Swap)
17451 std::swap(Op0, Op1);
17453 // Check that the operation in question is available (most are plain SSE2,
17454 // but PCMPGTQ and PCMPEQQ have different requirements).
17455 if (VT == MVT::v2i64) {
17456 if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
17457 assert(Subtarget.hasSSE2() && "Don't know how to lower!");
17459 // First cast everything to the right type.
17460 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17461 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17463 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17464 // bits of the inputs before performing those operations. The lower
17465 // compare is always unsigned.
17466 SDValue SB;
17467 if (FlipSigns) {
17468 SB = DAG.getConstant(0x80000000U, dl, MVT::v4i32);
17469 } else {
17470 SDValue Sign = DAG.getConstant(0x80000000U, dl, MVT::i32);
17471 SDValue Zero = DAG.getConstant(0x00000000U, dl, MVT::i32);
17472 SB = DAG.getBuildVector(MVT::v4i32, dl, {Sign, Zero, Sign, Zero});
17473 }
17474 Op0 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op0, SB);
17475 Op1 = DAG.getNode(ISD::XOR, dl, MVT::v4i32, Op1, SB);
17477 // Emulate PCMPGTQ with (hi1 > hi2) | ((hi1 == hi2) & (lo1 > lo2))
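// hi/lo refer to the high and low 32-bit halves of each 64-bit lane: the
// (sign-adjusted) high halves decide the ordering, and the low halves break
// the tie when the high halves are equal.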
17478 SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
17479 SDValue EQ = DAG.getNode(X86ISD::PCMPEQ, dl, MVT::v4i32, Op0, Op1);
17481 // Create masks for only the low parts/high parts of the 64 bit integers.
17482 static const int MaskHi[] = { 1, 1, 3, 3 };
17483 static const int MaskLo[] = { 0, 0, 2, 2 };
17484 SDValue EQHi = DAG.getVectorShuffle(MVT::v4i32, dl, EQ, EQ, MaskHi);
17485 SDValue GTLo = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskLo);
17486 SDValue GTHi = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
17488 SDValue Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, EQHi, GTLo);
17489 Result = DAG.getNode(ISD::OR, dl, MVT::v4i32, Result, GTHi);
17491 if (Invert)
17492 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17494 return DAG.getBitcast(VT, Result);
17495 }
17497 if (Opc == X86ISD::PCMPEQ && !Subtarget.hasSSE41()) {
17498 // If pcmpeqq is missing but pcmpeqd is available synthesize pcmpeqq with
17499 // pcmpeqd + pshufd + pand.
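// A 64-bit lane is equal only if both of its 32-bit halves compared equal,
// so AND the dword result with a copy of itself whose halves are swapped.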
17500 assert(Subtarget.hasSSE2() && !FlipSigns && "Don't know how to lower!");
17502 // First cast everything to the right type.
17503 Op0 = DAG.getBitcast(MVT::v4i32, Op0);
17504 Op1 = DAG.getBitcast(MVT::v4i32, Op1);
17507 SDValue Result = DAG.getNode(Opc, dl, MVT::v4i32, Op0, Op1);
17509 // Make sure the lower and upper halves are both all-ones.
17510 static const int Mask[] = { 1, 0, 3, 2 };
17511 SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Result, Result, Mask);
17512 Result = DAG.getNode(ISD::AND, dl, MVT::v4i32, Result, Shuf);
17514 if (Invert)
17515 Result = DAG.getNOT(dl, Result, MVT::v4i32);
17517 return DAG.getBitcast(VT, Result);
17518 }
17519 }
17521 // Since SSE has no unsigned integer comparisons, we need to flip the sign
17522 // bits of the inputs before performing those operations.
17523 if (FlipSigns) {
17524 MVT EltVT = VT.getVectorElementType();
17525 SDValue SM = DAG.getConstant(APInt::getSignMask(EltVT.getSizeInBits()), dl,
17526 VT);
17527 Op0 = DAG.getNode(ISD::XOR, dl, VT, Op0, SM);
17528 Op1 = DAG.getNode(ISD::XOR, dl, VT, Op1, SM);
17529 }
17531 SDValue Result = DAG.getNode(Opc, dl, VT, Op0, Op1);
17533 // If the logical-not of the result is required, perform that now.
17534 if (Invert)
17535 Result = DAG.getNOT(dl, Result, VT);
17537 if (MinMax)
17538 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Op0, Result);
17540 if (Subus)
17541 Result = DAG.getNode(X86ISD::PCMPEQ, dl, VT, Result,
17542 getZeroVector(VT, Subtarget, DAG, dl));
17544 return Result;
17545 }
17547 SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
17549 MVT VT = Op.getSimpleValueType();
17551 if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
17553 assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
17554 SDValue Op0 = Op.getOperand(0);
17555 SDValue Op1 = Op.getOperand(1);
17556 SDLoc dl(Op);
17557 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
17559 // Optimize to BT if possible.
17560 // Lower (X & (1 << N)) == 0 to BT(X, N).
17561 // Lower ((X >>u N) & 1) != 0 to BT(X, N).
17562 // Lower ((X >>s N) & 1) != 0 to BT(X, N).
17563 // Lower (trunc (X >> N) to i1) to BT(X, N).
17564 if (Op0.hasOneUse() && isNullConstant(Op1) &&
17565 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17566 if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
17567 if (VT == MVT::i1)
17568 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
17569 return NewSetCC;
17570 }
17571 }
17573 // Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
17575 if ((isOneConstant(Op1) || isNullConstant(Op1)) &&
17576 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17578 // If the input is a setcc, then reuse the input setcc or use a new one with
17579 // the inverted condition.
17580 if (Op0.getOpcode() == X86ISD::SETCC) {
17581 X86::CondCode CCode = (X86::CondCode)Op0.getConstantOperandVal(0);
17582 bool Invert = (CC == ISD::SETNE) ^ isNullConstant(Op1);
17583 if (!Invert)
17584 return Op0;
17586 CCode = X86::GetOppositeBranchCondition(CCode);
17587 SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
17588 if (VT == MVT::i1)
17589 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17590 return SetCC;
17591 }
17592 }
17593 if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
17594 if (isOneConstant(Op1)) {
17595 ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
17596 return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
17597 }
17598 if (!isNullConstant(Op1)) {
17599 SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
17600 return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
17601 }
17602 }
17604 bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
17605 X86::CondCode X86CC = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
17606 if (X86CC == X86::COND_INVALID)
17607 return SDValue();
17609 SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
17610 EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
17611 SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
17612 if (VT == MVT::i1)
17613 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
17614 return SetCC;
17615 }
17617 SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
17618 SDValue LHS = Op.getOperand(0);
17619 SDValue RHS = Op.getOperand(1);
17620 SDValue Carry = Op.getOperand(2);
17621 SDValue Cond = Op.getOperand(3);
17623 SDLoc DL(Op);
17624 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
17625 X86::CondCode CC = TranslateIntegerX86CC(cast<CondCodeSDNode>(Cond)->get());
17627 // Recreate the carry if needed.
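// Adding all-ones to the materialized carry value produces a carry-out
// (CF = 1) exactly when that value is non-zero, so the SBB below consumes
// the original carry from EFLAGS.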
17628 EVT CarryVT = Carry.getValueType();
17629 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
17630 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
17631 Carry, DAG.getConstant(NegOne, DL, CarryVT));
17633 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17634 SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
17635 SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
17636 if (Op.getSimpleValueType() == MVT::i1)
17637 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
17638 return SetCC;
17639 }
17641 /// Return true if opcode is a X86 logical comparison.
17642 static bool isX86LogicalCmp(SDValue Op) {
17643 unsigned Opc = Op.getOpcode();
17644 if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
17645 Opc == X86ISD::SAHF)
17646 return true;
17647 if (Op.getResNo() == 1 &&
17648 (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
17649 Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
17650 Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
17651 Opc == X86ISD::XOR || Opc == X86ISD::AND))
17652 return true;
17654 if (Op.getResNo() == 2 && Opc == X86ISD::UMUL)
17655 return true;
17657 return false;
17658 }
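/// Return true if \p V is a truncate whose discarded high bits are known to
/// be zero, so looking through it does not change a compare against zero.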
17660 static bool isTruncWithZeroHighBitsInput(SDValue V, SelectionDAG &DAG) {
17661 if (V.getOpcode() != ISD::TRUNCATE)
17662 return false;
17664 SDValue VOp0 = V.getOperand(0);
17665 unsigned InBits = VOp0.getValueSizeInBits();
17666 unsigned Bits = V.getValueSizeInBits();
17667 return DAG.MaskedValueIsZero(VOp0, APInt::getHighBitsSet(InBits,InBits-Bits));
17670 SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
17671 bool AddTest = true;
17672 SDValue Cond = Op.getOperand(0);
17673 SDValue Op1 = Op.getOperand(1);
17674 SDValue Op2 = Op.getOperand(2);
17675 SDLoc DL(Op);
17676 MVT VT = Op1.getSimpleValueType();
17677 SDValue CC;
17679 // Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
17680 // are available or VBLENDV if AVX is available.
17681 // Otherwise FP cmovs get lowered into a less efficient branch sequence later.
17682 if (Cond.getOpcode() == ISD::SETCC &&
17683 ((Subtarget.hasSSE2() && (VT == MVT::f32 || VT == MVT::f64)) ||
17684 (Subtarget.hasSSE1() && VT == MVT::f32)) &&
17685 VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
17686 SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
17687 int SSECC = translateX86FSETCC(
17688 cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
17691 if (Subtarget.hasAVX512()) {
17692 SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
17693 CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
17694 return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
17695 DL, VT, Cmp, Op1, Op2);
17698 SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
17699 DAG.getConstant(SSECC, DL, MVT::i8));
17701 // If we have AVX, we can use a variable vector select (VBLENDV) instead
17702 // of 3 logic instructions for size savings and potentially speed.
17703 // Unfortunately, there is no scalar form of VBLENDV.
17705 // If either operand is a constant, don't try this. We can expect to
17706 // optimize away at least one of the logic instructions later in that
17707 // case, so that sequence would be faster than a variable blend.
17709 // BLENDV was introduced with SSE 4.1, but the 2 register form implicitly
17710 // uses XMM0 as the selection register. That may need just as many
17711 // instructions as the AND/ANDN/OR sequence due to register moves, so
17714 if (Subtarget.hasAVX() &&
17715 !isa<ConstantFPSDNode>(Op1) && !isa<ConstantFPSDNode>(Op2)) {
17717 // Convert to vectors, do a VSELECT, and convert back to scalar.
17718 // All of the conversions should be optimized away.
17720 MVT VecVT = VT == MVT::f32 ? MVT::v4f32 : MVT::v2f64;
17721 SDValue VOp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op1);
17722 SDValue VOp2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Op2);
17723 SDValue VCmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Cmp);
17725 MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
17726 VCmp = DAG.getBitcast(VCmpVT, VCmp);
17728 SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
17730 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
17731 VSel, DAG.getIntPtrConstant(0, DL));
17732 }
17733 SDValue AndN = DAG.getNode(X86ISD::FANDN, DL, VT, Cmp, Op2);
17734 SDValue And = DAG.getNode(X86ISD::FAND, DL, VT, Cmp, Op1);
17735 return DAG.getNode(X86ISD::FOR, DL, VT, AndN, And);
17739 // AVX512 fallback is to lower selects of scalar floats to masked moves.
17740 if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
17741 SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
17742 return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
17745 if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
17746 SDValue Op1Scalar;
17747 if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
17748 Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
17749 else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
17750 Op1Scalar = Op1.getOperand(0);
17751 SDValue Op2Scalar;
17752 if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
17753 Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
17754 else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
17755 Op2Scalar = Op2.getOperand(0);
17756 if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
17757 SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
17758 Op1Scalar, Op2Scalar);
17759 if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
17760 return DAG.getBitcast(VT, newSelect);
17761 SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
17762 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
17763 DAG.getIntPtrConstant(0, DL));
17767 if (VT == MVT::v4i1 || VT == MVT::v2i1) {
17768 SDValue zeroConst = DAG.getIntPtrConstant(0, DL);
17769 Op1 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17770 DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
17771 Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
17772 DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
17773 SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
17774 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
17777 if (Cond.getOpcode() == ISD::SETCC) {
17778 if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
17779 Cond = NewCond;
17780 // If the condition was updated, it's possible that the operands of the
17781 // select were also updated (for example, EmitTest has a RAUW). Refresh
17782 // the local references to the select operands in case they got stale.
17783 Op1 = Op.getOperand(1);
17784 Op2 = Op.getOperand(2);
17788 // (select (x == 0), -1, y) -> (sign_bit (x - 1)) | y
17789 // (select (x == 0), y, -1) -> ~(sign_bit (x - 1)) | y
17790 // (select (x != 0), y, -1) -> (sign_bit (x - 1)) | y
17791 // (select (x != 0), -1, y) -> ~(sign_bit (x - 1)) | y
17792 // (select (and (x , 0x1) == 0), y, (z ^ y) ) -> (-(and (x , 0x1)) & z ) ^ y
17793 // (select (and (x , 0x1) == 0), y, (z | y) ) -> (-(and (x , 0x1)) & z ) | y
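// The (-(and (x, 0x1))) term is an all-zeros or all-ones mask built from the
// low bit of x, so the select collapses into pure bitwise arithmetic.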
17794 if (Cond.getOpcode() == X86ISD::SETCC &&
17795 Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
17796 isNullConstant(Cond.getOperand(1).getOperand(1))) {
17797 SDValue Cmp = Cond.getOperand(1);
17798 unsigned CondCode =
17799 cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
17801 if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17802 (CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
17803 SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
17804 SDValue CmpOp0 = Cmp.getOperand(0);
17806 // Apply further optimizations for special cases
17807 // (select (x != 0), -1, 0) -> neg & sbb
17808 // (select (x == 0), 0, -1) -> neg & sbb
17809 if (isNullConstant(Y) &&
17810 (isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
17811 SDVTList VTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
17812 SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
17813 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, VTs, Zero, CmpOp0);
17814 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17815 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17816 SDValue(Neg.getNode(), 1));
17817 return Res;
17818 }
17820 Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
17821 CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
17822 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
17824 SDValue Res = // Res = 0 or -1.
17825 DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17826 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp);
17828 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
17829 Res = DAG.getNOT(DL, Res, Res.getValueType());
17831 if (!isNullConstant(Op2))
17832 Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
17833 return Res;
17834 } else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
17835 Cmp.getOperand(0).getOpcode() == ISD::AND &&
17836 isOneConstant(Cmp.getOperand(0).getOperand(1))) {
17837 SDValue CmpOp0 = Cmp.getOperand(0);
17838 SDValue Src1, Src2;
17839 // true if Op2 is XOR or OR operator and one of its operands
17841 // ( a , a op b) || ( b , a op b)
17842 auto isOrXorPattern = [&]() {
17843 if ((Op2.getOpcode() == ISD::XOR || Op2.getOpcode() == ISD::OR) &&
17844 (Op2.getOperand(0) == Op1 || Op2.getOperand(1) == Op1)) {
17845 Src1 =
17846 Op2.getOperand(0) == Op1 ? Op2.getOperand(1) : Op2.getOperand(0);
17847 Src2 = Op1;
17848 return true;
17849 }
17850 return false;
17851 };
17853 if (isOrXorPattern()) {
17854 SDValue Neg;
17855 unsigned int CmpSz = CmpOp0.getSimpleValueType().getSizeInBits();
17856 // we need mask of all zeros or ones with same size of the other
17857 // operands.
17858 if (CmpSz > VT.getSizeInBits())
17859 Neg = DAG.getNode(ISD::TRUNCATE, DL, VT, CmpOp0);
17860 else if (CmpSz < VT.getSizeInBits())
17861 Neg = DAG.getNode(ISD::AND, DL, VT,
17862 DAG.getNode(ISD::ANY_EXTEND, DL, VT, CmpOp0.getOperand(0)),
17863 DAG.getConstant(1, DL, VT));
17864 else
17865 Neg = CmpOp0;
17866 SDValue Mask = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
17867 Neg); // -(and (x, 0x1))
17868 SDValue And = DAG.getNode(ISD::AND, DL, VT, Mask, Src1); // Mask & z
17869 return DAG.getNode(Op2.getOpcode(), DL, VT, And, Src2); // And Op y
17874 // Look past (and (setcc_carry (cmp ...)), 1).
17875 if (Cond.getOpcode() == ISD::AND &&
17876 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
17877 isOneConstant(Cond.getOperand(1)))
17878 Cond = Cond.getOperand(0);
17880 // If condition flag is set by a X86ISD::CMP, then use it as the condition
17881 // setting operand in place of the X86ISD::SETCC.
17882 unsigned CondOpcode = Cond.getOpcode();
17883 if (CondOpcode == X86ISD::SETCC ||
17884 CondOpcode == X86ISD::SETCC_CARRY) {
17885 CC = Cond.getOperand(0);
17887 SDValue Cmp = Cond.getOperand(1);
17888 unsigned Opc = Cmp.getOpcode();
17889 MVT VT = Op.getSimpleValueType();
17891 bool IllegalFPCMov = false;
17892 if (VT.isFloatingPoint() && !VT.isVector() &&
17893 !isScalarFPTypeInSSEReg(VT)) // FPStack?
17894 IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
17896 if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
17897 Opc == X86ISD::BT) { // FIXME
17898 Cond = Cmp;
17899 AddTest = false;
17900 }
17901 } else if (CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
17902 CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
17903 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
17904 Cond.getOperand(0).getValueType() != MVT::i8)) {
17905 SDValue LHS = Cond.getOperand(0);
17906 SDValue RHS = Cond.getOperand(1);
17907 unsigned X86Opcode;
17908 X86::CondCode X86Cond;
17909 SDVTList VTs;
17910 switch (CondOpcode) {
17911 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
17912 case ISD::SADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
17913 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
17914 case ISD::SSUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
17915 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
17916 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
17917 default: llvm_unreachable("unexpected overflowing operator");
17919 if (CondOpcode == ISD::UMULO)
17920 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
17921 MVT::i32);
17922 else
17923 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
17925 SDValue X86Op = DAG.getNode(X86Opcode, DL, VTs, LHS, RHS);
17927 if (CondOpcode == ISD::UMULO)
17928 Cond = X86Op.getValue(2);
17930 Cond = X86Op.getValue(1);
17932 CC = DAG.getConstant(X86Cond, DL, MVT::i8);
17933 AddTest = false;
17934 }
17936 if (AddTest) {
17937 // Look past the truncate if the high bits are known zero.
17938 if (isTruncWithZeroHighBitsInput(Cond, DAG))
17939 Cond = Cond.getOperand(0);
17941 // We know the result of AND is compared against zero. Try to match
17943 if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
17944 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
17945 CC = NewSetCC.getOperand(0);
17946 Cond = NewSetCC.getOperand(1);
17947 AddTest = false;
17948 }
17949 }
17950 }
17952 if (AddTest) {
17953 CC = DAG.getConstant(X86::COND_NE, DL, MVT::i8);
17954 Cond = EmitTest(Cond, X86::COND_NE, DL, DAG);
17955 }
17957 // a < b ? -1 : 0 -> RES = ~setcc_carry
17958 // a < b ? 0 : -1 -> RES = setcc_carry
17959 // a >= b ? -1 : 0 -> RES = setcc_carry
17960 // a >= b ? 0 : -1 -> RES = ~setcc_carry
17961 if (Cond.getOpcode() == X86ISD::SUB) {
17962 Cond = ConvertCmpIfNecessary(Cond, DAG);
17963 unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
17965 if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
17966 (isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
17967 (isNullConstant(Op1) || isNullConstant(Op2))) {
17968 SDValue Res = DAG.getNode(X86ISD::SETCC_CARRY, DL, Op.getValueType(),
17969 DAG.getConstant(X86::COND_B, DL, MVT::i8),
17970 Cond);
17971 if (isAllOnesConstant(Op1) != (CondCode == X86::COND_B))
17972 return DAG.getNOT(DL, Res, Res.getValueType());
17973 return Res;
17974 }
17975 }
17977 // X86 doesn't have an i8 cmov. If both operands are the result of a truncate
17978 // widen the cmov and push the truncate through. This avoids introducing a new
17979 // branch during isel and doesn't add any extensions.
17980 if (Op.getValueType() == MVT::i8 &&
17981 Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
17982 SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
17983 if (T1.getValueType() == T2.getValueType() &&
17984 // Blacklist CopyFromReg to avoid partial register stalls.
17985 T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
17986 SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
17987 SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
17988 return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
17992 // X86ISD::CMOV means set the result (which is operand 1) to the RHS if
17993 // condition is true.
17994 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
17995 SDValue Ops[] = { Op2, Op1, CC, Cond };
17996 return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
17999 static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
18000 const X86Subtarget &Subtarget,
18001 SelectionDAG &DAG) {
18002 MVT VT = Op->getSimpleValueType(0);
18003 SDValue In = Op->getOperand(0);
18004 MVT InVT = In.getSimpleValueType();
18005 MVT VTElt = VT.getVectorElementType();
18006 MVT InVTElt = InVT.getVectorElementType();
18007 SDLoc dl(Op);
18010 if ((InVTElt == MVT::i1) &&
18011 (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
18013 ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
18015 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18017 unsigned NumElts = VT.getVectorNumElements();
18019 if (VT.is512BitVector() && InVTElt != MVT::i1 &&
18020 (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
18021 if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
18022 return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
18023 return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
18026 if (InVTElt != MVT::i1)
18027 return SDValue();
18029 MVT ExtVT = VT;
18030 if (!VT.is512BitVector() && !Subtarget.hasVLX())
18031 ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
18033 SDValue V;
18034 if (Subtarget.hasDQI()) {
18035 V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
18036 assert(!VT.is512BitVector() && "Unexpected vector type");
18037 } else {
18038 SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
18039 SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
18040 V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
18041 if (ExtVT == VT)
18042 return V;
18043 }
18045 return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
18046 }
18048 // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
18049 // For sign extend this needs to handle all vector sizes and SSE4.1 and
18050 // non-SSE4.1 targets. For zero extend this should only handle inputs of
18051 // MVT::v64i8 when BWI is not supported, but AVX512 is.
18052 static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
18053 const X86Subtarget &Subtarget,
18054 SelectionDAG &DAG) {
18055 SDValue In = Op->getOperand(0);
18056 MVT VT = Op->getSimpleValueType(0);
18057 MVT InVT = In.getSimpleValueType();
18058 assert(VT.getSizeInBits() == InVT.getSizeInBits());
18060 MVT SVT = VT.getVectorElementType();
18061 MVT InSVT = InVT.getVectorElementType();
18062 assert(SVT.getSizeInBits() > InSVT.getSizeInBits());
18064 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
18065 return SDValue();
18066 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
18067 return SDValue();
18068 if (!(VT.is128BitVector() && Subtarget.hasSSE2()) &&
18069 !(VT.is256BitVector() && Subtarget.hasInt256()) &&
18070 !(VT.is512BitVector() && Subtarget.hasAVX512()))
18071 return SDValue();
18073 SDLoc dl(Op);
18075 // For 256-bit vectors, we only need the lower (128-bit) half of the input.
18076 // For 512-bit vectors, we need 128-bits or 256-bits.
18077 if (VT.getSizeInBits() > 128) {
18078 // Input needs to be at least the same number of elements as output, and
18079 // at least 128-bits.
18080 int InSize = InSVT.getSizeInBits() * VT.getVectorNumElements();
18081 In = extractSubVector(In, 0, DAG, dl, std::max(InSize, 128));
18084 assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
18085 InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
18087 // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
18088 // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
18089 // need to be handled here for 256/512-bit results.
18090 if (Subtarget.hasInt256()) {
18091 assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
18092 unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
18093 X86ISD::VSEXT : X86ISD::VZEXT;
18094 return DAG.getNode(ExtOpc, dl, VT, In);
18097 // We should only get here for sign extend.
18098 assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
18099 "Unexpected opcode!");
18101 // pre-SSE41 targets unpack lower lanes and then sign-extend using SRAI.
18102 SDValue Curr = In;
18103 MVT CurrVT = InVT;
18105 // As SRAI is only available on i16/i32 types, we expand only up to i32
18106 // and handle i64 separately.
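// Illustrative sketch of one step, e.g. sign-extending the low words of a
// v8i16 to v4i32 on SSE2: unpcklwd with an undef operand copies each word
// into the high half of a dword, and a psrad by 16 then replicates the sign
// bit across the low half.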
18107 while (CurrVT != VT && CurrVT.getVectorElementType() != MVT::i32) {
18108 Curr = DAG.getNode(X86ISD::UNPCKL, dl, CurrVT, DAG.getUNDEF(CurrVT), Curr);
18109 MVT CurrSVT = MVT::getIntegerVT(CurrVT.getScalarSizeInBits() * 2);
18110 CurrVT = MVT::getVectorVT(CurrSVT, CurrVT.getVectorNumElements() / 2);
18111 Curr = DAG.getBitcast(CurrVT, Curr);
18114 SDValue SignExt = Curr;
18115 if (CurrVT != InVT) {
18116 unsigned SignExtShift =
18117 CurrVT.getScalarSizeInBits() - InSVT.getSizeInBits();
18118 SignExt = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18119 DAG.getConstant(SignExtShift, dl, MVT::i8));
18120 }
18125 if (VT == MVT::v2i64 && CurrVT == MVT::v4i32) {
18126 SDValue Sign = DAG.getNode(X86ISD::VSRAI, dl, CurrVT, Curr,
18127 DAG.getConstant(31, dl, MVT::i8));
18128 SDValue Ext = DAG.getVectorShuffle(CurrVT, dl, SignExt, Sign, {0, 4, 1, 5});
18129 return DAG.getBitcast(VT, Ext);
18130 }
18132 return DAG.getBitcast(VT, SignExt);
18133 }
18135 static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
18136 SelectionDAG &DAG) {
18137 MVT VT = Op->getSimpleValueType(0);
18138 SDValue In = Op->getOperand(0);
18139 MVT InVT = In.getSimpleValueType();
18141 SDLoc dl(Op);
18142 if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
18143 return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
18145 if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
18146 (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
18147 (VT != MVT::v16i16 || InVT != MVT::v16i8))
18148 return SDValue();
18150 if (Subtarget.hasInt256())
18151 return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
18153 // Optimize vectors in AVX mode
18154 // Sign extend v8i16 to v8i32 and
18155 // v4i32 to v4i64
18157 // Divide input vector into two parts
18158 // for v4i32 the shuffle mask will be { 0, 1, -1, -1} {2, 3, -1, -1}
18159 // use vpmovsx instruction to extend v4i32 -> v2i64; v8i16 -> v4i32
18160 // concat the vectors to original VT
18162 unsigned NumElems = InVT.getVectorNumElements();
18163 SDValue Undef = DAG.getUNDEF(InVT);
18165 SmallVector<int,8> ShufMask1(NumElems, -1);
18166 for (unsigned i = 0; i != NumElems/2; ++i)
18167 ShufMask1[i] = i;
18169 SDValue OpLo = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask1);
18171 SmallVector<int,8> ShufMask2(NumElems, -1);
18172 for (unsigned i = 0; i != NumElems/2; ++i)
18173 ShufMask2[i] = i + NumElems/2;
18175 SDValue OpHi = DAG.getVectorShuffle(InVT, dl, In, Undef, ShufMask2);
18177 MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
18178 VT.getVectorNumElements() / 2);
18180 OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
18181 OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
18183 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
18186 // Lower truncating store. We need a special lowering to vXi1 vectors
18187 static SDValue LowerTruncatingStore(SDValue StOp, const X86Subtarget &Subtarget,
18188 SelectionDAG &DAG) {
18189 StoreSDNode *St = cast<StoreSDNode>(StOp.getNode());
18190 SDLoc dl(St);
18191 EVT MemVT = St->getMemoryVT();
18192 assert(St->isTruncatingStore() && "We only custom truncating store.");
18193 assert(MemVT.isVector() && MemVT.getVectorElementType() == MVT::i1 &&
18194 "Expected truncstore of i1 vector");
18196 SDValue Op = St->getValue();
18197 MVT OpVT = Op.getValueType().getSimpleVT();
18198 unsigned NumElts = OpVT.getVectorNumElements();
18199 if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
18201 // Truncate and store - everything is legal
18202 Op = DAG.getNode(ISD::TRUNCATE, dl, MemVT, Op);
18203 if (MemVT.getSizeInBits() < 8)
18204 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8i1,
18205 DAG.getUNDEF(MVT::v8i1), Op,
18206 DAG.getIntPtrConstant(0, dl));
18207 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18208 St->getMemOperand());
18211 // A subset, assume that we have only AVX-512F
18212 if (NumElts <= 8) {
18214 // Extend to 8-elts vector
18215 MVT ExtVT = MVT::getVectorVT(OpVT.getScalarType(), 8);
18216 Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ExtVT,
18217 DAG.getUNDEF(ExtVT), Op, DAG.getIntPtrConstant(0, dl));
18219 Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i1, Op);
18220 return DAG.getStore(St->getChain(), dl, Op, St->getBasePtr(),
18221 St->getMemOperand());
18224 assert(OpVT == MVT::v32i8 && "Unexpected operand type");
18225 // Divide the vector into 2 parts and store each part separately
18226 SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18227 DAG.getIntPtrConstant(0, dl));
18228 Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Lo);
18229 SDValue BasePtr = St->getBasePtr();
18230 SDValue StLo = DAG.getStore(St->getChain(), dl, Lo, BasePtr,
18231 St->getMemOperand());
18232 SDValue Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, Op,
18233 DAG.getIntPtrConstant(16, dl));
18234 Hi = DAG.getNode(ISD::TRUNCATE, dl, MVT::v16i1, Hi);
18236 SDValue BasePtrHi =
18237 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18238 DAG.getConstant(2, dl, BasePtr.getValueType()));
18240 SDValue StHi = DAG.getStore(St->getChain(), dl, Hi,
18241 BasePtrHi, St->getMemOperand());
18242 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, StLo, StHi);
18245 static SDValue LowerExtended1BitVectorLoad(SDValue Op,
18246 const X86Subtarget &Subtarget,
18247 SelectionDAG &DAG) {
18248 SDLoc dl(Op);
18249 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18251 EVT MemVT = Ld->getMemoryVT();
18252 assert(MemVT.isVector() && MemVT.getScalarType() == MVT::i1 &&
18253 "Expected i1 vector load");
18254 unsigned ExtOpcode = Ld->getExtensionType() == ISD::ZEXTLOAD ?
18255 ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
18256 MVT VT = Op.getValueType().getSimpleVT();
18257 unsigned NumElts = VT.getVectorNumElements();
18259 if ((Subtarget.hasBWI() && NumElts >= 32) ||
18260 (Subtarget.hasDQI() && NumElts < 16) ||
18262 // Load and extend - everything is legal
18264 SDValue Load = DAG.getLoad(MVT::v8i1, dl, Ld->getChain(),
18265 Ld->getBasePtr(),
18266 Ld->getMemOperand());
18267 // Replace chain users with the new chain.
18268 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18269 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18270 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18271 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, Load);
18273 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18274 DAG.getIntPtrConstant(0, dl));
18276 SDValue Load = DAG.getLoad(MemVT, dl, Ld->getChain(),
18277 Ld->getBasePtr(),
18278 Ld->getMemOperand());
18279 // Replace chain users with the new chain.
18280 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18281 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18283 // Finally, do a normal sign-extend to the desired register.
18284 return DAG.getNode(ExtOpcode, dl, Op.getValueType(), Load);
18287 if (NumElts <= 8) {
18288 // A subset, assume that we have only AVX-512F
18289 unsigned NumBitsToLoad = 8;
18290 MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
18291 SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
18292 Ld->getBasePtr(),
18293 Ld->getMemOperand());
18294 // Replace chain users with the new chain.
18295 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18296 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18298 MVT MaskVT = MVT::getVectorVT(MVT::i1, NumBitsToLoad);
18299 SDValue BitVec = DAG.getBitcast(MaskVT, Load);
18301 if (NumElts == 8)
18302 return DAG.getNode(ExtOpcode, dl, VT, BitVec);
18304 // we should take care to v4i1 and v2i1
18306 MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 8);
18307 SDValue ExtVec = DAG.getNode(ExtOpcode, dl, ExtVT, BitVec);
18308 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, ExtVec,
18309 DAG.getIntPtrConstant(0, dl));
18312 assert(VT == MVT::v32i8 && "Unexpected extload type");
18314 SmallVector<SDValue, 2> Chains;
18316 SDValue BasePtr = Ld->getBasePtr();
18317 SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18318 Ld->getBasePtr(),
18319 Ld->getMemOperand());
18320 Chains.push_back(LoadLo.getValue(1));
18322 SDValue BasePtrHi =
18323 DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
18324 DAG.getConstant(2, dl, BasePtr.getValueType()));
18326 SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
18327 BasePtrHi,
18328 Ld->getMemOperand());
18329 Chains.push_back(LoadHi.getValue(1));
18330 SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18331 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
18333 SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
18334 SDValue Hi = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadHi);
18335 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v32i8, Lo, Hi);
18338 // Lower vector extended loads using a shuffle. If SSSE3 is not available we
18339 // may emit an illegal shuffle but the expansion is still better than scalar
18340 // code. We generate X86ISD::VSEXT for SEXTLOADs if it's available, otherwise
18341 // we'll emit a shuffle and a arithmetic shift.
18342 // FIXME: Is the expansion actually better than scalar code? It doesn't seem so.
18343 // TODO: It is possible to support ZExt by zeroing the undef values during
18344 // the shuffle phase or after the shuffle.
18345 static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
18346 SelectionDAG &DAG) {
18347 MVT RegVT = Op.getSimpleValueType();
18348 assert(RegVT.isVector() && "We only custom lower vector sext loads.");
18349 assert(RegVT.isInteger() &&
18350 "We only custom lower integer vector sext loads.");
18352 // Nothing useful we can do without SSE2 shuffles.
18353 assert(Subtarget.hasSSE2() && "We only custom lower sext loads with SSE2.");
18355 LoadSDNode *Ld = cast<LoadSDNode>(Op.getNode());
18356 SDLoc dl(Ld);
18357 EVT MemVT = Ld->getMemoryVT();
18358 if (MemVT.getScalarType() == MVT::i1)
18359 return LowerExtended1BitVectorLoad(Op, Subtarget, DAG);
18361 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18362 unsigned RegSz = RegVT.getSizeInBits();
18364 ISD::LoadExtType Ext = Ld->getExtensionType();
18366 assert((Ext == ISD::EXTLOAD || Ext == ISD::SEXTLOAD)
18367 && "Only anyext and sext are currently implemented.");
18368 assert(MemVT != RegVT && "Cannot extend to the same type");
18369 assert(MemVT.isVector() && "Must load a vector from memory");
18371 unsigned NumElems = RegVT.getVectorNumElements();
18372 unsigned MemSz = MemVT.getSizeInBits();
18373 assert(RegSz > MemSz && "Register size must be greater than the mem size");
18375 if (Ext == ISD::SEXTLOAD && RegSz == 256 && !Subtarget.hasInt256()) {
18376 // The only way in which we have a legal 256-bit vector result but not the
18377 // integer 256-bit operations needed to directly lower a sextload is if we
18378 // have AVX1 but not AVX2. In that case, we can always emit a sextload to
18379 // a 128-bit vector and a normal sign_extend to 256-bits that should get
18380 // correctly legalized. We do this late to allow the canonical form of
18381 // sextload to persist throughout the rest of the DAG combiner -- it wants
18382 // to fold together any extensions it can, and so will fuse a sign_extend
18383 // of an sextload into a sextload targeting a wider value.
18384 SDValue Load;
18385 if (MemSz == 128) {
18386 // Just switch this to a normal load.
18387 assert(TLI.isTypeLegal(MemVT) && "If the memory type is a 128-bit type, "
18388 "it must be a legal 128-bit vector "
18390 Load = DAG.getLoad(MemVT, dl, Ld->getChain(), Ld->getBasePtr(),
18391 Ld->getPointerInfo(), Ld->getAlignment(),
18392 Ld->getMemOperand()->getFlags());
18393 } else {
18394 assert(MemSz < 128 &&
18395 "Can't extend a type wider than 128 bits to a 256 bit vector!");
18396 // Do an sext load to a 128-bit vector type. We want to use the same
18397 // number of elements, but elements half as wide. This will end up being
18398 // recursively lowered by this routine, but will succeed as we definitely
18399 // have all the necessary features if we're using AVX1.
18400 EVT HalfEltVT =
18401 EVT::getIntegerVT(*DAG.getContext(), RegVT.getScalarSizeInBits() / 2);
18402 EVT HalfVecVT = EVT::getVectorVT(*DAG.getContext(), HalfEltVT, NumElems);
18403 Load =
18404 DAG.getExtLoad(Ext, dl, HalfVecVT, Ld->getChain(), Ld->getBasePtr(),
18405 Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
18406 Ld->getMemOperand()->getFlags());
18409 // Replace chain users with the new chain.
18410 assert(Load->getNumValues() == 2 && "Loads must carry a chain!");
18411 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
18413 // Finally, do a normal sign-extend to the desired register.
18414 return DAG.getSExtOrTrunc(Load, dl, RegVT);
18417 // All sizes must be a power of two.
18418 assert(isPowerOf2_32(RegSz * MemSz * NumElems) &&
18419 "Non-power-of-two elements are not custom lowered!");
18421 // Attempt to load the original value using scalar loads.
18422 // Find the largest scalar type that divides the total loaded size.
18423 MVT SclrLoadTy = MVT::i8;
18424 for (MVT Tp : MVT::integer_valuetypes()) {
18425 if (TLI.isTypeLegal(Tp) && ((MemSz % Tp.getSizeInBits()) == 0)) {
18430 // On 32-bit systems, we can't load 64-bit integers. Try bitcasting to f64.
18431 if (TLI.isTypeLegal(MVT::f64) && SclrLoadTy.getSizeInBits() < 64 &&
18433 SclrLoadTy = MVT::f64;
18435 // Calculate the number of scalar loads that we need to perform
18436 // in order to load our vector from memory.
18437 unsigned NumLoads = MemSz / SclrLoadTy.getSizeInBits();
18439 assert((Ext != ISD::SEXTLOAD || NumLoads == 1) &&
18440 "Can only lower sext loads with a single scalar load!");
18442 unsigned loadRegZize = RegSz;
18443 if (Ext == ISD::SEXTLOAD && RegSz >= 256)
18444 loadRegZize = 128;
18446 // Represent our vector as a sequence of elements which are the
18447 // largest scalar that we can load.
18448 EVT LoadUnitVecVT = EVT::getVectorVT(
18449 *DAG.getContext(), SclrLoadTy, loadRegZize / SclrLoadTy.getSizeInBits());
18451 // Represent the data using the same element type that is stored in
18452 // memory. In practice, we "widen" MemVT.
18454 EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
18455 loadRegZize / MemVT.getScalarSizeInBits());
18457 assert(WideVecVT.getSizeInBits() == LoadUnitVecVT.getSizeInBits() &&
18458 "Invalid vector type");
18460 // We can't shuffle using an illegal type.
18461 assert(TLI.isTypeLegal(WideVecVT) &&
18462 "We only lower types that form legal widened vector types");
18464 SmallVector<SDValue, 8> Chains;
18465 SDValue Ptr = Ld->getBasePtr();
18466 SDValue Increment = DAG.getConstant(SclrLoadTy.getSizeInBits() / 8, dl,
18467 TLI.getPointerTy(DAG.getDataLayout()));
18468 SDValue Res = DAG.getUNDEF(LoadUnitVecVT);
18470 for (unsigned i = 0; i < NumLoads; ++i) {
18471 // Perform a single load.
18472 SDValue ScalarLoad =
18473 DAG.getLoad(SclrLoadTy, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
18474 Ld->getAlignment(), Ld->getMemOperand()->getFlags());
18475 Chains.push_back(ScalarLoad.getValue(1));
18476 // Create the first element type using SCALAR_TO_VECTOR in order to avoid
18477 // another round of DAGCombining.
18479 Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoadUnitVecVT, ScalarLoad);
18481 Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, LoadUnitVecVT, Res,
18482 ScalarLoad, DAG.getIntPtrConstant(i, dl));
18484 Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, Increment);
18487 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
18489 // Bitcast the loaded value to a vector of the original element type, in
18490 // the size of the target vector type.
18491 SDValue SlicedVec = DAG.getBitcast(WideVecVT, Res);
18492 unsigned SizeRatio = RegSz / MemSz;
18494 if (Ext == ISD::SEXTLOAD) {
18495 // If we have SSE4.1, we can directly emit a VSEXT node.
18496 if (Subtarget.hasSSE41()) {
18497 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
18498 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18502 // Otherwise we'll use SIGN_EXTEND_VECTOR_INREG to sign extend the lowest lane.
18504 assert(TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND_VECTOR_INREG, RegVT) &&
18505 "We can't implement a sext load without SIGN_EXTEND_VECTOR_INREG!");
18507 SDValue Shuff = DAG.getSignExtendVectorInReg(SlicedVec, dl, RegVT);
18508 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18512 // Redistribute the loaded elements into the different locations.
18513 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
18514 for (unsigned i = 0; i != NumElems; ++i)
18515 ShuffleVec[i * SizeRatio] = i;
18517 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, SlicedVec,
18518 DAG.getUNDEF(WideVecVT), ShuffleVec);
18520 // Bitcast to the requested type.
18521 Shuff = DAG.getBitcast(RegVT, Shuff);
18522 DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
18526 /// Return true if the node is an ISD::AND or ISD::OR of two X86ISD::SETCC nodes,
18527 /// each of which has no other use apart from the AND / OR.
18528 static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
18529 Opc = Op.getOpcode();
18530 if (Opc != ISD::OR && Opc != ISD::AND)
18532 return (Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18533 Op.getOperand(0).hasOneUse() &&
18534 Op.getOperand(1).getOpcode() == X86ISD::SETCC &&
18535 Op.getOperand(1).hasOneUse());
18538 /// Return true if the node is an ISD::XOR of an X86ISD::SETCC and 1, and the
18539 /// SETCC node has a single use.
18540 static bool isXor1OfSetCC(SDValue Op) {
18541 if (Op.getOpcode() != ISD::XOR)
18543 if (isOneConstant(Op.getOperand(1)))
18544 return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
18545 Op.getOperand(0).hasOneUse();
18549 SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
18550 bool addTest = true;
18551 SDValue Chain = Op.getOperand(0);
18552 SDValue Cond = Op.getOperand(1);
18553 SDValue Dest = Op.getOperand(2);
18556 bool Inverted = false;
18558 if (Cond.getOpcode() == ISD::SETCC) {
18559 // Check for setcc([su]{add,sub,mul}o == 0).
18560 if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
18561 isNullConstant(Cond.getOperand(1)) &&
18562 Cond.getOperand(0).getResNo() == 1 &&
18563 (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
18564 Cond.getOperand(0).getOpcode() == ISD::UADDO ||
18565 Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
18566 Cond.getOperand(0).getOpcode() == ISD::USUBO ||
18567 Cond.getOperand(0).getOpcode() == ISD::SMULO ||
18568 Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
18570 Cond = Cond.getOperand(0);
18572 if (SDValue NewCond = LowerSETCC(Cond, DAG))
18577 // FIXME: LowerXALUO doesn't handle these!!
18578 else if (Cond.getOpcode() == X86ISD::ADD ||
18579 Cond.getOpcode() == X86ISD::SUB ||
18580 Cond.getOpcode() == X86ISD::SMUL ||
18581 Cond.getOpcode() == X86ISD::UMUL)
18582 Cond = LowerXALUO(Cond, DAG);
18585 // Look past (and (setcc_carry (cmp ...)), 1).
18586 if (Cond.getOpcode() == ISD::AND &&
18587 Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
18588 isOneConstant(Cond.getOperand(1)))
18589 Cond = Cond.getOperand(0);
18591 // If the condition flag is set by an X86ISD::CMP, then use it as the condition-
18592 // setting operand in place of the X86ISD::SETCC.
18593 unsigned CondOpcode = Cond.getOpcode();
18594 if (CondOpcode == X86ISD::SETCC ||
18595 CondOpcode == X86ISD::SETCC_CARRY) {
18596 CC = Cond.getOperand(0);
18598 SDValue Cmp = Cond.getOperand(1);
18599 unsigned Opc = Cmp.getOpcode();
18600 // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
18601 if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
18605 switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
18609 // These can only come from an arithmetic instruction with overflow,
18610 // e.g. SADDO, UADDO.
18611 Cond = Cond.getOperand(1);
18617 CondOpcode = Cond.getOpcode();
18618 if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
18619 CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
18620 ((CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) &&
18621 Cond.getOperand(0).getValueType() != MVT::i8)) {
18622 SDValue LHS = Cond.getOperand(0);
18623 SDValue RHS = Cond.getOperand(1);
18624 unsigned X86Opcode;
18627 // Keep this in sync with LowerXALUO, otherwise we might create redundant
18628 // instructions that can't be removed afterwards (i.e. X86ISD::ADD and
18630 switch (CondOpcode) {
18631 case ISD::UADDO: X86Opcode = X86ISD::ADD; X86Cond = X86::COND_B; break;
18633 if (isOneConstant(RHS)) {
18634 X86Opcode = X86ISD::INC; X86Cond = X86::COND_O;
18637 X86Opcode = X86ISD::ADD; X86Cond = X86::COND_O; break;
18638 case ISD::USUBO: X86Opcode = X86ISD::SUB; X86Cond = X86::COND_B; break;
18640 if (isOneConstant(RHS)) {
18641 X86Opcode = X86ISD::DEC; X86Cond = X86::COND_O;
18644 X86Opcode = X86ISD::SUB; X86Cond = X86::COND_O; break;
18645 case ISD::UMULO: X86Opcode = X86ISD::UMUL; X86Cond = X86::COND_O; break;
18646 case ISD::SMULO: X86Opcode = X86ISD::SMUL; X86Cond = X86::COND_O; break;
18647 default: llvm_unreachable("unexpected overflowing operator");
18650 X86Cond = X86::GetOppositeBranchCondition((X86::CondCode)X86Cond);
18651 if (CondOpcode == ISD::UMULO)
18652 VTs = DAG.getVTList(LHS.getValueType(), LHS.getValueType(),
18655 VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
18657 SDValue X86Op = DAG.getNode(X86Opcode, dl, VTs, LHS, RHS);
18659 if (CondOpcode == ISD::UMULO)
18660 Cond = X86Op.getValue(2);
18662 Cond = X86Op.getValue(1);
18664 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18668 if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
18669 SDValue Cmp = Cond.getOperand(0).getOperand(1);
18670 if (CondOpc == ISD::OR) {
18671 // Also, recognize the pattern generated by an FCMP_UNE. We can emit
18672 // two branches instead of an explicit OR instruction with a separate test.
18674 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18675 isX86LogicalCmp(Cmp)) {
18676 CC = Cond.getOperand(0).getOperand(0);
18677 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18678 Chain, Dest, CC, Cmp);
18679 CC = Cond.getOperand(1).getOperand(0);
18683 } else { // ISD::AND
18684 // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
18685 // two branches instead of an explicit AND instruction with a
18686 // separate test. However, we only do this if this block doesn't
18687 // have a fall-through edge, because this requires an explicit
18688 // jmp when the condition is false.
18689 if (Cmp == Cond.getOperand(1).getOperand(1) &&
18690 isX86LogicalCmp(Cmp) &&
18691 Op.getNode()->hasOneUse()) {
18692 X86::CondCode CCode =
18693 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18694 CCode = X86::GetOppositeBranchCondition(CCode);
18695 CC = DAG.getConstant(CCode, dl, MVT::i8);
18696 SDNode *User = *Op.getNode()->use_begin();
18697 // Look for an unconditional branch following this conditional branch.
18698 // We need this because we need to reverse the successors in order
18699 // to implement FCMP_OEQ.
18700 if (User->getOpcode() == ISD::BR) {
18701 SDValue FalseBB = User->getOperand(1);
18703 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18704 assert(NewBR == User);
18708 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18709 Chain, Dest, CC, Cmp);
18710 X86::CondCode CCode =
18711 (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
18712 CCode = X86::GetOppositeBranchCondition(CCode);
18713 CC = DAG.getConstant(CCode, dl, MVT::i8);
18719 } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
18720 // Recognize xorb (setcc), 1 patterns. The xor inverts the condition.
18721 // It should be transformed during DAG combining, except when the condition
18722 // is set by an arithmetic-with-overflow node.
18723 X86::CondCode CCode =
18724 (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
18725 CCode = X86::GetOppositeBranchCondition(CCode);
18726 CC = DAG.getConstant(CCode, dl, MVT::i8);
18727 Cond = Cond.getOperand(0).getOperand(1);
18729 } else if (Cond.getOpcode() == ISD::SETCC &&
18730 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
18731 // For FCMP_OEQ, we can emit
18732 // two branches instead of an explicit AND instruction with a
18733 // separate test. However, we only do this if this block doesn't
18734 // have a fall-through edge, because this requires an explicit
18735 // jmp when the condition is false.
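// Concrete sketch (assumed source, not emitted verbatim): for
//   if (a == b) goto T; else goto F;   // a, b floating point
// ordered-equal lowers to "jne F; jp F; jmp T", i.e. the COND_NE branch
// emitted below plus a COND_P branch, instead of materializing two setcc
// values and ANDing them.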
18736 if (Op.getNode()->hasOneUse()) {
18737 SDNode *User = *Op.getNode()->use_begin();
18738 // Look for an unconditional branch following this conditional branch.
18739 // We need this because we need to reverse the successors in order
18740 // to implement FCMP_OEQ.
18741 if (User->getOpcode() == ISD::BR) {
18742 SDValue FalseBB = User->getOperand(1);
18744 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18745 assert(NewBR == User);
18749 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18750 Cond.getOperand(0), Cond.getOperand(1));
18751 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18752 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18753 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18754 Chain, Dest, CC, Cmp);
18755 CC = DAG.getConstant(X86::COND_P, dl, MVT::i8);
18760 } else if (Cond.getOpcode() == ISD::SETCC &&
18761 cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
18762 // For FCMP_UNE, we can emit
18763 // two branches instead of an explicit AND instruction with a
18764 // separate test. However, we only do this if this block doesn't
18765 // have a fall-through edge, because this requires an explicit
18766 // jmp when the condition is false.
18767 if (Op.getNode()->hasOneUse()) {
18768 SDNode *User = *Op.getNode()->use_begin();
18769 // Look for an unconditional branch following this conditional branch.
18770 // We need this because we need to reverse the successors in order
18771 // to implement FCMP_UNE.
18772 if (User->getOpcode() == ISD::BR) {
18773 SDValue FalseBB = User->getOperand(1);
18775 DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
18776 assert(NewBR == User);
18779 SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
18780 Cond.getOperand(0), Cond.getOperand(1));
18781 Cmp = ConvertCmpIfNecessary(Cmp, DAG);
18782 CC = DAG.getConstant(X86::COND_NE, dl, MVT::i8);
18783 Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18784 Chain, Dest, CC, Cmp);
18785 CC = DAG.getConstant(X86::COND_NP, dl, MVT::i8);
18795 // Look past the truncate if the high bits are known zero.
18796 if (isTruncWithZeroHighBitsInput(Cond, DAG))
18797 Cond = Cond.getOperand(0);
18799 // We know the result is compared against zero. Try to match it to BT.
18800 if (Cond.hasOneUse()) {
18801 if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
18802 CC = NewSetCC.getOperand(0);
18803 Cond = NewSetCC.getOperand(1);
18810 X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
18811 CC = DAG.getConstant(X86Cond, dl, MVT::i8);
18812 Cond = EmitTest(Cond, X86Cond, dl, DAG);
18814 Cond = ConvertCmpIfNecessary(Cond, DAG);
18815 return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
18816 Chain, Dest, CC, Cond);
18819 // Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
18820 // Calls to _alloca are needed to probe the stack when allocating more than 4k
18821 // bytes in one go. Touching the stack at 4K increments is necessary to ensure
18822 // that the guard pages used by the OS virtual memory manager are allocated in
18823 // correct sequence.
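//
// For intuition (simplified, hypothetical numbers): if a function did a single
// "sub esp, 0x3000" and then stored near the bottom of that region, the access
// could land more than one page past the last committed page and fault,
// because Windows only keeps a single guard page beyond the committed stack.
// Probing every 4K page in order (which _alloca/__chkstk does) lets the guard
// page mechanism commit the stack one page at a time.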
18825 X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
18826 SelectionDAG &DAG) const {
18827 MachineFunction &MF = DAG.getMachineFunction();
18828 bool SplitStack = MF.shouldSplitStack();
18829 bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
18830 bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
18831 SplitStack || EmitStackProbe;
18835 SDNode *Node = Op.getNode();
18836 SDValue Chain = Op.getOperand(0);
18837 SDValue Size = Op.getOperand(1);
18838 unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
18839 EVT VT = Node->getValueType(0);
18841 // Chain the dynamic stack allocation so that it doesn't modify the stack
18842 // pointer when other instructions are using the stack.
18843 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
18845 bool Is64Bit = Subtarget.is64Bit();
18846 MVT SPTy = getPointerTy(DAG.getDataLayout());
18850 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18851 unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
18852 assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
18853 " not tell us which reg is the stack pointer!");
18855 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
18856 Chain = SP.getValue(1);
18857 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
18858 unsigned StackAlign = TFI.getStackAlignment();
18859 Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
18860 if (Align > StackAlign)
18861 Result = DAG.getNode(ISD::AND, dl, VT, Result,
18862 DAG.getConstant(-(uint64_t)Align, dl, VT));
18863 Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
18864 } else if (SplitStack) {
18865 MachineRegisterInfo &MRI = MF.getRegInfo();
18868 // The 64-bit implementation of segmented stacks needs to clobber both r10
18869 // and r11. This makes it impossible to use it along with nested parameters.
18870 const Function *F = MF.getFunction();
18871 for (const auto &A : F->args()) {
18872 if (A.hasNestAttr())
18873 report_fatal_error("Cannot use segmented stacks with functions that "
18874 "have nested arguments.");
18878 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
18879 unsigned Vreg = MRI.createVirtualRegister(AddrRegClass);
18880 Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
18881 Result = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain,
18882 DAG.getRegister(Vreg, SPTy));
18884 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
18885 Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Size);
18886 MF.getInfo<X86MachineFunctionInfo>()->setHasWinAlloca(true);
18888 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
18889 unsigned SPReg = RegInfo->getStackRegister();
18890 SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
18891 Chain = SP.getValue(1);
18894 SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
18895 DAG.getConstant(-(uint64_t)Align, dl, VT));
18896 Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
18902 Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
18903 DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
18905 SDValue Ops[2] = {Result, Chain};
18906 return DAG.getMergeValues(Ops, dl);
18909 SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
18910 MachineFunction &MF = DAG.getMachineFunction();
18911 auto PtrVT = getPointerTy(MF.getDataLayout());
18912 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
18914 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18917 if (!Subtarget.is64Bit() ||
18918 Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
18919 // vastart just stores the address of the VarArgsFrameIndex slot into the
18920 // memory location argument.
18921 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18922 return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1),
18923 MachinePointerInfo(SV));
18927 // gp_offset (0 - 6 * 8)
18928 // fp_offset (48 - 48 + 8 * 16)
18929 // overflow_arg_area (point to parameters coming in memory).
18930 // reg_save_area
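//
// For reference, a sketch of the System V AMD64 va_list record that the stores
// below fill in (byte offsets assume LP64; the x32 ABI uses 4-byte pointers,
// which is what the isTarget64BitLP64() checks account for):
//   typedef struct {
//     unsigned int gp_offset;         // offset 0
//     unsigned int fp_offset;         // offset 4
//     void *overflow_arg_area;        // offset 8
//     void *reg_save_area;            // offset 16
//   } __va_list_tag;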
18931 SmallVector<SDValue, 8> MemOps;
18932 SDValue FIN = Op.getOperand(1);
18934 SDValue Store = DAG.getStore(
18935 Op.getOperand(0), DL,
18936 DAG.getConstant(FuncInfo->getVarArgsGPOffset(), DL, MVT::i32), FIN,
18937 MachinePointerInfo(SV));
18938 MemOps.push_back(Store);
18941 FIN = DAG.getMemBasePlusOffset(FIN, 4, DL);
18942 Store = DAG.getStore(
18943 Op.getOperand(0), DL,
18944 DAG.getConstant(FuncInfo->getVarArgsFPOffset(), DL, MVT::i32), FIN,
18945 MachinePointerInfo(SV, 4));
18946 MemOps.push_back(Store);
18948 // Store ptr to overflow_arg_area
18949 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(4, DL));
18950 SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
18952 DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8));
18953 MemOps.push_back(Store);
18955 // Store ptr to reg_save_area.
18956 FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getIntPtrConstant(
18957 Subtarget.isTarget64BitLP64() ? 8 : 4, DL));
18958 SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), PtrVT);
18959 Store = DAG.getStore(
18960 Op.getOperand(0), DL, RSFIN, FIN,
18961 MachinePointerInfo(SV, Subtarget.isTarget64BitLP64() ? 16 : 12));
18962 MemOps.push_back(Store);
18963 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
18966 SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
18967 assert(Subtarget.is64Bit() &&
18968 "LowerVAARG only handles 64-bit va_arg!");
18969 assert(Op.getNumOperands() == 4);
18971 MachineFunction &MF = DAG.getMachineFunction();
18972 if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
18973 // The Win64 ABI uses char* instead of a structure.
18974 return DAG.expandVAArg(Op.getNode());
18976 SDValue Chain = Op.getOperand(0);
18977 SDValue SrcPtr = Op.getOperand(1);
18978 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
18979 unsigned Align = Op.getConstantOperandVal(3);
18982 EVT ArgVT = Op.getNode()->getValueType(0);
18983 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
18984 uint32_t ArgSize = DAG.getDataLayout().getTypeAllocSize(ArgTy);
18987 // Decide which area this value should be read from.
18988 // TODO: Implement the AMD64 ABI in its entirety. This simple
18989 // selection mechanism works only for the basic types.
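// For example (usual C-level view, not a full ABI classification):
//   va_arg(ap, long)        -> ArgMode 1, read via gp_offset / reg_save_area
//   va_arg(ap, double)      -> ArgMode 2, read via fp_offset / reg_save_area
//   va_arg(ap, long double) -> f80, not handled here yet.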
18990 if (ArgVT == MVT::f80) {
18991 llvm_unreachable("va_arg for f80 not yet implemented");
18992 } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
18993 ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
18994 } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
18995 ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
18997 llvm_unreachable("Unhandled argument type in LowerVAARG");
19000 if (ArgMode == 2) {
19001 // Sanity Check: Make sure using fp_offset makes sense.
19002 assert(!Subtarget.useSoftFloat() &&
19003 !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
19004 Subtarget.hasSSE1());
19007 // Insert VAARG_64 node into the DAG
19008 // VAARG_64 returns two values: Variable Argument Address, Chain
19009 SDValue InstOps[] = {Chain, SrcPtr, DAG.getConstant(ArgSize, dl, MVT::i32),
19010 DAG.getConstant(ArgMode, dl, MVT::i8),
19011 DAG.getConstant(Align, dl, MVT::i32)};
19012 SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
19013 SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
19014 VTs, InstOps, MVT::i64,
19015 MachinePointerInfo(SV),
19017 /*Volatile=*/false,
19019 /*WriteMem=*/true);
19020 Chain = VAARG.getValue(1);
19022 // Load the next argument and return it
19023 return DAG.getLoad(ArgVT, dl, Chain, VAARG, MachinePointerInfo());
19026 static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
19027 SelectionDAG &DAG) {
19028 // X86-64 va_list is a struct { i32, i32, i8*, i8* }, except on Windows,
19029 // where a va_list is still an i8*.
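// Rough arithmetic behind the fixed memcpy size used below:
// 4 (gp_offset) + 4 (fp_offset) + 8 (overflow_arg_area) + 8 (reg_save_area)
// = 24 bytes.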
19030 assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
19031 if (Subtarget.isCallingConvWin64(
19032 DAG.getMachineFunction().getFunction()->getCallingConv()))
19033 // Probably a Win64 va_copy.
19034 return DAG.expandVACopy(Op.getNode());
19036 SDValue Chain = Op.getOperand(0);
19037 SDValue DstPtr = Op.getOperand(1);
19038 SDValue SrcPtr = Op.getOperand(2);
19039 const Value *DstSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
19040 const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
19043 return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
19044 DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
19046 MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
19049 /// Handle vector element shifts where the shift amount is a constant.
19050 /// Takes immediate version of shift as input.
19051 static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
19052 SDValue SrcOp, uint64_t ShiftAmt,
19053 SelectionDAG &DAG) {
19054 MVT ElementType = VT.getVectorElementType();
19056 // Bitcast the source vector to the output type; this is mainly necessary for
19057 // vXi8/vXi64 shifts.
19058 if (VT != SrcOp.getSimpleValueType())
19059 SrcOp = DAG.getBitcast(VT, SrcOp);
19061 // Fold this packed shift into its first operand if ShiftAmt is 0.
19065 // Check for ShiftAmt >= element width
19066 if (ShiftAmt >= ElementType.getSizeInBits()) {
19067 if (Opc == X86ISD::VSRAI)
19068 ShiftAmt = ElementType.getSizeInBits() - 1;
19070 return DAG.getConstant(0, dl, VT);
19073 assert((Opc == X86ISD::VSHLI || Opc == X86ISD::VSRLI || Opc == X86ISD::VSRAI)
19074 && "Unknown target vector shift-by-constant node");
19076 // Fold this packed vector shift into a build vector if SrcOp is a
19077 // vector of Constants or UNDEFs.
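// For instance, (VSHLI <i32 1, i32 2, undef, i32 4>, 3) folds here to the
// constant vector <i32 8, i32 16, undef, i32 32> without emitting a shift.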
19078 if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
19079 SmallVector<SDValue, 8> Elts;
19080 unsigned NumElts = SrcOp->getNumOperands();
19081 ConstantSDNode *ND;
19084 default: llvm_unreachable("Unknown opcode!");
19085 case X86ISD::VSHLI:
19086 for (unsigned i=0; i!=NumElts; ++i) {
19087 SDValue CurrentOp = SrcOp->getOperand(i);
19088 if (CurrentOp->isUndef()) {
19089 Elts.push_back(CurrentOp);
19092 ND = cast<ConstantSDNode>(CurrentOp);
19093 const APInt &C = ND->getAPIntValue();
19094 Elts.push_back(DAG.getConstant(C.shl(ShiftAmt), dl, ElementType));
19097 case X86ISD::VSRLI:
19098 for (unsigned i=0; i!=NumElts; ++i) {
19099 SDValue CurrentOp = SrcOp->getOperand(i);
19100 if (CurrentOp->isUndef()) {
19101 Elts.push_back(CurrentOp);
19104 ND = cast<ConstantSDNode>(CurrentOp);
19105 const APInt &C = ND->getAPIntValue();
19106 Elts.push_back(DAG.getConstant(C.lshr(ShiftAmt), dl, ElementType));
19109 case X86ISD::VSRAI:
19110 for (unsigned i=0; i!=NumElts; ++i) {
19111 SDValue CurrentOp = SrcOp->getOperand(i);
19112 if (CurrentOp->isUndef()) {
19113 Elts.push_back(CurrentOp);
19116 ND = cast<ConstantSDNode>(CurrentOp);
19117 const APInt &C = ND->getAPIntValue();
19118 Elts.push_back(DAG.getConstant(C.ashr(ShiftAmt), dl, ElementType));
19123 return DAG.getBuildVector(VT, dl, Elts);
19126 return DAG.getNode(Opc, dl, VT, SrcOp,
19127 DAG.getConstant(ShiftAmt, dl, MVT::i8));
19130 /// Handle vector element shifts where the shift amount may or may not be a
19131 /// constant. Takes immediate version of shift as input.
19132 static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
19133 SDValue SrcOp, SDValue ShAmt,
19134 const X86Subtarget &Subtarget,
19135 SelectionDAG &DAG) {
19136 MVT SVT = ShAmt.getSimpleValueType();
19137 assert((SVT == MVT::i32 || SVT == MVT::i64) && "Unexpected value type!");
19139 // Catch shift-by-constant.
19140 if (ConstantSDNode *CShAmt = dyn_cast<ConstantSDNode>(ShAmt))
19141 return getTargetVShiftByConstNode(Opc, dl, VT, SrcOp,
19142 CShAmt->getZExtValue(), DAG);
19144 // Change opcode to non-immediate version
19146 default: llvm_unreachable("Unknown target vector shift node");
19147 case X86ISD::VSHLI: Opc = X86ISD::VSHL; break;
19148 case X86ISD::VSRLI: Opc = X86ISD::VSRL; break;
19149 case X86ISD::VSRAI: Opc = X86ISD::VSRA; break;
19152 // Need to build a vector containing the shift amount.
19153 // SSE/AVX packed shifts only use the lower 64 bits of the shift count.
19154 // +=================+============+=======================================+
19155 // | ShAmt is | HasSSE4.1? | Construct ShAmt vector as |
19156 // +=================+============+=======================================+
19157 // | i64 | Yes, No | Use ShAmt as lowest elt |
19158 // | i32 | Yes | zero-extend in-reg |
19159 // | (i32 zext(i16)) | Yes | zero-extend in-reg |
19160 // | i16/i32 | No | v4i32 build_vector(ShAmt, 0, ud, ud)) |
19161 // +=================+============+=======================================+
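// Worked example for the last row (assumed operands, not from this file): with
// SSE2 only and an i32 shift amount %a, the count is materialized as
//   build_vector <4 x i32> (%a, 0, undef, undef)
// and bitcast to the 128-bit shift type, so the low 64 bits read by
// VSHL/VSRL/VSRA are exactly %a zero-extended to 64 bits.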
19163 if (SVT == MVT::i64)
19164 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v2i64, ShAmt);
19165 else if (Subtarget.hasSSE41() && ShAmt.getOpcode() == ISD::ZERO_EXTEND &&
19166 ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
19167 ShAmt = ShAmt.getOperand(0);
19168 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
19169 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19170 } else if (Subtarget.hasSSE41() &&
19171 ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
19172 ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
19173 ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
19175 SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
19176 DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
19177 ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
19180 // The return type has to be a 128-bit type with the same element
19181 // type as the input type.
19182 MVT EltVT = VT.getVectorElementType();
19183 MVT ShVT = MVT::getVectorVT(EltVT, 128/EltVT.getSizeInBits());
19185 ShAmt = DAG.getBitcast(ShVT, ShAmt);
19186 return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt);
19189 /// \brief Return Mask with the necessary casting or extending
19190 /// for \p Mask according to \p MaskVT when lowering masking intrinsics
19191 static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
19192 const X86Subtarget &Subtarget, SelectionDAG &DAG,
19195 if (isAllOnesConstant(Mask))
19196 return DAG.getTargetConstant(1, dl, MaskVT);
19197 if (X86::isZeroNode(Mask))
19198 return DAG.getTargetConstant(0, dl, MaskVT);
19200 if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
19201 // Mask should be extended
19202 Mask = DAG.getNode(ISD::ANY_EXTEND, dl,
19203 MVT::getIntegerVT(MaskVT.getSizeInBits()), Mask);
19206 if (Mask.getSimpleValueType() == MVT::i64 && Subtarget.is32Bit()) {
19207 if (MaskVT == MVT::v64i1) {
19208 assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
19209 // In 32-bit mode a bitcast of i64 is illegal; extend/split it instead.
19211 Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19212 DAG.getConstant(0, dl, MVT::i32));
19213 Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Mask,
19214 DAG.getConstant(1, dl, MVT::i32));
19216 Lo = DAG.getBitcast(MVT::v32i1, Lo);
19217 Hi = DAG.getBitcast(MVT::v32i1, Hi);
19219 return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
19221 // MaskVT requires < 64 bits. Truncate the mask (should succeed in any case)
19222 // and bitcast to the required type.
19223 MVT TruncVT = MVT::getIntegerVT(MaskVT.getSizeInBits());
19224 return DAG.getBitcast(MaskVT,
19225 DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Mask));
19229 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19230 Mask.getSimpleValueType().getSizeInBits());
19231 // In the case where MaskVT equals v2i1 or v4i1, the low 2 or 4 elements
19232 // are extracted by EXTRACT_SUBVECTOR.
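// Trace of this generic path with a hypothetical example: an i8 mask paired
// with a v16i1 MaskVT is first ANY_EXTENDed to i16 above, bitcast to v16i1
// here, and the EXTRACT_SUBVECTOR at index 0 then simply yields the full
// 16-lane mask.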
19233 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MaskVT,
19234 DAG.getBitcast(BitcastVT, Mask),
19235 DAG.getIntPtrConstant(0, dl));
19239 /// \brief Return (and \p Op, \p Mask) for compare instructions or
19240 /// (vselect \p Mask, \p Op, \p PreservedSrc) for others along with the
19241 /// necessary casting or extending for \p Mask when lowering masking intrinsics
19242 static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
19243 SDValue PreservedSrc,
19244 const X86Subtarget &Subtarget,
19245 SelectionDAG &DAG) {
19246 MVT VT = Op.getSimpleValueType();
19247 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19248 unsigned OpcodeSelect = ISD::VSELECT;
19251 if (isAllOnesConstant(Mask))
19254 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
19256 switch (Op.getOpcode()) {
19258 case X86ISD::PCMPEQM:
19259 case X86ISD::PCMPGTM:
19261 case X86ISD::CMPMU:
19262 return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
19263 case X86ISD::VFPCLASS:
19264 case X86ISD::VFPCLASSS:
19265 return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
19266 case X86ISD::VTRUNC:
19267 case X86ISD::VTRUNCS:
19268 case X86ISD::VTRUNCUS:
19269 case X86ISD::CVTPS2PH:
19270 // We can't use ISD::VSELECT here because it is not always "Legal"
19271 // for the destination type. For example, vpmovqb requires only AVX512,
19272 // while a vselect that operates on byte elements requires BWI.
19273 OpcodeSelect = X86ISD::SELECT;
19276 if (PreservedSrc.isUndef())
19277 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19278 return DAG.getNode(OpcodeSelect, dl, VT, VMask, Op, PreservedSrc);
19281 /// \brief Creates an SDNode for a predicated scalar operation.
19282 /// \returns (X86vselect \p Mask, \p Op, \p PreservedSrc).
19283 /// The mask comes in as MVT::i8 and should be transformed
19284 /// to MVT::v1i1 while lowering masking intrinsics.
19285 /// The main difference between ScalarMaskingNode and VectorMaskingNode is using
19286 /// "X86select" instead of "vselect". We just can't create the "vselect" node
19287 /// for a scalar instruction.
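/// For example (hypothetical operands): a masked scalar operation is emitted
/// below as (X86ISD::SELECTS Mask:v1i1, (scalar op ...), PreservedSrc) rather
/// than as an ISD::VSELECT, which has no useful scalar form.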
19288 static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
19289 SDValue PreservedSrc,
19290 const X86Subtarget &Subtarget,
19291 SelectionDAG &DAG) {
19293 if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
19294 if (MaskConst->getZExtValue() & 0x1)
19297 MVT VT = Op.getSimpleValueType();
19300 SDValue IMask = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Mask);
19301 if (Op.getOpcode() == X86ISD::FSETCCM ||
19302 Op.getOpcode() == X86ISD::FSETCCM_RND)
19303 return DAG.getNode(ISD::AND, dl, VT, Op, IMask);
19304 if (Op.getOpcode() == X86ISD::VFPCLASSS)
19305 return DAG.getNode(ISD::OR, dl, VT, Op, IMask);
19307 if (PreservedSrc.isUndef())
19308 PreservedSrc = getZeroVector(VT, Subtarget, DAG, dl);
19309 return DAG.getNode(X86ISD::SELECTS, dl, VT, IMask, Op, PreservedSrc);
19312 static int getSEHRegistrationNodeSize(const Function *Fn) {
19313 if (!Fn->hasPersonalityFn())
19314 report_fatal_error(
19315 "querying registration node size for function without personality");
19316 // The RegNodeSize is 6 32-bit words for SEH and 4 for C++ EH. See
19317 // WinEHStatePass for the full struct definition.
19318 switch (classifyEHPersonality(Fn->getPersonalityFn())) {
19319 case EHPersonality::MSVC_X86SEH: return 24;
19320 case EHPersonality::MSVC_CXX: return 16;
19323 report_fatal_error(
19324 "can only recover FP for 32-bit MSVC EH personality functions");
19327 /// When the MSVC runtime transfers control to us, either to an outlined
19328 /// function or when returning to a parent frame after catching an exception, we
19329 /// recover the parent frame pointer by doing arithmetic on the incoming EBP.
19330 /// Here's the math:
19331 /// RegNodeBase = EntryEBP - RegNodeSize
19332 /// ParentFP = RegNodeBase - ParentFrameOffset
19333 /// Subtracting RegNodeSize takes us to the offset of the registration node, and
19334 /// subtracting the offset (negative on x86) takes us back to the parent FP.
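/// Worked example with made-up numbers: for a 32-bit MSVC SEH personality the
/// registration node is 24 bytes, so RegNodeBase = EntryEBP - 24; if the
/// recorded ParentFrameOffset were -8, then
/// ParentFP = RegNodeBase - (-8) = EntryEBP - 16.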
19335 static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
19336 SDValue EntryEBP) {
19337 MachineFunction &MF = DAG.getMachineFunction();
19340 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
19341 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
19343 // It's possible that the parent function no longer has a personality function
19344 // if the exceptional code was optimized away, in which case we just return
19345 // the incoming EBP.
19346 if (!Fn->hasPersonalityFn())
19349 // Get an MCSymbol that will ultimately resolve to the frame offset of the EH
19350 // registration, or the .set_setframe offset.
19351 MCSymbol *OffsetSym =
19352 MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
19353 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
19354 SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
19355 SDValue ParentFrameOffset =
19356 DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
19358 // Return EntryEBP + ParentFrameOffset for x64. This adjusts from RSP after
19359 // prologue to RBP in the parent function.
19360 const X86Subtarget &Subtarget =
19361 static_cast<const X86Subtarget &>(DAG.getSubtarget());
19362 if (Subtarget.is64Bit())
19363 return DAG.getNode(ISD::ADD, dl, PtrVT, EntryEBP, ParentFrameOffset);
19365 int RegNodeSize = getSEHRegistrationNodeSize(Fn);
19366 // RegNodeBase = EntryEBP - RegNodeSize
19367 // ParentFP = RegNodeBase - ParentFrameOffset
19368 SDValue RegNodeBase = DAG.getNode(ISD::SUB, dl, PtrVT, EntryEBP,
19369 DAG.getConstant(RegNodeSize, dl, PtrVT));
19370 return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
19373 static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
19374 SelectionDAG &DAG) {
19375 // Helper to detect if the operand is CUR_DIRECTION rounding mode.
19376 auto isRoundModeCurDirection = [](SDValue Rnd) {
19377 if (!isa<ConstantSDNode>(Rnd))
19380 unsigned Round = cast<ConstantSDNode>(Rnd)->getZExtValue();
19381 return Round == X86::STATIC_ROUNDING::CUR_DIRECTION;
19385 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
19386 MVT VT = Op.getSimpleValueType();
19387 const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
19389 switch(IntrData->Type) {
19390 case INTR_TYPE_1OP:
19391 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
19392 case INTR_TYPE_2OP:
19393 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19395 case INTR_TYPE_3OP:
19396 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19397 Op.getOperand(2), Op.getOperand(3));
19398 case INTR_TYPE_4OP:
19399 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
19400 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
19401 case INTR_TYPE_1OP_MASK_RM: {
19402 SDValue Src = Op.getOperand(1);
19403 SDValue PassThru = Op.getOperand(2);
19404 SDValue Mask = Op.getOperand(3);
19405 SDValue RoundingMode;
19406 // We always add rounding mode to the Node.
19407 // If the rounding mode is not specified, we add the
19408 // "current direction" mode.
19409 if (Op.getNumOperands() == 4)
19411 DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19413 RoundingMode = Op.getOperand(4);
19414 assert(IntrData->Opc1 == 0 && "Unexpected second opcode!");
19415 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19417 Mask, PassThru, Subtarget, DAG);
19419 case INTR_TYPE_1OP_MASK: {
19420 SDValue Src = Op.getOperand(1);
19421 SDValue PassThru = Op.getOperand(2);
19422 SDValue Mask = Op.getOperand(3);
19423 // We add rounding mode to the Node when
19424 // - RM Opcode is specified and
19425 // - RM is not "current direction".
19426 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19427 if (IntrWithRoundingModeOpcode != 0) {
19428 SDValue Rnd = Op.getOperand(4);
19429 if (!isRoundModeCurDirection(Rnd)) {
19430 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19431 dl, Op.getValueType(),
19433 Mask, PassThru, Subtarget, DAG);
19436 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19437 Mask, PassThru, Subtarget, DAG);
19439 case INTR_TYPE_SCALAR_MASK: {
19440 SDValue Src1 = Op.getOperand(1);
19441 SDValue Src2 = Op.getOperand(2);
19442 SDValue passThru = Op.getOperand(3);
19443 SDValue Mask = Op.getOperand(4);
19444 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19445 if (IntrWithRoundingModeOpcode != 0) {
19446 SDValue Rnd = Op.getOperand(5);
19447 if (!isRoundModeCurDirection(Rnd))
19448 return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19449 dl, VT, Src1, Src2, Rnd),
19450 Mask, passThru, Subtarget, DAG);
19452 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
19453 Mask, passThru, Subtarget, DAG);
19455 case INTR_TYPE_SCALAR_MASK_RM: {
19456 SDValue Src1 = Op.getOperand(1);
19457 SDValue Src2 = Op.getOperand(2);
19458 SDValue Src0 = Op.getOperand(3);
19459 SDValue Mask = Op.getOperand(4);
19460 // There are 2 kinds of intrinsics in this group:
19461 // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
19462 // (2) With rounding mode and sae - 7 operands.
19463 if (Op.getNumOperands() == 6) {
19464 SDValue Sae = Op.getOperand(5);
19465 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19467 Mask, Src0, Subtarget, DAG);
19469 assert(Op.getNumOperands() == 7 && "Unexpected intrinsic form");
19470 SDValue RoundingMode = Op.getOperand(5);
19471 SDValue Sae = Op.getOperand(6);
19472 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2,
19473 RoundingMode, Sae),
19474 Mask, Src0, Subtarget, DAG);
19476 case INTR_TYPE_2OP_MASK:
19477 case INTR_TYPE_2OP_IMM8_MASK: {
19478 SDValue Src1 = Op.getOperand(1);
19479 SDValue Src2 = Op.getOperand(2);
19480 SDValue PassThru = Op.getOperand(3);
19481 SDValue Mask = Op.getOperand(4);
19483 if (IntrData->Type == INTR_TYPE_2OP_IMM8_MASK)
19484 Src2 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src2);
19486 // We specify 2 possible opcodes for intrinsics with rounding modes.
19487 // First, we check if the intrinsic may have non-default rounding mode,
19488 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19489 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19490 if (IntrWithRoundingModeOpcode != 0) {
19491 SDValue Rnd = Op.getOperand(5);
19492 if (!isRoundModeCurDirection(Rnd)) {
19493 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19494 dl, Op.getValueType(),
19496 Mask, PassThru, Subtarget, DAG);
19499 // TODO: Intrinsics should have fast-math-flags to propagate.
19500 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src1,Src2),
19501 Mask, PassThru, Subtarget, DAG);
19503 case INTR_TYPE_2OP_MASK_RM: {
19504 SDValue Src1 = Op.getOperand(1);
19505 SDValue Src2 = Op.getOperand(2);
19506 SDValue PassThru = Op.getOperand(3);
19507 SDValue Mask = Op.getOperand(4);
19508 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19510 // First, we check if the intrinsic has a rounding mode (6 operands);
19511 // if not, we set the rounding mode to "current".
19513 if (Op.getNumOperands() == 6)
19514 Rnd = Op.getOperand(5);
19516 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19517 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19519 Mask, PassThru, Subtarget, DAG);
19521 case INTR_TYPE_3OP_SCALAR_MASK_RM: {
19522 SDValue Src1 = Op.getOperand(1);
19523 SDValue Src2 = Op.getOperand(2);
19524 SDValue Src3 = Op.getOperand(3);
19525 SDValue PassThru = Op.getOperand(4);
19526 SDValue Mask = Op.getOperand(5);
19527 SDValue Sae = Op.getOperand(6);
19529 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
19531 Mask, PassThru, Subtarget, DAG);
19533 case INTR_TYPE_3OP_MASK_RM: {
19534 SDValue Src1 = Op.getOperand(1);
19535 SDValue Src2 = Op.getOperand(2);
19536 SDValue Imm = Op.getOperand(3);
19537 SDValue PassThru = Op.getOperand(4);
19538 SDValue Mask = Op.getOperand(5);
19539 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19541 // First, we check if the intrinsic has a rounding mode (7 operands);
19542 // if not, we set the rounding mode to "current".
19544 if (Op.getNumOperands() == 7)
19545 Rnd = Op.getOperand(6);
19547 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19548 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19549 Src1, Src2, Imm, Rnd),
19550 Mask, PassThru, Subtarget, DAG);
19552 case INTR_TYPE_3OP_IMM8_MASK:
19553 case INTR_TYPE_3OP_MASK: {
19554 SDValue Src1 = Op.getOperand(1);
19555 SDValue Src2 = Op.getOperand(2);
19556 SDValue Src3 = Op.getOperand(3);
19557 SDValue PassThru = Op.getOperand(4);
19558 SDValue Mask = Op.getOperand(5);
19560 if (IntrData->Type == INTR_TYPE_3OP_IMM8_MASK)
19561 Src3 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Src3);
19563 // We specify 2 possible opcodes for intrinsics with rounding modes.
19564 // First, we check if the intrinsic may have non-default rounding mode,
19565 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19566 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19567 if (IntrWithRoundingModeOpcode != 0) {
19568 SDValue Rnd = Op.getOperand(6);
19569 if (!isRoundModeCurDirection(Rnd)) {
19570 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19571 dl, Op.getValueType(),
19572 Src1, Src2, Src3, Rnd),
19573 Mask, PassThru, Subtarget, DAG);
19576 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19578 Mask, PassThru, Subtarget, DAG);
19580 case VPERM_2OP_MASK : {
19581 SDValue Src1 = Op.getOperand(1);
19582 SDValue Src2 = Op.getOperand(2);
19583 SDValue PassThru = Op.getOperand(3);
19584 SDValue Mask = Op.getOperand(4);
19586 // Swap Src1 and Src2 in the node creation
19587 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,Src2, Src1),
19588 Mask, PassThru, Subtarget, DAG);
19590 case VPERM_3OP_MASKZ:
19591 case VPERM_3OP_MASK:{
19592 MVT VT = Op.getSimpleValueType();
19593 // Src2 is the PassThru
19594 SDValue Src1 = Op.getOperand(1);
19595 // PassThru needs to be the same type as the destination in order
19596 // to pattern match correctly.
19597 SDValue Src2 = DAG.getBitcast(VT, Op.getOperand(2));
19598 SDValue Src3 = Op.getOperand(3);
19599 SDValue Mask = Op.getOperand(4);
19600 SDValue PassThru = SDValue();
19602 // set PassThru element
19603 if (IntrData->Type == VPERM_3OP_MASKZ)
19604 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19608 // Swap Src1 and Src2 in the node creation
19609 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19610 dl, Op.getValueType(),
19612 Mask, PassThru, Subtarget, DAG);
19616 case FMA_OP_MASK: {
19617 SDValue Src1 = Op.getOperand(1);
19618 SDValue Src2 = Op.getOperand(2);
19619 SDValue Src3 = Op.getOperand(3);
19620 SDValue Mask = Op.getOperand(4);
19621 MVT VT = Op.getSimpleValueType();
19622 SDValue PassThru = SDValue();
19624 // set PassThru element
19625 if (IntrData->Type == FMA_OP_MASKZ)
19626 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19627 else if (IntrData->Type == FMA_OP_MASK3)
19632 // We specify 2 possible opcodes for intrinsics with rounding modes.
19633 // First, we check if the intrinsic may have non-default rounding mode,
19634 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19635 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19636 if (IntrWithRoundingModeOpcode != 0) {
19637 SDValue Rnd = Op.getOperand(5);
19638 if (!isRoundModeCurDirection(Rnd))
19639 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19640 dl, Op.getValueType(),
19641 Src1, Src2, Src3, Rnd),
19642 Mask, PassThru, Subtarget, DAG);
19644 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
19645 dl, Op.getValueType(),
19647 Mask, PassThru, Subtarget, DAG);
19649 case FMA_OP_SCALAR_MASK:
19650 case FMA_OP_SCALAR_MASK3:
19651 case FMA_OP_SCALAR_MASKZ: {
19652 SDValue Src1 = Op.getOperand(1);
19653 SDValue Src2 = Op.getOperand(2);
19654 SDValue Src3 = Op.getOperand(3);
19655 SDValue Mask = Op.getOperand(4);
19656 MVT VT = Op.getSimpleValueType();
19657 SDValue PassThru = SDValue();
19659 // set PassThru element
19660 if (IntrData->Type == FMA_OP_SCALAR_MASKZ)
19661 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19662 else if (IntrData->Type == FMA_OP_SCALAR_MASK3)
19667 SDValue Rnd = Op.getOperand(5);
19668 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
19669 Op.getValueType(), Src1, Src2,
19671 Mask, PassThru, Subtarget, DAG);
19673 case TERLOG_OP_MASK:
19674 case TERLOG_OP_MASKZ: {
19675 SDValue Src1 = Op.getOperand(1);
19676 SDValue Src2 = Op.getOperand(2);
19677 SDValue Src3 = Op.getOperand(3);
19678 SDValue Src4 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(4));
19679 SDValue Mask = Op.getOperand(5);
19680 MVT VT = Op.getSimpleValueType();
19681 SDValue PassThru = Src1;
19682 // Set PassThru element.
19683 if (IntrData->Type == TERLOG_OP_MASKZ)
19684 PassThru = getZeroVector(VT, Subtarget, DAG, dl);
19686 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19687 Src1, Src2, Src3, Src4),
19688 Mask, PassThru, Subtarget, DAG);
19691 // ISD::FP_ROUND has a second argument that indicates if the truncation
19692 // does not change the value. Set it to 0 since it can change.
19693 return DAG.getNode(IntrData->Opc0, dl, VT, Op.getOperand(1),
19694 DAG.getIntPtrConstant(0, dl));
19695 case CVTPD2PS_MASK: {
19696 SDValue Src = Op.getOperand(1);
19697 SDValue PassThru = Op.getOperand(2);
19698 SDValue Mask = Op.getOperand(3);
19699 // We add rounding mode to the Node when
19700 // - RM Opcode is specified and
19701 // - RM is not "current direction".
19702 unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
19703 if (IntrWithRoundingModeOpcode != 0) {
19704 SDValue Rnd = Op.getOperand(4);
19705 if (!isRoundModeCurDirection(Rnd)) {
19706 return getVectorMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
19707 dl, Op.getValueType(),
19709 Mask, PassThru, Subtarget, DAG);
19712 assert(IntrData->Opc0 == ISD::FP_ROUND && "Unexpected opcode!");
19713 // ISD::FP_ROUND has a second argument that indicates if the truncation
19714 // does not change the value. Set it to 0 since it can change.
19715 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src,
19716 DAG.getIntPtrConstant(0, dl)),
19717 Mask, PassThru, Subtarget, DAG);
19720 // FPclass intrinsics with mask
19721 SDValue Src1 = Op.getOperand(1);
19722 MVT VT = Src1.getSimpleValueType();
19723 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19724 SDValue Imm = Op.getOperand(2);
19725 SDValue Mask = Op.getOperand(3);
19726 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19727 Mask.getSimpleValueType().getSizeInBits());
19728 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
19729 SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
19730 DAG.getTargetConstant(0, dl, MaskVT),
19732 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19733 DAG.getUNDEF(BitcastVT), FPclassMask,
19734 DAG.getIntPtrConstant(0, dl));
19735 return DAG.getBitcast(Op.getValueType(), Res);
19738 SDValue Src1 = Op.getOperand(1);
19739 SDValue Imm = Op.getOperand(2);
19740 SDValue Mask = Op.getOperand(3);
19741 SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
19742 SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
19743 DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
19744 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
19745 DAG.getIntPtrConstant(0, dl));
19748 case CMP_MASK_CC: {
19749 // Comparison intrinsics with masks.
19750 // Example of transformation:
19751 // (i8 (int_x86_avx512_mask_pcmpeq_q_128
19752 // (v2i64 %a), (v2i64 %b), (i8 %mask))) ->
19754 // (v8i1 (insert_subvector undef,
19755 // (v2i1 (and (PCMPEQM %a, %b),
19756 // (extract_subvector
19757 // (v8i1 (bitcast %mask)), 0))), 0))))
19758 MVT VT = Op.getOperand(1).getSimpleValueType();
19759 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
19760 SDValue Mask = Op.getOperand((IntrData->Type == CMP_MASK_CC) ? 4 : 3);
19761 MVT BitcastVT = MVT::getVectorVT(MVT::i1,
19762 Mask.getSimpleValueType().getSizeInBits());
19764 if (IntrData->Type == CMP_MASK_CC) {
19765 SDValue CC = Op.getOperand(3);
19766 CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, CC);
19767 // We specify 2 possible opcodes for intrinsics with rounding modes.
19768 // First, we check if the intrinsic may have non-default rounding mode,
19769 // (IntrData->Opc1 != 0), then we check the rounding mode operand.
19770 if (IntrData->Opc1 != 0) {
19771 SDValue Rnd = Op.getOperand(5);
19772 if (!isRoundModeCurDirection(Rnd))
19773 Cmp = DAG.getNode(IntrData->Opc1, dl, MaskVT, Op.getOperand(1),
19774 Op.getOperand(2), CC, Rnd);
19776 // Default rounding mode.
19778 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19779 Op.getOperand(2), CC);
19782 assert(IntrData->Type == CMP_MASK && "Unexpected intrinsic type!");
19783 Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
19786 SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
19787 DAG.getTargetConstant(0, dl,
19790 SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19791 DAG.getUNDEF(BitcastVT), CmpMask,
19792 DAG.getIntPtrConstant(0, dl));
19793 return DAG.getBitcast(Op.getValueType(), Res);
19795 case CMP_MASK_SCALAR_CC: {
19796 SDValue Src1 = Op.getOperand(1);
19797 SDValue Src2 = Op.getOperand(2);
19798 SDValue CC = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op.getOperand(3));
19799 SDValue Mask = Op.getOperand(4);
19802 if (IntrData->Opc1 != 0) {
19803 SDValue Rnd = Op.getOperand(5);
19804 if (!isRoundModeCurDirection(Rnd))
19805 Cmp = DAG.getNode(IntrData->Opc1, dl, MVT::v1i1, Src1, Src2, CC, Rnd);
19807 // Default rounding mode.
19809 Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
19811 SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
19812 DAG.getTargetConstant(0, dl,
19815 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
19816 DAG.getIntPtrConstant(0, dl));
19818 case COMI: { // Comparison intrinsics
19819 ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
19820 SDValue LHS = Op.getOperand(1);
19821 SDValue RHS = Op.getOperand(2);
19822 SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
19823 SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
19826 case ISD::SETEQ: { // (ZF = 0 and PF = 0)
19827 SetCC = getSETCC(X86::COND_E, Comi, dl, DAG);
19828 SDValue SetNP = getSETCC(X86::COND_NP, Comi, dl, DAG);
19829 SetCC = DAG.getNode(ISD::AND, dl, MVT::i8, SetCC, SetNP);
19832 case ISD::SETNE: { // (ZF = 1 or PF = 1)
19833 SetCC = getSETCC(X86::COND_NE, Comi, dl, DAG);
19834 SDValue SetP = getSETCC(X86::COND_P, Comi, dl, DAG);
19835 SetCC = DAG.getNode(ISD::OR, dl, MVT::i8, SetCC, SetP);
19838 case ISD::SETGT: // (CF = 0 and ZF = 0)
19839 SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
19841 case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
19842 SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
19845 case ISD::SETGE: // CF = 0
19846 SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
19848 case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
19849 SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
19852 llvm_unreachable("Unexpected illegal condition!");
19854 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
19856 case COMI_RM: { // Comparison intrinsics with Sae
19857 SDValue LHS = Op.getOperand(1);
19858 SDValue RHS = Op.getOperand(2);
19859 unsigned CondVal = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
19860 SDValue Sae = Op.getOperand(4);
19863 if (isRoundModeCurDirection(Sae))
19864 FCmp = DAG.getNode(X86ISD::FSETCCM, dl, MVT::v1i1, LHS, RHS,
19865 DAG.getConstant(CondVal, dl, MVT::i8));
19867 FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
19868 DAG.getConstant(CondVal, dl, MVT::i8), Sae);
19869 return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
19870 DAG.getIntPtrConstant(0, dl));
19873 return getTargetVShiftNode(IntrData->Opc0, dl, Op.getSimpleValueType(),
19874 Op.getOperand(1), Op.getOperand(2), Subtarget,
19876 case COMPRESS_EXPAND_IN_REG: {
19877 SDValue Mask = Op.getOperand(3);
19878 SDValue DataToCompress = Op.getOperand(1);
19879 SDValue PassThru = Op.getOperand(2);
19880 if (isAllOnesConstant(Mask)) // return data as is
19881 return Op.getOperand(1);
19883 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19885 Mask, PassThru, Subtarget, DAG);
19888 SDValue Mask = Op.getOperand(1);
19889 MVT MaskVT = MVT::getVectorVT(MVT::i1,
19890 Mask.getSimpleValueType().getSizeInBits());
19891 Mask = DAG.getBitcast(MaskVT, Mask);
19892 return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
19895 MVT VT = Op.getSimpleValueType();
19896 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
19898 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19899 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19900 // Arguments should be swapped.
19901 SDValue Res = DAG.getNode(IntrData->Opc0, dl,
19902 MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
19904 return DAG.getBitcast(VT, Res);
19907 MVT VT = Op.getSimpleValueType();
19908 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
19910 SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
19911 SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
19912 SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
19913 return DAG.getBitcast(VT, Res);
19916 case FIXUPIMMS_MASKZ:
19918 case FIXUPIMM_MASKZ:{
19919 SDValue Src1 = Op.getOperand(1);
19920 SDValue Src2 = Op.getOperand(2);
19921 SDValue Src3 = Op.getOperand(3);
19922 SDValue Imm = Op.getOperand(4);
19923 SDValue Mask = Op.getOperand(5);
19924 SDValue Passthru = (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMMS ) ?
19925 Src1 : getZeroVector(VT, Subtarget, DAG, dl);
19926 // We specify 2 possible modes for intrinsics, with/without rounding mode.
19928 // First, we check if the intrinsic has a rounding mode (7 operands);
19929 // if not, we set the rounding mode to "current".
19931 if (Op.getNumOperands() == 7)
19932 Rnd = Op.getOperand(6);
19934 Rnd = DAG.getConstant(X86::STATIC_ROUNDING::CUR_DIRECTION, dl, MVT::i32);
19935 if (IntrData->Type == FIXUPIMM || IntrData->Type == FIXUPIMM_MASKZ)
19936 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19937 Src1, Src2, Src3, Imm, Rnd),
19938 Mask, Passthru, Subtarget, DAG);
19939 else // Scalar - FIXUPIMMS, FIXUPIMMS_MASKZ
19940 return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19941 Src1, Src2, Src3, Imm, Rnd),
19942 Mask, Passthru, Subtarget, DAG);
19944 case CONVERT_TO_MASK: {
19945 MVT SrcVT = Op.getOperand(1).getSimpleValueType();
19946 MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
19947 MVT BitcastVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
SDValue CvtMask = DAG.getNode(IntrData->Opc0, dl, MaskVT,
Op.getOperand(1));
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
19952 DAG.getUNDEF(BitcastVT), CvtMask,
19953 DAG.getIntPtrConstant(0, dl));
19954 return DAG.getBitcast(Op.getValueType(), Res);
19956 case BRCST_SUBVEC_TO_VEC: {
19957 SDValue Src = Op.getOperand(1);
19958 SDValue Passthru = Op.getOperand(2);
19959 SDValue Mask = Op.getOperand(3);
19960 EVT resVT = Passthru.getValueType();
19961 SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
19962 DAG.getUNDEF(resVT), Src,
19963 DAG.getIntPtrConstant(0, dl));
SDValue immVal;
if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
immVal = DAG.getConstant(0x44, dl, MVT::i8);
else
immVal = DAG.getConstant(0, dl, MVT::i8);
19969 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
19970 subVec, subVec, immVal),
19971 Mask, Passthru, Subtarget, DAG);
19973 case BRCST32x2_TO_VEC: {
19974 SDValue Src = Op.getOperand(1);
19975 SDValue PassThru = Op.getOperand(2);
19976 SDValue Mask = Op.getOperand(3);
19978 assert((VT.getScalarType() == MVT::i32 ||
19979 VT.getScalarType() == MVT::f32) && "Unexpected type!");
// Bitcast Src to packed 64-bit elements.
19981 MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
19982 MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
19983 Src = DAG.getBitcast(BitcastVT, Src);
19985 return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
19986 Mask, PassThru, Subtarget, DAG);
19994 default: return SDValue(); // Don't custom lower most intrinsics.
19996 case Intrinsic::x86_avx2_permd:
19997 case Intrinsic::x86_avx2_permps:
19998 // Operands intentionally swapped. Mask is last operand to intrinsic,
19999 // but second operand for node/instruction.
20000 return DAG.getNode(X86ISD::VPERMV, dl, Op.getValueType(),
20001 Op.getOperand(2), Op.getOperand(1));
// ptest and testp intrinsics. The intrinsics these come from are designed to
// return an integer value, not just an instruction, so lower them to the
// ptest or testp pattern and a setcc for the result.
20006 case Intrinsic::x86_sse41_ptestz:
20007 case Intrinsic::x86_sse41_ptestc:
20008 case Intrinsic::x86_sse41_ptestnzc:
20009 case Intrinsic::x86_avx_ptestz_256:
20010 case Intrinsic::x86_avx_ptestc_256:
20011 case Intrinsic::x86_avx_ptestnzc_256:
20012 case Intrinsic::x86_avx_vtestz_ps:
20013 case Intrinsic::x86_avx_vtestc_ps:
20014 case Intrinsic::x86_avx_vtestnzc_ps:
20015 case Intrinsic::x86_avx_vtestz_pd:
20016 case Intrinsic::x86_avx_vtestc_pd:
20017 case Intrinsic::x86_avx_vtestnzc_pd:
20018 case Intrinsic::x86_avx_vtestz_ps_256:
20019 case Intrinsic::x86_avx_vtestc_ps_256:
20020 case Intrinsic::x86_avx_vtestnzc_ps_256:
20021 case Intrinsic::x86_avx_vtestz_pd_256:
20022 case Intrinsic::x86_avx_vtestc_pd_256:
20023 case Intrinsic::x86_avx_vtestnzc_pd_256: {
20024 bool IsTestPacked = false;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Bad fallthrough in Intrinsic lowering.");
20028 case Intrinsic::x86_avx_vtestz_ps:
20029 case Intrinsic::x86_avx_vtestz_pd:
20030 case Intrinsic::x86_avx_vtestz_ps_256:
20031 case Intrinsic::x86_avx_vtestz_pd_256:
20032 IsTestPacked = true;
20034 case Intrinsic::x86_sse41_ptestz:
20035 case Intrinsic::x86_avx_ptestz_256:
20037 X86CC = X86::COND_E;
20039 case Intrinsic::x86_avx_vtestc_ps:
20040 case Intrinsic::x86_avx_vtestc_pd:
20041 case Intrinsic::x86_avx_vtestc_ps_256:
20042 case Intrinsic::x86_avx_vtestc_pd_256:
20043 IsTestPacked = true;
20045 case Intrinsic::x86_sse41_ptestc:
20046 case Intrinsic::x86_avx_ptestc_256:
20048 X86CC = X86::COND_B;
20050 case Intrinsic::x86_avx_vtestnzc_ps:
20051 case Intrinsic::x86_avx_vtestnzc_pd:
20052 case Intrinsic::x86_avx_vtestnzc_ps_256:
20053 case Intrinsic::x86_avx_vtestnzc_pd_256:
20054 IsTestPacked = true;
20056 case Intrinsic::x86_sse41_ptestnzc:
20057 case Intrinsic::x86_avx_ptestnzc_256:
20059 X86CC = X86::COND_A;
20063 SDValue LHS = Op.getOperand(1);
20064 SDValue RHS = Op.getOperand(2);
20065 unsigned TestOpc = IsTestPacked ? X86ISD::TESTP : X86ISD::PTEST;
20066 SDValue Test = DAG.getNode(TestOpc, dl, MVT::i32, LHS, RHS);
20067 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20068 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20070 case Intrinsic::x86_avx512_kortestz_w:
20071 case Intrinsic::x86_avx512_kortestc_w: {
20072 X86::CondCode X86CC =
20073 (IntNo == Intrinsic::x86_avx512_kortestz_w) ? X86::COND_E : X86::COND_B;
20074 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20075 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20076 SDValue Test = DAG.getNode(X86ISD::KORTEST, dl, MVT::i32, LHS, RHS);
20077 SDValue SetCC = getSETCC(X86CC, Test, dl, DAG);
20078 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20081 case Intrinsic::x86_avx512_knot_w: {
20082 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20083 SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
20084 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20085 return DAG.getBitcast(MVT::i16, Res);
20088 case Intrinsic::x86_avx512_kandn_w: {
20089 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20090 // Invert LHS for the not.
20091 LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
20092 DAG.getConstant(1, dl, MVT::v16i1));
20093 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20094 SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
20095 return DAG.getBitcast(MVT::i16, Res);
20098 case Intrinsic::x86_avx512_kxnor_w: {
20099 SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
20100 SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
20101 SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
20102 // Invert result for the not.
20103 Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
20104 DAG.getConstant(1, dl, MVT::v16i1));
20105 return DAG.getBitcast(MVT::i16, Res);
20108 case Intrinsic::x86_sse42_pcmpistria128:
20109 case Intrinsic::x86_sse42_pcmpestria128:
20110 case Intrinsic::x86_sse42_pcmpistric128:
20111 case Intrinsic::x86_sse42_pcmpestric128:
20112 case Intrinsic::x86_sse42_pcmpistrio128:
20113 case Intrinsic::x86_sse42_pcmpestrio128:
20114 case Intrinsic::x86_sse42_pcmpistris128:
20115 case Intrinsic::x86_sse42_pcmpestris128:
20116 case Intrinsic::x86_sse42_pcmpistriz128:
20117 case Intrinsic::x86_sse42_pcmpestriz128: {
unsigned Opcode;
X86::CondCode X86CC;
switch (IntNo) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
20122 case Intrinsic::x86_sse42_pcmpistria128:
20123 Opcode = X86ISD::PCMPISTRI;
20124 X86CC = X86::COND_A;
20126 case Intrinsic::x86_sse42_pcmpestria128:
20127 Opcode = X86ISD::PCMPESTRI;
20128 X86CC = X86::COND_A;
20130 case Intrinsic::x86_sse42_pcmpistric128:
20131 Opcode = X86ISD::PCMPISTRI;
20132 X86CC = X86::COND_B;
20134 case Intrinsic::x86_sse42_pcmpestric128:
20135 Opcode = X86ISD::PCMPESTRI;
20136 X86CC = X86::COND_B;
20138 case Intrinsic::x86_sse42_pcmpistrio128:
20139 Opcode = X86ISD::PCMPISTRI;
20140 X86CC = X86::COND_O;
20142 case Intrinsic::x86_sse42_pcmpestrio128:
20143 Opcode = X86ISD::PCMPESTRI;
20144 X86CC = X86::COND_O;
20146 case Intrinsic::x86_sse42_pcmpistris128:
20147 Opcode = X86ISD::PCMPISTRI;
20148 X86CC = X86::COND_S;
20150 case Intrinsic::x86_sse42_pcmpestris128:
20151 Opcode = X86ISD::PCMPESTRI;
20152 X86CC = X86::COND_S;
20154 case Intrinsic::x86_sse42_pcmpistriz128:
20155 Opcode = X86ISD::PCMPISTRI;
20156 X86CC = X86::COND_E;
20158 case Intrinsic::x86_sse42_pcmpestriz128:
20159 Opcode = X86ISD::PCMPESTRI;
20160 X86CC = X86::COND_E;
20163 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20164 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20165 SDValue PCMP = DAG.getNode(Opcode, dl, VTs, NewOps);
20166 SDValue SetCC = getSETCC(X86CC, SDValue(PCMP.getNode(), 1), dl, DAG);
20167 return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
20170 case Intrinsic::x86_sse42_pcmpistri128:
20171 case Intrinsic::x86_sse42_pcmpestri128: {
unsigned Opcode;
if (IntNo == Intrinsic::x86_sse42_pcmpistri128)
Opcode = X86ISD::PCMPISTRI;
else
Opcode = X86ISD::PCMPESTRI;
20178 SmallVector<SDValue, 5> NewOps(Op->op_begin()+1, Op->op_end());
20179 SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
20180 return DAG.getNode(Opcode, dl, VTs, NewOps);
20183 case Intrinsic::eh_sjlj_lsda: {
20184 MachineFunction &MF = DAG.getMachineFunction();
20185 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
20186 MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
20187 auto &Context = MF.getMMI().getContext();
20188 MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
20189 Twine(MF.getFunctionNumber()));
20190 return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
20193 case Intrinsic::x86_seh_lsda: {
20194 // Compute the symbol for the LSDA. We know it'll get emitted later.
20195 MachineFunction &MF = DAG.getMachineFunction();
20196 SDValue Op1 = Op.getOperand(1);
20197 auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
20198 MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
20199 GlobalValue::dropLLVMManglingEscape(Fn->getName()));
20201 // Generate a simple absolute symbol reference. This intrinsic is only
20202 // supported on 32-bit Windows, which isn't PIC.
20203 SDValue Result = DAG.getMCSymbol(LSDASym, VT);
20204 return DAG.getNode(X86ISD::Wrapper, dl, VT, Result);
20207 case Intrinsic::x86_seh_recoverfp: {
20208 SDValue FnOp = Op.getOperand(1);
20209 SDValue IncomingFPOp = Op.getOperand(2);
20210 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
if (!Fn)
report_fatal_error(
"llvm.x86.seh.recoverfp must take a function as the first argument");
20215 return recoverFramePointer(DAG, Fn, IncomingFPOp);
20218 case Intrinsic::localaddress: {
20219 // Returns one of the stack, base, or frame pointer registers, depending on
20220 // which is used to reference local variables.
20221 MachineFunction &MF = DAG.getMachineFunction();
20222 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
unsigned Reg;
if (RegInfo->hasBasePointer(MF))
20225 Reg = RegInfo->getBaseRegister();
20226 else // This function handles the SP or FP case.
20227 Reg = RegInfo->getPtrSizedFrameRegister(MF);
20228 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
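/// Helper for lowering masked AVX2 gather intrinsics: builds the gather
/// machine node (operands: Src, Base, Scale, Index, Disp, Segment, Mask,
/// Chain) and returns the gathered value together with the output chain.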
20233 static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20234 SDValue Src, SDValue Mask, SDValue Base,
20235 SDValue Index, SDValue ScaleOp, SDValue Chain,
20236 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20243 EVT MaskVT = Mask.getValueType();
20244 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20245 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20246 SDValue Segment = DAG.getRegister(0, MVT::i32);
20247 // If source is undef or we know it won't be used, use a zero vector
20248 // to break register dependency.
20249 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20250 if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
20251 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20252 SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
20253 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20254 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20255 return DAG.getMergeValues(RetOps, dl);
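/// Helper for lowering AVX-512 gather intrinsics. The incoming mask operand
/// is first converted to a vXi1 mask sized by the index vector.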
20258 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20259 SDValue Src, SDValue Mask, SDValue Base,
20260 SDValue Index, SDValue ScaleOp, SDValue Chain,
20261 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20268 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20269 Index.getSimpleValueType().getVectorNumElements());
20271 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20272 SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
20273 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20274 SDValue Segment = DAG.getRegister(0, MVT::i32);
20275 // If source is undef or we know it won't be used, use a zero vector
20276 // to break register dependency.
20277 // TODO: use undef instead and let ExecutionDepsFix deal with it?
20278 if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
20279 Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
20280 SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
20281 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20282 SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) };
20283 return DAG.getMergeValues(RetOps, dl);
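/// Helper for lowering AVX-512 scatter intrinsics; only the chain result of
/// the machine node is returned, since scatters produce no value.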
20286 static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20287 SDValue Src, SDValue Mask, SDValue Base,
20288 SDValue Index, SDValue ScaleOp, SDValue Chain,
20289 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20296 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20297 SDValue Segment = DAG.getRegister(0, MVT::i32);
20298 MVT MaskVT = MVT::getVectorVT(MVT::i1,
20299 Index.getSimpleValueType().getVectorNumElements());
20301 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20302 SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
20303 SDValue Ops[] = {Base, Scale, Index, Disp, Segment, VMask, Src, Chain};
20304 SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
20305 return SDValue(Res, 1);
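/// Helper for lowering masked gather/scatter prefetch intrinsics; the node
/// has only a chain result.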
20308 static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
20309 SDValue Mask, SDValue Base, SDValue Index,
20310 SDValue ScaleOp, SDValue Chain,
20311 const X86Subtarget &Subtarget) {
SDLoc dl(Op);
auto *C = dyn_cast<ConstantSDNode>(ScaleOp);
// Scale must be constant.
if (!C)
return SDValue();
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
20318 SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
20319 SDValue Segment = DAG.getRegister(0, MVT::i32);
MVT MaskVT =
MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
20322 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20323 SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
20324 SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
20325 return SDValue(Res, 0);
/// Handles the lowering of builtin intrinsics that return the value
/// of the extended control register.
20330 static void getExtendedControlRegister(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
20333 SmallVectorImpl<SDValue> &Results) {
20334 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20335 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
// The ECX register is used to select the index of the XCR register to
// return.
SDValue Chain =
DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX, N->getOperand(2));
20342 SDNode *N1 = DAG.getMachineNode(X86::XGETBV, DL, Tys, Chain);
20343 Chain = SDValue(N1, 0);
20345 // Reads the content of XCR and returns it in registers EDX:EAX.
20346 if (Subtarget.is64Bit()) {
20347 LO = DAG.getCopyFromReg(Chain, DL, X86::RAX, MVT::i64, SDValue(N1, 1));
20348 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20351 LO = DAG.getCopyFromReg(Chain, DL, X86::EAX, MVT::i32, SDValue(N1, 1));
20352 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20355 Chain = HI.getValue(1);
20357 if (Subtarget.is64Bit()) {
// Merge the two 32-bit values into a 64-bit one.
20359 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20360 DAG.getConstant(32, DL, MVT::i8));
20361 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20362 Results.push_back(Chain);
20366 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20367 SDValue Ops[] = { LO, HI };
20368 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20369 Results.push_back(Pair);
20370 Results.push_back(Chain);
20373 /// Handles the lowering of builtin intrinsics that read performance monitor
20374 /// counters (x86_rdpmc).
20375 static void getReadPerformanceCounter(SDNode *N, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
20378 SmallVectorImpl<SDValue> &Results) {
20379 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20380 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue LO, HI;
// The ECX register is used to select the index of the performance counter
// to read.
SDValue Chain = DAG.getCopyToReg(N->getOperand(0), DL, X86::ECX,
N->getOperand(2));
SDValue rd = DAG.getNode(X86ISD::RDPMC_DAG, DL, Tys, Chain);
20389 // Reads the content of a 64-bit performance counter and returns it in the
20390 // registers EDX:EAX.
20391 if (Subtarget.is64Bit()) {
20392 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20393 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20396 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20397 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20400 Chain = HI.getValue(1);
20402 if (Subtarget.is64Bit()) {
20403 // The EAX register is loaded with the low-order 32 bits. The EDX register
20404 // is loaded with the supported high-order bits of the counter.
20405 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20406 DAG.getConstant(32, DL, MVT::i8));
20407 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20408 Results.push_back(Chain);
20412 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20413 SDValue Ops[] = { LO, HI };
20414 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20415 Results.push_back(Pair);
20416 Results.push_back(Chain);
20419 /// Handles the lowering of builtin intrinsics that read the time stamp counter
20420 /// (x86_rdtsc and x86_rdtscp). This function is also used to custom lower
20421 /// READCYCLECOUNTER nodes.
20422 static void getReadTimeStampCounter(SDNode *N, const SDLoc &DL, unsigned Opcode,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
20425 SmallVectorImpl<SDValue> &Results) {
20426 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
20427 SDValue rd = DAG.getNode(Opcode, DL, Tys, N->getOperand(0));
SDValue LO, HI;
// The processor's time-stamp counter (a 64-bit MSR) is stored into the
20431 // EDX:EAX registers. EDX is loaded with the high-order 32 bits of the MSR
20432 // and the EAX register is loaded with the low-order 32 bits.
20433 if (Subtarget.is64Bit()) {
20434 LO = DAG.getCopyFromReg(rd, DL, X86::RAX, MVT::i64, rd.getValue(1));
20435 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::RDX, MVT::i64,
20438 LO = DAG.getCopyFromReg(rd, DL, X86::EAX, MVT::i32, rd.getValue(1));
20439 HI = DAG.getCopyFromReg(LO.getValue(1), DL, X86::EDX, MVT::i32,
20442 SDValue Chain = HI.getValue(1);
20444 if (Opcode == X86ISD::RDTSCP_DAG) {
20445 assert(N->getNumOperands() == 3 && "Unexpected number of operands!");
20447 // Instruction RDTSCP loads the IA32:TSC_AUX_MSR (address C000_0103H) into
20448 // the ECX register. Add 'ecx' explicitly to the chain.
20449 SDValue ecx = DAG.getCopyFromReg(Chain, DL, X86::ECX, MVT::i32,
// Explicitly store the content of ECX at the location passed as input
// to the 'rdtscp' intrinsic.
20453 Chain = DAG.getStore(ecx.getValue(1), DL, ecx, N->getOperand(2),
20454 MachinePointerInfo());
20457 if (Subtarget.is64Bit()) {
20458 // The EDX register is loaded with the high-order 32 bits of the MSR, and
20459 // the EAX register is loaded with the low-order 32 bits.
20460 SDValue Tmp = DAG.getNode(ISD::SHL, DL, MVT::i64, HI,
20461 DAG.getConstant(32, DL, MVT::i8));
20462 Results.push_back(DAG.getNode(ISD::OR, DL, MVT::i64, LO, Tmp));
20463 Results.push_back(Chain);
20467 // Use a buildpair to merge the two 32-bit values into a 64-bit one.
20468 SDValue Ops[] = { LO, HI };
20469 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Ops);
20470 Results.push_back(Pair);
20471 Results.push_back(Chain);
20474 static SDValue LowerREADCYCLECOUNTER(SDValue Op, const X86Subtarget &Subtarget,
20475 SelectionDAG &DAG) {
20476 SmallVector<SDValue, 2> Results;
SDLoc DL(Op);
getReadTimeStampCounter(Op.getNode(), DL, X86ISD::RDTSC_DAG, DAG, Subtarget,
Results);
20480 return DAG.getMergeValues(Results, DL);
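/// Lower llvm.x86.seh.ehregnode: record the frame index of the EH
/// registration node in WinEHFuncInfo; no code is emitted for it.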
20483 static SDValue MarkEHRegistrationNode(SDValue Op, SelectionDAG &DAG) {
20484 MachineFunction &MF = DAG.getMachineFunction();
20485 SDValue Chain = Op.getOperand(0);
20486 SDValue RegNode = Op.getOperand(2);
20487 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20489 report_fatal_error("EH registrations only live in functions using WinEH");
20491 // Cast the operand to an alloca, and remember the frame index.
20492 auto *FINode = dyn_cast<FrameIndexSDNode>(RegNode);
20494 report_fatal_error("llvm.x86.seh.ehregnode expects a static alloca");
20495 EHInfo->EHRegNodeFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
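/// Lower llvm.x86.seh.ehguard: record the frame index of the EH guard slot
/// in WinEHFuncInfo, analogously to MarkEHRegistrationNode above.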
20501 static SDValue MarkEHGuard(SDValue Op, SelectionDAG &DAG) {
20502 MachineFunction &MF = DAG.getMachineFunction();
20503 SDValue Chain = Op.getOperand(0);
20504 SDValue EHGuard = Op.getOperand(2);
20505 WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo();
20507 report_fatal_error("EHGuard only live in functions using WinEH");
20509 // Cast the operand to an alloca, and remember the frame index.
20510 auto *FINode = dyn_cast<FrameIndexSDNode>(EHGuard);
20512 report_fatal_error("llvm.x86.seh.ehguard expects a static alloca");
20513 EHInfo->EHGuardFrameIndex = FINode->getIndex();
// Return the chain operand without making any DAG nodes.
return Chain;
}
20519 /// Emit Truncating Store with signed or unsigned saturation.
static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
20522 SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
20523 SelectionDAG &DAG) {
20525 SDVTList VTs = DAG.getVTList(MVT::Other);
20526 SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
20527 SDValue Ops[] = { Chain, Val, Ptr, Undef };
return SignedSat ?
DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20530 DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
20533 /// Emit Masked Truncating Store with signed or unsigned saturation.
static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
20536 SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
20537 MachineMemOperand *MMO, SelectionDAG &DAG) {
20539 SDVTList VTs = DAG.getVTList(MVT::Other);
20540 SDValue Ops[] = { Chain, Ptr, Mask, Val };
return SignedSat ?
DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
20543 DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
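/// Custom lowering for intrinsics that carry a chain (memory accesses,
/// RDTSC/RDPMC, gathers, scatters, prefetches, etc.).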
20546 static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
20547 SelectionDAG &DAG) {
20548 unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
20550 const IntrinsicData *IntrData = getIntrinsicWithChain(IntNo);
if (!IntrData) {
switch (IntNo) {
case llvm::Intrinsic::x86_seh_ehregnode:
20554 return MarkEHRegistrationNode(Op, DAG);
20555 case llvm::Intrinsic::x86_seh_ehguard:
20556 return MarkEHGuard(Op, DAG);
20557 case llvm::Intrinsic::x86_flags_read_u32:
20558 case llvm::Intrinsic::x86_flags_read_u64:
20559 case llvm::Intrinsic::x86_flags_write_u32:
20560 case llvm::Intrinsic::x86_flags_write_u64: {
// We need a frame pointer because this will get lowered to a PUSH/POP
// sequence.
20563 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20564 MFI.setHasCopyImplyingStackAdjustment(true);
20565 // Don't do anything here, we will expand these intrinsics out later
// during ExpandISelPseudos in EmitInstrWithCustomInserter.
return SDValue();
}
20569 case Intrinsic::x86_lwpins32:
20570 case Intrinsic::x86_lwpins64: {
SDLoc dl(Op);
SDValue Chain = Op->getOperand(0);
20573 SDVTList VTs = DAG.getVTList(MVT::i32, MVT::Other);
20575 DAG.getNode(X86ISD::LWPINS, dl, VTs, Chain, Op->getOperand(2),
20576 Op->getOperand(3), Op->getOperand(4));
20577 SDValue SetCC = getSETCC(X86::COND_B, LwpIns.getValue(0), dl, DAG);
20578 SDValue Result = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i8, SetCC);
20579 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result,
20580 LwpIns.getValue(1));
}
}
return SDValue();
}

SDLoc dl(Op);
switch(IntrData->Type) {
20588 default: llvm_unreachable("Unknown Intrinsic Type");
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
20592 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
20593 SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20595 // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
20596 // Otherwise return the value from Rand, which is always 0, casted to i32.
20597 SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
20598 DAG.getConstant(1, dl, Op->getValueType(1)),
20599 DAG.getConstant(X86::COND_B, dl, MVT::i32),
20600 SDValue(Result.getNode(), 1) };
20601 SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
DAG.getVTList(Op->getValueType(1), MVT::Glue),
Ops);
20605 // Return { result, isValid, chain }.
20606 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
20607 SDValue(Result.getNode(), 2));
20609 case GATHER_AVX2: {
20610 SDValue Chain = Op.getOperand(0);
20611 SDValue Src = Op.getOperand(2);
20612 SDValue Base = Op.getOperand(3);
20613 SDValue Index = Op.getOperand(4);
20614 SDValue Mask = Op.getOperand(5);
20615 SDValue Scale = Op.getOperand(6);
20616 return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20617 Scale, Chain, Subtarget);
}
case GATHER: {
// gather(v1, mask, index, base, scale);
20621 SDValue Chain = Op.getOperand(0);
20622 SDValue Src = Op.getOperand(2);
20623 SDValue Base = Op.getOperand(3);
20624 SDValue Index = Op.getOperand(4);
20625 SDValue Mask = Op.getOperand(5);
20626 SDValue Scale = Op.getOperand(6);
20627 return getGatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index, Scale,
Chain, Subtarget);
}
case SCATTER: {
// scatter(base, mask, index, v1, scale);
20632 SDValue Chain = Op.getOperand(0);
20633 SDValue Base = Op.getOperand(2);
20634 SDValue Mask = Op.getOperand(3);
20635 SDValue Index = Op.getOperand(4);
20636 SDValue Src = Op.getOperand(5);
20637 SDValue Scale = Op.getOperand(6);
20638 return getScatterNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
20639 Scale, Chain, Subtarget);
}
case PREFETCH: {
SDValue Hint = Op.getOperand(6);
20643 unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
20644 assert((HintVal == 2 || HintVal == 3) &&
20645 "Wrong prefetch hint in intrinsic: should be 2 or 3");
20646 unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
20647 SDValue Chain = Op.getOperand(0);
20648 SDValue Mask = Op.getOperand(2);
20649 SDValue Index = Op.getOperand(3);
20650 SDValue Base = Op.getOperand(4);
20651 SDValue Scale = Op.getOperand(5);
20652 return getPrefetchNode(Opcode, Op, DAG, Mask, Base, Index, Scale, Chain,
Subtarget);
}
// Read Time Stamp Counter (RDTSC) and Processor ID (RDTSCP).
case RDTSC: {
20657 SmallVector<SDValue, 2> Results;
20658 getReadTimeStampCounter(Op.getNode(), dl, IntrData->Opc0, DAG, Subtarget,
20660 return DAG.getMergeValues(Results, dl);
}
// Read Performance Monitoring Counters.
case RDPMC: {
20664 SmallVector<SDValue, 2> Results;
20665 getReadPerformanceCounter(Op.getNode(), dl, DAG, Subtarget, Results);
20666 return DAG.getMergeValues(Results, dl);
}
// Get Extended Control Register.
case XGETBV: {
20670 SmallVector<SDValue, 2> Results;
20671 getExtendedControlRegister(Op.getNode(), dl, DAG, Subtarget, Results);
20672 return DAG.getMergeValues(Results, dl);
}
// XTEST intrinsics.
case XTEST: {
20676 SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Other);
20677 SDValue InTrans = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
20679 SDValue SetCC = getSETCC(X86::COND_NE, InTrans, dl, DAG);
20680 SDValue Ret = DAG.getNode(ISD::ZERO_EXTEND, dl, Op->getValueType(0), SetCC);
20681 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20682 Ret, SDValue(InTrans.getNode(), 1));
}
// ADC/ADCX/SBB
case ADX: {
SDVTList CFVTs = DAG.getVTList(Op->getValueType(0), MVT::i32);
20687 SDVTList VTs = DAG.getVTList(Op.getOperand(3)->getValueType(0), MVT::i32);
20688 SDValue GenCF = DAG.getNode(X86ISD::ADD, dl, CFVTs, Op.getOperand(2),
20689 DAG.getConstant(-1, dl, MVT::i8));
20690 SDValue Res = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(3),
20691 Op.getOperand(4), GenCF.getValue(1));
20692 SDValue Store = DAG.getStore(Op.getOperand(0), dl, Res.getValue(0),
20693 Op.getOperand(5), MachinePointerInfo());
20694 SDValue SetCC = getSETCC(X86::COND_B, Res.getValue(1), dl, DAG);
20695 SDValue Results[] = { SetCC, Store };
20696 return DAG.getMergeValues(Results, dl);
20698 case COMPRESS_TO_MEM: {
20699 SDValue Mask = Op.getOperand(4);
20700 SDValue DataToCompress = Op.getOperand(3);
20701 SDValue Addr = Op.getOperand(2);
20702 SDValue Chain = Op.getOperand(0);
20703 MVT VT = DataToCompress.getSimpleValueType();
20705 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20706 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20708 if (isAllOnesConstant(Mask)) // return just a store
20709 return DAG.getStore(Chain, dl, DataToCompress, Addr,
20710 MemIntr->getMemOperand());
20712 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20713 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20715 return DAG.getMaskedStore(Chain, dl, DataToCompress, Addr, VMask, VT,
20716 MemIntr->getMemOperand(),
20717 false /* truncating */, true /* compressing */);
20719 case TRUNCATE_TO_MEM_VI8:
20720 case TRUNCATE_TO_MEM_VI16:
20721 case TRUNCATE_TO_MEM_VI32: {
20722 SDValue Mask = Op.getOperand(4);
20723 SDValue DataToTruncate = Op.getOperand(3);
20724 SDValue Addr = Op.getOperand(2);
20725 SDValue Chain = Op.getOperand(0);
20727 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20728 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20730 EVT MemVT = MemIntr->getMemoryVT();
20732 uint16_t TruncationOp = IntrData->Opc0;
20733 switch (TruncationOp) {
20734 case X86ISD::VTRUNC: {
20735 if (isAllOnesConstant(Mask)) // return just a truncate store
20736 return DAG.getTruncStore(Chain, dl, DataToTruncate, Addr, MemVT,
20737 MemIntr->getMemOperand());
20739 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20740 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20742 return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
20743 MemIntr->getMemOperand(), true /* truncating */);
20745 case X86ISD::VTRUNCUS:
20746 case X86ISD::VTRUNCS: {
20747 bool IsSigned = (TruncationOp == X86ISD::VTRUNCS);
20748 if (isAllOnesConstant(Mask))
20749 return EmitTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr, MemVT,
20750 MemIntr->getMemOperand(), DAG);
20752 MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
20753 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20755 return EmitMaskedTruncSStore(IsSigned, Chain, dl, DataToTruncate, Addr,
20756 VMask, MemVT, MemIntr->getMemOperand(), DAG);
20759 llvm_unreachable("Unsupported truncstore intrinsic");
20763 case EXPAND_FROM_MEM: {
20764 SDValue Mask = Op.getOperand(4);
20765 SDValue PassThru = Op.getOperand(3);
20766 SDValue Addr = Op.getOperand(2);
20767 SDValue Chain = Op.getOperand(0);
20768 MVT VT = Op.getSimpleValueType();
20770 MemIntrinsicSDNode *MemIntr = dyn_cast<MemIntrinsicSDNode>(Op);
20771 assert(MemIntr && "Expected MemIntrinsicSDNode!");
20773 if (isAllOnesConstant(Mask)) // Return a regular (unmasked) vector load.
20774 return DAG.getLoad(VT, dl, Chain, Addr, MemIntr->getMemOperand());
20775 if (X86::isZeroNode(Mask))
20776 return DAG.getUNDEF(VT);
20778 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
20779 SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
20780 return DAG.getMaskedLoad(VT, dl, Chain, Addr, VMask, PassThru, VT,
20781 MemIntr->getMemOperand(), ISD::NON_EXTLOAD,
20782 true /* expanding */);
20787 SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op,
20788 SelectionDAG &DAG) const {
20789 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
20790 MFI.setReturnAddressIsTaken(true);
if (verifyReturnAddressArgumentIsConstant(Op, DAG))
return SDValue();
20795 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());

if (Depth > 0) {
SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
20801 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20802 SDValue Offset = DAG.getConstant(RegInfo->getSlotSize(), dl, PtrVT);
20803 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(),
20804 DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset),
20805 MachinePointerInfo());
20808 // Just load the return address.
20809 SDValue RetAddrFI = getReturnAddressFrameIndex(DAG);
20810 return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI,
20811 MachinePointerInfo());
20814 SDValue X86TargetLowering::LowerADDROFRETURNADDR(SDValue Op,
20815 SelectionDAG &DAG) const {
20816 DAG.getMachineFunction().getFrameInfo().setReturnAddressIsTaken(true);
20817 return getReturnAddressFrameIndex(DAG);
20820 SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
20821 MachineFunction &MF = DAG.getMachineFunction();
20822 MachineFrameInfo &MFI = MF.getFrameInfo();
20823 X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
20824 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20825 EVT VT = Op.getValueType();
20827 MFI.setFrameAddressIsTaken(true);
20829 if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI()) {
// Depth > 0 makes no sense on targets which use Windows unwind codes. It
// is not possible to crawl up the stack without looking at the unwind codes
// simultaneously.
20833 int FrameAddrIndex = FuncInfo->getFAIndex();
20834 if (!FrameAddrIndex) {
20835 // Set up a frame object for the return address.
20836 unsigned SlotSize = RegInfo->getSlotSize();
20837 FrameAddrIndex = MF.getFrameInfo().CreateFixedObject(
20838 SlotSize, /*Offset=*/0, /*IsImmutable=*/false);
20839 FuncInfo->setFAIndex(FrameAddrIndex);
20841 return DAG.getFrameIndex(FrameAddrIndex, VT);
20844 unsigned FrameReg =
20845 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20846 SDLoc dl(Op); // FIXME probably not meaningful
20847 unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
20848 assert(((FrameReg == X86::RBP && VT == MVT::i64) ||
20849 (FrameReg == X86::EBP && VT == MVT::i32)) &&
20850 "Invalid Frame Register!");
20851 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
return FrameAddr;
20858 // FIXME? Maybe this could be a TableGen attribute on some registers and
20859 // this table could be generated automatically from RegInfo.
20860 unsigned X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
20861 SelectionDAG &DAG) const {
20862 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
20863 const MachineFunction &MF = DAG.getMachineFunction();
20865 unsigned Reg = StringSwitch<unsigned>(RegName)
20866 .Case("esp", X86::ESP)
20867 .Case("rsp", X86::RSP)
20868 .Case("ebp", X86::EBP)
20869 .Case("rbp", X86::RBP)
20872 if (Reg == X86::EBP || Reg == X86::RBP) {
20873 if (!TFI.hasFP(MF))
20874 report_fatal_error("register " + StringRef(RegName) +
20875 " is allocatable: function has no frame pointer");
20878 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20879 unsigned FrameReg =
20880 RegInfo->getPtrSizedFrameRegister(DAG.getMachineFunction());
20881 assert((FrameReg == X86::EBP || FrameReg == X86::RBP) &&
20882 "Invalid Frame Register!");
20890 report_fatal_error("Invalid register name global variable");
20893 SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
20894 SelectionDAG &DAG) const {
20895 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20896 return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
20899 unsigned X86TargetLowering::getExceptionPointerRegister(
20900 const Constant *PersonalityFn) const {
20901 if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
20902 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20904 return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
20907 unsigned X86TargetLowering::getExceptionSelectorRegister(
20908 const Constant *PersonalityFn) const {
20909 // Funclet personalities don't use selectors (the runtime does the selection).
20910 assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
20911 return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
20914 bool X86TargetLowering::needsFixedCatchObjects() const {
20915 return Subtarget.isTargetWin64();
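/// Lower EH_RETURN: write the handler address into the return-address slot
/// just above the saved frame pointer (adjusted by the requested offset) and
/// emit X86ISD::EH_RETURN with that store address in ECX/RCX.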
20918 SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
20919 SDValue Chain = Op.getOperand(0);
20920 SDValue Offset = Op.getOperand(1);
20921 SDValue Handler = Op.getOperand(2);
SDLoc dl(Op);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
20925 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
20926 unsigned FrameReg = RegInfo->getFrameRegister(DAG.getMachineFunction());
20927 assert(((FrameReg == X86::RBP && PtrVT == MVT::i64) ||
20928 (FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
20929 "Invalid Frame Register!");
20930 SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
20931 unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
20933 SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
dl));
20936 StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, StoreAddr, Offset);
20937 Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo());
20938 Chain = DAG.getCopyToReg(Chain, dl, StoreAddrReg, StoreAddr);
20940 return DAG.getNode(X86ISD::EH_RETURN, dl, MVT::Other, Chain,
20941 DAG.getRegister(StoreAddrReg, PtrVT));
20944 SDValue X86TargetLowering::lowerEH_SJLJ_SETJMP(SDValue Op,
20945 SelectionDAG &DAG) const {
SDLoc DL(Op);
// If the subtarget is not 64-bit, we may need the global base reg
20948 // after isel expand pseudo, i.e., after CGBR pass ran.
20949 // Therefore, ask for the GlobalBaseReg now, so that the pass
20950 // inserts the code for us in case we need it.
20951 // Otherwise, we will end up in a situation where we will
20952 // reference a virtual register that is not defined!
20953 if (!Subtarget.is64Bit()) {
20954 const X86InstrInfo *TII = Subtarget.getInstrInfo();
20955 (void)TII->getGlobalBaseReg(&DAG.getMachineFunction());
20957 return DAG.getNode(X86ISD::EH_SJLJ_SETJMP, DL,
20958 DAG.getVTList(MVT::i32, MVT::Other),
20959 Op.getOperand(0), Op.getOperand(1));
20962 SDValue X86TargetLowering::lowerEH_SJLJ_LONGJMP(SDValue Op,
20963 SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_LONGJMP, DL, MVT::Other,
20966 Op.getOperand(0), Op.getOperand(1));
20969 SDValue X86TargetLowering::lowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
20970 SelectionDAG &DAG) const {
SDLoc DL(Op);
return DAG.getNode(X86ISD::EH_SJLJ_SETUP_DISPATCH, DL, MVT::Other,
Op.getOperand(0));
20976 static SDValue LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) {
20977 return Op.getOperand(0);
20980 SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
20981 SelectionDAG &DAG) const {
20982 SDValue Root = Op.getOperand(0);
20983 SDValue Trmp = Op.getOperand(1); // trampoline
20984 SDValue FPtr = Op.getOperand(2); // nested function
20985 SDValue Nest = Op.getOperand(3); // 'nest' parameter value
SDLoc dl(Op);
const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
20989 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
20991 if (Subtarget.is64Bit()) {
20992 SDValue OutChains[6];
20994 // Large code-model.
20995 const unsigned char JMP64r = 0xFF; // 64-bit jmp through register opcode.
20996 const unsigned char MOV64ri = 0xB8; // X86::MOV64ri opcode.
20998 const unsigned char N86R10 = TRI->getEncodingValue(X86::R10) & 0x7;
20999 const unsigned char N86R11 = TRI->getEncodingValue(X86::R11) & 0x7;
21001 const unsigned char REX_WB = 0x40 | 0x08 | 0x01; // REX prefix
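// Taken together, the six stores below emit this 23-byte trampoline:
//   0:  49 bb <FPtr>   movabsq $FPtr, %r11
//   10: 49 ba <Nest>   movabsq $Nest, %r10
//   20: 49 ff e3       jmpq   *%r11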
21003 // Load the pointer to the nested function into R11.
21004 unsigned OpCode = ((MOV64ri | N86R11) << 8) | REX_WB; // movabsq r11
21005 SDValue Addr = Trmp;
21006 OutChains[0] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21007 Addr, MachinePointerInfo(TrmpAddr));
21009 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21010 DAG.getConstant(2, dl, MVT::i64));
OutChains[1] =
DAG.getStore(Root, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 2),
21013 /* Alignment = */ 2);
21015 // Load the 'nest' parameter value into R10.
21016 // R10 is specified in X86CallingConv.td
21017 OpCode = ((MOV64ri | N86R10) << 8) | REX_WB; // movabsq r10
21018 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21019 DAG.getConstant(10, dl, MVT::i64));
21020 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21021 Addr, MachinePointerInfo(TrmpAddr, 10));
21023 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21024 DAG.getConstant(12, dl, MVT::i64));
OutChains[3] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 12),
21027 /* Alignment = */ 2);
21029 // Jump to the nested function.
21030 OpCode = (JMP64r << 8) | REX_WB; // jmpq *...
21031 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21032 DAG.getConstant(20, dl, MVT::i64));
21033 OutChains[4] = DAG.getStore(Root, dl, DAG.getConstant(OpCode, dl, MVT::i16),
21034 Addr, MachinePointerInfo(TrmpAddr, 20));
21036 unsigned char ModRM = N86R11 | (4 << 3) | (3 << 6); // ...r11
21037 Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
21038 DAG.getConstant(22, dl, MVT::i64));
21039 OutChains[5] = DAG.getStore(Root, dl, DAG.getConstant(ModRM, dl, MVT::i8),
21040 Addr, MachinePointerInfo(TrmpAddr, 22));
21042 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
} else {
const Function *Func =
21045 cast<Function>(cast<SrcValueSDNode>(Op.getOperand(5))->getValue());
21046 CallingConv::ID CC = Func->getCallingConv();
21051 llvm_unreachable("Unsupported calling convention");
21052 case CallingConv::C:
21053 case CallingConv::X86_StdCall: {
21054 // Pass 'nest' parameter in ECX.
21055 // Must be kept in sync with X86CallingConv.td
21056 NestReg = X86::ECX;
21058 // Check that ECX wasn't needed by an 'inreg' parameter.
21059 FunctionType *FTy = Func->getFunctionType();
21060 const AttributeList &Attrs = Func->getAttributes();
21062 if (!Attrs.isEmpty() && !Func->isVarArg()) {
21063 unsigned InRegCount = 0;
unsigned Idx = 1;

for (FunctionType::param_iterator I = FTy->param_begin(),
21067 E = FTy->param_end(); I != E; ++I, ++Idx)
21068 if (Attrs.hasAttribute(Idx, Attribute::InReg)) {
21069 auto &DL = DAG.getDataLayout();
21070 // FIXME: should only count parameters that are lowered to integers.
21071 InRegCount += (DL.getTypeSizeInBits(*I) + 31) / 32;
21074 if (InRegCount > 2) {
21075 report_fatal_error("Nest register in use - reduce number of inreg"
21081 case CallingConv::X86_FastCall:
21082 case CallingConv::X86_ThisCall:
21083 case CallingConv::Fast:
21084 // Pass 'nest' parameter in EAX.
21085 // Must be kept in sync with X86CallingConv.td
21086 NestReg = X86::EAX;
21090 SDValue OutChains[4];
21091 SDValue Addr, Disp;
21093 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21094 DAG.getConstant(10, dl, MVT::i32));
21095 Disp = DAG.getNode(ISD::SUB, dl, MVT::i32, FPtr, Addr);
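// The four stores below emit this 10-byte trampoline:
//   0: b8+r <Nest>   movl $Nest, %NestReg
//   5: e9 <Disp>     jmp  FPtr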
21097 // This is storing the opcode for MOV32ri.
21098 const unsigned char MOV32ri = 0xB8; // X86::MOV32ri's opcode byte.
21099 const unsigned char N86Reg = TRI->getEncodingValue(NestReg) & 0x7;
OutChains[0] =
DAG.getStore(Root, dl, DAG.getConstant(MOV32ri | N86Reg, dl, MVT::i8),
21102 Trmp, MachinePointerInfo(TrmpAddr));
21104 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21105 DAG.getConstant(1, dl, MVT::i32));
OutChains[1] =
DAG.getStore(Root, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 1),
21108 /* Alignment = */ 1);
21110 const unsigned char JMP = 0xE9; // jmp <32bit dst> opcode.
21111 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21112 DAG.getConstant(5, dl, MVT::i32));
21113 OutChains[2] = DAG.getStore(Root, dl, DAG.getConstant(JMP, dl, MVT::i8),
21114 Addr, MachinePointerInfo(TrmpAddr, 5),
21115 /* Alignment = */ 1);
21117 Addr = DAG.getNode(ISD::ADD, dl, MVT::i32, Trmp,
21118 DAG.getConstant(6, dl, MVT::i32));
OutChains[3] =
DAG.getStore(Root, dl, Disp, Addr, MachinePointerInfo(TrmpAddr, 6),
21121 /* Alignment = */ 1);
21123 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
21127 SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
21128 SelectionDAG &DAG) const {
/*
 The rounding mode is in bits 11:10 of FPSR, and has the following
 settings:
   00 Round to nearest
   01 Round to -inf
   10 Round to +inf
   11 Round to 0

 FLT_ROUNDS, on the other hand, expects the following:
  -1 Undefined
   0 Round to 0
   1 Round to nearest
   2 Round to +inf
   3 Round to -inf

 To perform the conversion, we do:
   (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
*/
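// For example, FPSR RC bits 11:10 = 01 (round toward -inf) gives
// ((0 | (1 << 1)) + 1) & 3 = 3, which is FLT_ROUNDS' encoding for -inf.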
21148 MachineFunction &MF = DAG.getMachineFunction();
21149 const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
21150 unsigned StackAlignment = TFI.getStackAlignment();
21151 MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);

// Save FP Control Word to stack slot
21155 int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
21156 SDValue StackSlot =
21157 DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
21159 MachineMemOperand *MMO =
21160 MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
21161 MachineMemOperand::MOStore, 2, 2);
21163 SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
21164 SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
21165 DAG.getVTList(MVT::Other),
21166 Ops, MVT::i16, MMO);
21168 // Load FP Control Word from stack slot
SDValue CWD =
DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
// Transform as necessary
SDValue CWD1 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x800, DL, MVT::i16)),
DAG.getConstant(11, DL, MVT::i8));
SDValue CWD2 =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
CWD, DAG.getConstant(0x400, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));

SDValue RetVal =
DAG.getNode(ISD::AND, DL, MVT::i16,
DAG.getNode(ISD::ADD, DL, MVT::i16,
DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
DAG.getConstant(1, DL, MVT::i16)),
DAG.getConstant(3, DL, MVT::i16));
21191 return DAG.getNode((VT.getSizeInBits() < 16 ?
21192 ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
// Split a unary integer op into 2 half-sized ops.
21196 static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
21197 MVT VT = Op.getSimpleValueType();
21198 unsigned NumElems = VT.getVectorNumElements();
21199 unsigned SizeInBits = VT.getSizeInBits();
21201 // Extract the Lo/Hi vectors
SDLoc dl(Op);
SDValue Src = Op.getOperand(0);
21204 SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
21205 SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
21207 MVT EltVT = VT.getVectorElementType();
21208 MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
21209 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21210 DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
21211 DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
21214 // Decompose 256-bit ops into smaller 128-bit ops.
21215 static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
21216 assert(Op.getSimpleValueType().is256BitVector() &&
21217 Op.getSimpleValueType().isInteger() &&
21218 "Only handle AVX 256-bit vector integer operation");
21219 return LowerVectorIntUnary(Op, DAG);
21222 // Decompose 512-bit ops into smaller 256-bit ops.
21223 static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
21224 assert(Op.getSimpleValueType().is512BitVector() &&
21225 Op.getSimpleValueType().isInteger() &&
21226 "Only handle AVX 512-bit vector integer operation");
21227 return LowerVectorIntUnary(Op, DAG);
/// \brief Lower a vector CTLZ using the natively supported vector CTLZ instruction.
//
// i8/i16 vectors are implemented using the dword LZCNT vector instruction
// ( sub(trunc(lzcnt(zext32(x)))) ). In case zext32(x) is illegal,
// split the vector, perform the operation on its Lo and Hi parts and
// concatenate the results.
21236 static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG) {
21237 assert(Op.getOpcode() == ISD::CTLZ);
SDLoc dl(Op);
MVT VT = Op.getSimpleValueType();
21240 MVT EltVT = VT.getVectorElementType();
21241 unsigned NumElems = VT.getVectorNumElements();
21243 assert((EltVT == MVT::i8 || EltVT == MVT::i16) &&
21244 "Unsupported element type");
// Split the vector; its Lo and Hi parts will be handled in the next iteration.
if (16 < NumElems)
return LowerVectorIntUnary(Op, DAG);
21250 MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
21251 assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
21252 "Unsupported value type for operation");
21254 // Use native supported vector instruction vplzcntd.
21255 Op = DAG.getNode(ISD::ZERO_EXTEND, dl, NewVT, Op.getOperand(0));
21256 SDValue CtlzNode = DAG.getNode(ISD::CTLZ, dl, NewVT, Op);
21257 SDValue TruncNode = DAG.getNode(ISD::TRUNCATE, dl, VT, CtlzNode);
21258 SDValue Delta = DAG.getConstant(32 - EltVT.getSizeInBits(), dl, VT);
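// The i32 count includes the 32 - EltVT.getSizeInBits() leading zeros that
// the zero-extension introduced, so subtract them back out.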
21260 return DAG.getNode(ISD::SUB, dl, VT, TruncNode, Delta);
21263 // Lower CTLZ using a PSHUFB lookup table implementation.
21264 static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
21265 const X86Subtarget &Subtarget,
21266 SelectionDAG &DAG) {
21267 MVT VT = Op.getSimpleValueType();
21268 int NumElts = VT.getVectorNumElements();
21269 int NumBytes = NumElts * (VT.getScalarSizeInBits() / 8);
21270 MVT CurrVT = MVT::getVectorVT(MVT::i8, NumBytes);
21272 // Per-nibble leading zero PSHUFB lookup table.
21273 const int LUT[16] = {/* 0 */ 4, /* 1 */ 3, /* 2 */ 2, /* 3 */ 2,
21274 /* 4 */ 1, /* 5 */ 1, /* 6 */ 1, /* 7 */ 1,
21275 /* 8 */ 0, /* 9 */ 0, /* a */ 0, /* b */ 0,
21276 /* c */ 0, /* d */ 0, /* e */ 0, /* f */ 0};
21278 SmallVector<SDValue, 64> LUTVec;
21279 for (int i = 0; i < NumBytes; ++i)
21280 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
21281 SDValue InRegLUT = DAG.getBuildVector(CurrVT, DL, LUTVec);
// Begin by bitcasting the input to a byte vector, then split those bytes
// into lo/hi nibbles and use the PSHUFB LUT to perform CTLZ on each of them.
// If the hi input nibble is zero then we add both results together, otherwise
// we just take the hi result (by masking the lo result to zero before the
// addition).
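// For example, for the byte 0x1a the hi nibble is 0x1, so LUT[1] = 3 leading
// zeros; since the hi nibble is non-zero, the lo result is masked away and
// ctlz(0x1a) = 3.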
21288 SDValue Op0 = DAG.getBitcast(CurrVT, Op.getOperand(0));
21289 SDValue Zero = getZeroVector(CurrVT, Subtarget, DAG, DL);
21291 SDValue NibbleMask = DAG.getConstant(0xF, DL, CurrVT);
21292 SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
21293 SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
21294 SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
21295 SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
21297 Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
21298 Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
21299 Lo = DAG.getNode(ISD::AND, DL, CurrVT, Lo, HiZ);
21300 SDValue Res = DAG.getNode(ISD::ADD, DL, CurrVT, Lo, Hi);
21302 // Merge result back from vXi8 back to VT, working on the lo/hi halves
21303 // of the current vector width in the same way we did for the nibbles.
21304 // If the upper half of the input element is zero then add the halves'
21305 // leading zero counts together, otherwise just use the upper half's.
21306 // Double the width of the result until we are at target width.
21307 while (CurrVT != VT) {
21308 int CurrScalarSizeInBits = CurrVT.getScalarSizeInBits();
21309 int CurrNumElts = CurrVT.getVectorNumElements();
21310 MVT NextSVT = MVT::getIntegerVT(CurrScalarSizeInBits * 2);
21311 MVT NextVT = MVT::getVectorVT(NextSVT, CurrNumElts / 2);
21312 SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
21314 // Check if the upper half of the input element is zero.
21315 SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
21316 DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
21317 HiZ = DAG.getBitcast(NextVT, HiZ);
21319 // Move the upper/lower halves to the lower bits as we'll be extending to
21320 // NextVT. Mask the lower result to zero if HiZ is true and add the results
21322 SDValue ResNext = Res = DAG.getBitcast(NextVT, Res);
21323 SDValue R0 = DAG.getNode(ISD::SRL, DL, NextVT, ResNext, Shift);
21324 SDValue R1 = DAG.getNode(ISD::SRL, DL, NextVT, HiZ, Shift);
21325 R1 = DAG.getNode(ISD::AND, DL, NextVT, ResNext, R1);
Res = DAG.getNode(ISD::ADD, DL, NextVT, R0, R1);
CurrVT = NextVT;
}

return Res;
}
21333 static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
21334 const X86Subtarget &Subtarget,
21335 SelectionDAG &DAG) {
21336 MVT VT = Op.getSimpleValueType();
21338 if (Subtarget.hasCDI())
21339 return LowerVectorCTLZ_AVX512CDI(Op, DAG);
21341 // Decompose 256-bit ops into smaller 128-bit ops.
21342 if (VT.is256BitVector() && !Subtarget.hasInt256())
21343 return Lower256IntUnary(Op, DAG);
21345 // Decompose 512-bit ops into smaller 256-bit ops.
21346 if (VT.is512BitVector() && !Subtarget.hasBWI())
21347 return Lower512IntUnary(Op, DAG);
21349 assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
21350 return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
21353 static SDValue LowerCTLZ(SDValue Op, const X86Subtarget &Subtarget,
21354 SelectionDAG &DAG) {
21355 MVT VT = Op.getSimpleValueType();
MVT OpVT = VT;
unsigned NumBits = VT.getSizeInBits();
SDLoc dl(Op);
unsigned Opc = Op.getOpcode();

if (VT.isVector())
return LowerVectorCTLZ(Op, dl, Subtarget, DAG);
Op = Op.getOperand(0);
if (VT == MVT::i8) {
// Zero extend to i32 since there is not an i8 bsr.
OpVT = MVT::i32;
Op = DAG.getNode(ISD::ZERO_EXTEND, dl, OpVT, Op);
}
21371 // Issue a bsr (scan bits in reverse) which also sets EFLAGS.
21372 SDVTList VTs = DAG.getVTList(OpVT, MVT::i32);
21373 Op = DAG.getNode(X86ISD::BSR, dl, VTs, Op);
21375 if (Opc == ISD::CTLZ) {
// If src is zero (i.e. bsr sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits + NumBits - 1, dl, OpVT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
Op = DAG.getNode(X86ISD::CMOV, dl, OpVT, Ops);
}
21386 // Finally xor with NumBits-1.
21387 Op = DAG.getNode(ISD::XOR, dl, OpVT, Op,
21388 DAG.getConstant(NumBits - 1, dl, OpVT));
if (VT == MVT::i8)
Op = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Op);
return Op;
}
21395 static SDValue LowerCTTZ(SDValue Op, SelectionDAG &DAG) {
21396 MVT VT = Op.getSimpleValueType();
21397 unsigned NumBits = VT.getScalarSizeInBits();
SDLoc dl(Op);

if (VT.isVector()) {
21401 SDValue N0 = Op.getOperand(0);
21402 SDValue Zero = DAG.getConstant(0, dl, VT);
21404 // lsb(x) = (x & -x)
21405 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, N0,
21406 DAG.getNode(ISD::SUB, dl, VT, Zero, N0));
21408 // cttz_undef(x) = (width - 1) - ctlz(lsb)
21409 if (Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF) {
21410 SDValue WidthMinusOne = DAG.getConstant(NumBits - 1, dl, VT);
21411 return DAG.getNode(ISD::SUB, dl, VT, WidthMinusOne,
21412 DAG.getNode(ISD::CTLZ, dl, VT, LSB));
21415 // cttz(x) = ctpop(lsb - 1)
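// e.g. x = 0b0110'0000: lsb = 0b0010'0000, ctpop(0b0001'1111) = 5 = cttz(x).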
21416 SDValue One = DAG.getConstant(1, dl, VT);
21417 return DAG.getNode(ISD::CTPOP, dl, VT,
21418 DAG.getNode(ISD::SUB, dl, VT, LSB, One));
21421 assert(Op.getOpcode() == ISD::CTTZ &&
21422 "Only scalar CTTZ requires custom lowering");
21424 // Issue a bsf (scan bits forward) which also sets EFLAGS.
21425 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
21426 Op = DAG.getNode(X86ISD::BSF, dl, VTs, Op.getOperand(0));
// If src is zero (i.e. bsf sets ZF), returns NumBits.
SDValue Ops[] = {
Op,
DAG.getConstant(NumBits, dl, VT),
DAG.getConstant(X86::COND_E, dl, MVT::i8),
Op.getValue(1)
};
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
21438 /// Break a 256-bit integer operation into two new 128-bit ones and then
21439 /// concatenate the result back.
21440 static SDValue Lower256IntArith(SDValue Op, SelectionDAG &DAG) {
21441 MVT VT = Op.getSimpleValueType();
21443 assert(VT.is256BitVector() && VT.isInteger() &&
21444 "Unsupported value type for operation");
21446 unsigned NumElems = VT.getVectorNumElements();
21447 SDLoc dl(Op);
21449 // Extract the LHS vectors
21450 SDValue LHS = Op.getOperand(0);
21451 SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
21452 SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
21454 // Extract the RHS vectors
21455 SDValue RHS = Op.getOperand(1);
21456 SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
21457 SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
21459 MVT EltVT = VT.getVectorElementType();
21460 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21462 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21463 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21464 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21467 /// Break a 512-bit integer operation into two new 256-bit ones and then
21468 /// concatenate the result back.
21469 static SDValue Lower512IntArith(SDValue Op, SelectionDAG &DAG) {
21470 MVT VT = Op.getSimpleValueType();
21472 assert(VT.is512BitVector() && VT.isInteger() &&
21473 "Unsupported value type for operation");
21475 unsigned NumElems = VT.getVectorNumElements();
21476 SDLoc dl(Op);
21478 // Extract the LHS vectors
21479 SDValue LHS = Op.getOperand(0);
21480 SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
21481 SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
21483 // Extract the RHS vectors
21484 SDValue RHS = Op.getOperand(1);
21485 SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
21486 SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
21488 MVT EltVT = VT.getVectorElementType();
21489 MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
21491 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
21492 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
21493 DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
21496 static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
21497 MVT VT = Op.getSimpleValueType();
21498 if (VT.getScalarType() == MVT::i1)
21499 return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
21500 Op.getOperand(0), Op.getOperand(1));
21501 assert(Op.getSimpleValueType().is256BitVector() &&
21502 Op.getSimpleValueType().isInteger() &&
21503 "Only handle AVX 256-bit vector integer operation");
21504 return Lower256IntArith(Op, DAG);
21507 static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
21508 assert(Op.getSimpleValueType().is256BitVector() &&
21509 Op.getSimpleValueType().isInteger() &&
21510 "Only handle AVX 256-bit vector integer operation");
21511 return Lower256IntUnary(Op, DAG);
21514 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
21515 assert(Op.getSimpleValueType().is256BitVector() &&
21516 Op.getSimpleValueType().isInteger() &&
21517 "Only handle AVX 256-bit vector integer operation");
21518 return Lower256IntArith(Op, DAG);
21521 static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
21522 SelectionDAG &DAG) {
21523 SDLoc dl(Op);
21524 MVT VT = Op.getSimpleValueType();
21526 if (VT.getScalarType() == MVT::i1)
21527 return DAG.getNode(ISD::AND, dl, VT, Op.getOperand(0), Op.getOperand(1));
21529 // Decompose 256-bit ops into smaller 128-bit ops.
21530 if (VT.is256BitVector() && !Subtarget.hasInt256())
21531 return Lower256IntArith(Op, DAG);
21533 SDValue A = Op.getOperand(0);
21534 SDValue B = Op.getOperand(1);
21536 // Lower v16i8/v32i8/v64i8 mul as sign-extension to v8i16/v16i16/v32i16
21537 // vector pairs, multiply and truncate.
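// Note that the truncate keeps only the low 8 bits of each product, and those
// bits are the same whether the inputs were sign- or zero-extended, so the
// sign-extension below is a valid (and convenient) choice.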
21538 if (VT == MVT::v16i8 || VT == MVT::v32i8 || VT == MVT::v64i8) {
21539 if (Subtarget.hasInt256()) {
21540 // For 512-bit vectors, split into 256-bit vectors to allow the
21541 // sign-extension to occur.
21542 if (VT == MVT::v64i8)
21543 return Lower512IntArith(Op, DAG);
21545 // For 256-bit vectors, split into 128-bit vectors to allow the
21546 // sign-extension to occur. We don't need this on AVX512BW as we can
21547 // safely sign-extend to v32i16.
21548 if (VT == MVT::v32i8 && !Subtarget.hasBWI())
21549 return Lower256IntArith(Op, DAG);
21551 MVT ExVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements());
21552 return DAG.getNode(
21553 ISD::TRUNCATE, dl, VT,
21554 DAG.getNode(ISD::MUL, dl, ExVT,
21555 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, A),
21556 DAG.getNode(ISD::SIGN_EXTEND, dl, ExVT, B)));
21559 assert(VT == MVT::v16i8 &&
21560 "Pre-AVX2 support only supports v16i8 multiplication");
21561 MVT ExVT = MVT::v8i16;
21563 // Extract the lo parts and sign extend to i16
21564 SDValue ALo, BLo;
21565 if (Subtarget.hasSSE41()) {
21566 ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
21567 BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
21569 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21570 -1, 4, -1, 5, -1, 6, -1, 7};
21571 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21572 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21573 ALo = DAG.getBitcast(ExVT, ALo);
21574 BLo = DAG.getBitcast(ExVT, BLo);
21575 ALo = DAG.getNode(ISD::SRA, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21576 BLo = DAG.getNode(ISD::SRA, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21579 // Extract the hi parts and sign extend to i16
21580 SDValue AHi, BHi;
21581 if (Subtarget.hasSSE41()) {
21582 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21583 -1, -1, -1, -1, -1, -1, -1, -1};
21584 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21585 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21586 AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
21587 BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
21589 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21590 -1, 12, -1, 13, -1, 14, -1, 15};
21591 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21592 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21593 AHi = DAG.getBitcast(ExVT, AHi);
21594 BHi = DAG.getBitcast(ExVT, BHi);
21595 AHi = DAG.getNode(ISD::SRA, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21596 BHi = DAG.getNode(ISD::SRA, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21599 // Multiply, mask the lower 8 bits of the lo/hi results and pack
21600 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21601 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21602 RLo = DAG.getNode(ISD::AND, dl, ExVT, RLo, DAG.getConstant(255, dl, ExVT));
21603 RHi = DAG.getNode(ISD::AND, dl, ExVT, RHi, DAG.getConstant(255, dl, ExVT));
21604 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21607 // Lower v4i32 mul as 2x shuffle, 2x pmuludq, 2x shuffle.
21608 if (VT == MVT::v4i32) {
21609 assert(Subtarget.hasSSE2() && !Subtarget.hasSSE41() &&
21610 "Should not custom lower when pmuldq is available!");
21612 // Extract the odd parts.
21613 static const int UnpackMask[] = { 1, -1, 3, -1 };
21614 SDValue Aodds = DAG.getVectorShuffle(VT, dl, A, A, UnpackMask);
21615 SDValue Bodds = DAG.getVectorShuffle(VT, dl, B, B, UnpackMask);
21617 // Multiply the even parts.
21618 SDValue Evens = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, A, B);
21619 // Now multiply odd parts.
21620 SDValue Odds = DAG.getNode(X86ISD::PMULUDQ, dl, MVT::v2i64, Aodds, Bodds);
21622 Evens = DAG.getBitcast(VT, Evens);
21623 Odds = DAG.getBitcast(VT, Odds);
21625 // Merge the two vectors back together with a shuffle. This expands into 2
21627 static const int ShufMask[] = { 0, 4, 2, 6 };
21628 return DAG.getVectorShuffle(VT, dl, Evens, Odds, ShufMask);
21631 assert((VT == MVT::v2i64 || VT == MVT::v4i64 || VT == MVT::v8i64) &&
21632 "Only know how to lower V2I64/V4I64/V8I64 multiply");
21634 // 32-bit vector types used for MULDQ/MULUDQ.
21635 MVT MulVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits() / 32);
21637 // MULDQ returns the 64-bit result of the signed multiplication of the lower
21638 // 32-bits. We can lower with this if the sign bits stretch that far.
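// (Sketch of the reasoning: more than 32 sign bits means each 64-bit value
// already fits in a signed 32-bit range, so multiplying just the low 32-bit
// halves with PMULDQ reproduces the full 64-bit product.)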
21639 if (Subtarget.hasSSE41() && DAG.ComputeNumSignBits(A) > 32 &&
21640 DAG.ComputeNumSignBits(B) > 32) {
21641 return DAG.getNode(X86ISD::PMULDQ, dl, VT, DAG.getBitcast(MulVT, A),
21642 DAG.getBitcast(MulVT, B));
21645 // Ahi = psrlqi(a, 32);
21646 // Bhi = psrlqi(b, 32);
21648 // AloBlo = pmuludq(a, b);
21649 // AloBhi = pmuludq(a, Bhi);
21650 // AhiBlo = pmuludq(Ahi, b);
21652 // Hi = psllqi(AloBhi + AhiBlo, 32);
21653 // return AloBlo + Hi;
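// Sketch of why this works: writing a = (Ahi << 32) + Alo and
// b = (Bhi << 32) + Blo, the product modulo 2^64 is
// AloBlo + ((AloBhi + AhiBlo) << 32); the Ahi*Bhi term is shifted entirely
// out of the low 64 bits, so it never needs to be computed.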
21654 APInt LowerBitsMask = APInt::getLowBitsSet(64, 32);
21655 bool ALoIsZero = DAG.MaskedValueIsZero(A, LowerBitsMask);
21656 bool BLoIsZero = DAG.MaskedValueIsZero(B, LowerBitsMask);
21658 APInt UpperBitsMask = APInt::getHighBitsSet(64, 32);
21659 bool AHiIsZero = DAG.MaskedValueIsZero(A, UpperBitsMask);
21660 bool BHiIsZero = DAG.MaskedValueIsZero(B, UpperBitsMask);
21662 // Bit cast to 32-bit vectors for MULUDQ.
21663 SDValue Alo = DAG.getBitcast(MulVT, A);
21664 SDValue Blo = DAG.getBitcast(MulVT, B);
21666 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
21668 // Only multiply lo/hi halves that aren't known to be zero.
21669 SDValue AloBlo = Zero;
21670 if (!ALoIsZero && !BLoIsZero)
21671 AloBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Blo);
21673 SDValue AloBhi = Zero;
21674 if (!ALoIsZero && !BHiIsZero) {
21675 SDValue Bhi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, B, 32, DAG);
21676 Bhi = DAG.getBitcast(MulVT, Bhi);
21677 AloBhi = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Alo, Bhi);
21680 SDValue AhiBlo = Zero;
21681 if (!AHiIsZero && !BLoIsZero) {
21682 SDValue Ahi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, A, 32, DAG);
21683 Ahi = DAG.getBitcast(MulVT, Ahi);
21684 AhiBlo = DAG.getNode(X86ISD::PMULUDQ, dl, VT, Ahi, Blo);
21687 SDValue Hi = DAG.getNode(ISD::ADD, dl, VT, AloBhi, AhiBlo);
21688 Hi = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, VT, Hi, 32, DAG);
21690 return DAG.getNode(ISD::ADD, dl, VT, AloBlo, Hi);
21693 static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
21694 SelectionDAG &DAG) {
21695 SDLoc dl(Op);
21696 MVT VT = Op.getSimpleValueType();
21698 // Decompose 256-bit ops into smaller 128-bit ops.
21699 if (VT.is256BitVector() && !Subtarget.hasInt256())
21700 return Lower256IntArith(Op, DAG);
21702 // Only i8 vectors should need custom lowering after this.
21703 assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
21704 "Unsupported vector type");
21706 // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
21707 // logical shift down the upper half and pack back to i8.
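// A quick unsigned example: 200 * 100 = 20000 = 0x4E20 in the widened i16
// lane, so the logical shift right by 8 leaves 0x4E = 78, which is exactly
// mulhu(200, 100) for i8.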
21708 SDValue A = Op.getOperand(0);
21709 SDValue B = Op.getOperand(1);
21711 // With SSE41 we can use sign/zero extend, but for pre-SSE41 we unpack
21712 // and then ashr/lshr the upper bits down to the lower bits before multiply.
21713 unsigned Opcode = Op.getOpcode();
21714 unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
21715 unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
21717 // AVX2 implementations - extend xmm subvectors to ymm.
21718 if (Subtarget.hasInt256()) {
21719 SDValue Lo = DAG.getIntPtrConstant(0, dl);
21720 SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
21722 if (VT == MVT::v32i8) {
21723 SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
21724 SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
21725 SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
21726 SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
21727 ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
21728 BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
21729 AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
21730 BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
21731 Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21732 DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
21733 DAG.getConstant(8, dl, MVT::v16i16));
21734 Hi = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
21735 DAG.getNode(ISD::MUL, dl, MVT::v16i16, AHi, BHi),
21736 DAG.getConstant(8, dl, MVT::v16i16));
21737 // The ymm variant of PACKUS treats the 128-bit lanes separately, so before
21738 // using PACKUS we need to permute the inputs to the correct lo/hi xmm lane.
21739 const int LoMask[] = {0, 1, 2, 3, 4, 5, 6, 7,
21740 16, 17, 18, 19, 20, 21, 22, 23};
21741 const int HiMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21742 24, 25, 26, 27, 28, 29, 30, 31};
21743 return DAG.getNode(X86ISD::PACKUS, dl, VT,
21744 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, LoMask),
21745 DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
21748 SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
21749 SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
21750 SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
21751 SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
21752 DAG.getConstant(8, dl, MVT::v16i16));
21753 Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
21754 Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
21755 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
21758 assert(VT == MVT::v16i8 &&
21759 "Pre-AVX2 support only supports v16i8 multiplication");
21760 MVT ExVT = MVT::v8i16;
21762 // Extract the lo parts and zero/sign extend to i16.
21763 SDValue ALo, BLo;
21764 if (Subtarget.hasSSE41()) {
21765 ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
21766 BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
21768 const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
21769 -1, 4, -1, 5, -1, 6, -1, 7};
21770 ALo = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21771 BLo = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21772 ALo = DAG.getBitcast(ExVT, ALo);
21773 BLo = DAG.getBitcast(ExVT, BLo);
21774 ALo = DAG.getNode(ExShift, dl, ExVT, ALo, DAG.getConstant(8, dl, ExVT));
21775 BLo = DAG.getNode(ExShift, dl, ExVT, BLo, DAG.getConstant(8, dl, ExVT));
21778 // Extract the hi parts and zero/sign extend to i16.
21779 SDValue AHi, BHi;
21780 if (Subtarget.hasSSE41()) {
21781 const int ShufMask[] = {8, 9, 10, 11, 12, 13, 14, 15,
21782 -1, -1, -1, -1, -1, -1, -1, -1};
21783 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21784 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21785 AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
21786 BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
21788 const int ShufMask[] = {-1, 8, -1, 9, -1, 10, -1, 11,
21789 -1, 12, -1, 13, -1, 14, -1, 15};
21790 AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
21791 BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
21792 AHi = DAG.getBitcast(ExVT, AHi);
21793 BHi = DAG.getBitcast(ExVT, BHi);
21794 AHi = DAG.getNode(ExShift, dl, ExVT, AHi, DAG.getConstant(8, dl, ExVT));
21795 BHi = DAG.getNode(ExShift, dl, ExVT, BHi, DAG.getConstant(8, dl, ExVT));
21798 // Multiply, lshr the upper 8 bits to the lower 8 bits of the lo/hi results and
21799 // pack back to v16i8.
21800 SDValue RLo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
21801 SDValue RHi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
21802 RLo = DAG.getNode(ISD::SRL, dl, ExVT, RLo, DAG.getConstant(8, dl, ExVT));
21803 RHi = DAG.getNode(ISD::SRL, dl, ExVT, RHi, DAG.getConstant(8, dl, ExVT));
21804 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
21807 SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const {
21808 assert(Subtarget.isTargetWin64() && "Unexpected target");
21809 EVT VT = Op.getValueType();
21810 assert(VT.isInteger() && VT.getSizeInBits() == 128 &&
21811 "Unexpected return type for lowering");
21813 RTLIB::Libcall LC;
21814 bool isSigned;
21815 switch (Op->getOpcode()) {
21816 default: llvm_unreachable("Unexpected request for libcall!");
21817 case ISD::SDIV: isSigned = true; LC = RTLIB::SDIV_I128; break;
21818 case ISD::UDIV: isSigned = false; LC = RTLIB::UDIV_I128; break;
21819 case ISD::SREM: isSigned = true; LC = RTLIB::SREM_I128; break;
21820 case ISD::UREM: isSigned = false; LC = RTLIB::UREM_I128; break;
21821 case ISD::SDIVREM: isSigned = true; LC = RTLIB::SDIVREM_I128; break;
21822 case ISD::UDIVREM: isSigned = false; LC = RTLIB::UDIVREM_I128; break;
21823 }
21825 SDLoc dl(Op);
21826 SDValue InChain = DAG.getEntryNode();
21828 TargetLowering::ArgListTy Args;
21829 TargetLowering::ArgListEntry Entry;
21830 for (unsigned i = 0, e = Op->getNumOperands(); i != e; ++i) {
21831 EVT ArgVT = Op->getOperand(i).getValueType();
21832 assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
21833 "Unexpected argument type for lowering");
21834 SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
21835 Entry.Node = StackPtr;
21836 InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
21837 MachinePointerInfo(), /* Alignment = */ 16);
21838 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
21839 Entry.Ty = PointerType::get(ArgTy,0);
21840 Entry.IsSExt = false;
21841 Entry.IsZExt = false;
21842 Args.push_back(Entry);
21845 SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
21846 getPointerTy(DAG.getDataLayout()));
21848 TargetLowering::CallLoweringInfo CLI(DAG);
21849 CLI.setDebugLoc(dl)
21850 .setChain(InChain)
21851 .setLibCallee(
21852 getLibcallCallingConv(LC),
21853 static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
21854 std::move(Args))
21855 .setInRegister()
21856 .setSExtResult(isSigned)
21857 .setZExtResult(!isSigned);
21857 .setZExtResult(!isSigned);
21859 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
21860 return DAG.getBitcast(VT, CallInfo.first);
21863 static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
21864 SelectionDAG &DAG) {
21865 SDValue Op0 = Op.getOperand(0), Op1 = Op.getOperand(1);
21866 MVT VT = Op0.getSimpleValueType();
21867 SDLoc dl(Op);
21869 // Decompose 256-bit ops into smaller 128-bit ops.
21870 if (VT.is256BitVector() && !Subtarget.hasInt256()) {
21871 unsigned Opcode = Op.getOpcode();
21872 unsigned NumElems = VT.getVectorNumElements();
21873 MVT HalfVT = MVT::getVectorVT(VT.getScalarType(), NumElems / 2);
21874 SDValue Lo0 = extract128BitVector(Op0, 0, DAG, dl);
21875 SDValue Lo1 = extract128BitVector(Op1, 0, DAG, dl);
21876 SDValue Hi0 = extract128BitVector(Op0, NumElems / 2, DAG, dl);
21877 SDValue Hi1 = extract128BitVector(Op1, NumElems / 2, DAG, dl);
21878 SDValue Lo = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Lo0, Lo1);
21879 SDValue Hi = DAG.getNode(Opcode, dl, DAG.getVTList(HalfVT, HalfVT), Hi0, Hi1);
21880 SDValue Ops[] = {
21881 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(0), Hi.getValue(0)),
21882 DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo.getValue(1), Hi.getValue(1))
21883 };
21884 return DAG.getMergeValues(Ops, dl);
21885 }
21887 assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
21888 (VT == MVT::v8i32 && Subtarget.hasInt256()));
21890 // PMULxD operations multiply each even value (starting at 0) of LHS with
21891 // the related value of RHS and produce a widened result.
21892 // E.g., PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21893 // => <2 x i64> <ae|cg>
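// For instance, with <1|2|3|4> and <5|6|7|8> this produces <2 x i64> <5|21>
// (the products 1*5 and 3*7), leaving the odd products 2*6 and 4*8 still to
// be computed.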
21895 // In other words, to have all the results, we need to perform two PMULxD:
21896 // 1. one with the even values.
21897 // 2. one with the odd values.
21898 // To achieve #2, we need to place the odd values at an even position.
21900 // Place the odd value at an even position (basically, shift all values 1
21901 // step to the left):
21902 const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
21903 // <a|b|c|d> => <b|undef|d|undef>
21904 SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
21905 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21906 // <e|f|g|h> => <f|undef|h|undef>
21907 SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
21908 makeArrayRef(&Mask[0], VT.getVectorNumElements()));
21910 // Emit two multiplies, one for the lower 2 ints and one for the higher 2
21912 MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
21913 bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
21914 unsigned Opcode =
21915 (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
21916 // PMULUDQ <4 x i32> <a|b|c|d>, <4 x i32> <e|f|g|h>
21917 // => <2 x i64> <ae|cg>
21918 SDValue Mul1 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Op0, Op1));
21919 // PMULUDQ <4 x i32> <b|undef|d|undef>, <4 x i32> <f|undef|h|undef>
21920 // => <2 x i64> <bf|dh>
21921 SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
21923 // Shuffle it back into the right order.
21924 SDValue Highs, Lows;
21925 if (VT == MVT::v8i32) {
21926 const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
21927 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21928 const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
21929 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21931 const int HighMask[] = {1, 5, 3, 7};
21932 Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
21933 const int LowMask[] = {0, 4, 2, 6};
21934 Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
21937 // If we have a signed multiply but no PMULDQ fix up the high parts of a
21938 // unsigned multiply.
21939 if (IsSigned && !Subtarget.hasSSE41()) {
21940 SDValue ShAmt = DAG.getConstant(
21941 31, dl,
21942 DAG.getTargetLoweringInfo().getShiftAmountTy(VT, DAG.getDataLayout()));
21943 SDValue T1 = DAG.getNode(ISD::AND, dl, VT,
21944 DAG.getNode(ISD::SRA, dl, VT, Op0, ShAmt), Op1);
21945 SDValue T2 = DAG.getNode(ISD::AND, dl, VT,
21946 DAG.getNode(ISD::SRA, dl, VT, Op1, ShAmt), Op0);
21948 SDValue Fixup = DAG.getNode(ISD::ADD, dl, VT, T1, T2);
21949 Highs = DAG.getNode(ISD::SUB, dl, VT, Highs, Fixup);
21952 // The first result of MUL_LOHI is actually the low value, followed by the
21954 SDValue Ops[] = {Lows, Highs};
21955 return DAG.getMergeValues(Ops, dl);
21958 // Return true if the required (according to Opcode) shift-imm form is natively
21959 // supported by the Subtarget
21960 static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
21961 unsigned Opcode) {
21962 if (VT.getScalarSizeInBits() < 16)
21963 return false;
21965 if (VT.is512BitVector() && Subtarget.hasAVX512() &&
21966 (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
21967 return true;
21969 bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
21970 (VT.is256BitVector() && Subtarget.hasInt256());
21972 bool AShift = LShift && (Subtarget.hasAVX512() ||
21973 (VT != MVT::v2i64 && VT != MVT::v4i64));
21974 return (Opcode == ISD::SRA) ? AShift : LShift;
21977 // The shift amount is a variable, but it is the same for all vector lanes.
21978 // These instructions are defined together with shift-immediate.
21979 static
21980 bool SupportedVectorShiftWithBaseAmnt(MVT VT, const X86Subtarget &Subtarget,
21981 unsigned Opcode) {
21982 return SupportedVectorShiftWithImm(VT, Subtarget, Opcode);
21983 }
21985 // Return true if the required (according to Opcode) variable-shift form is
21986 // natively supported by the Subtarget
21987 static bool SupportedVectorVarShift(MVT VT, const X86Subtarget &Subtarget,
21988 unsigned Opcode) {
21990 if (!Subtarget.hasInt256() || VT.getScalarSizeInBits() < 16)
21991 return false;
21993 // vXi16 supported only on AVX-512, BWI
21994 if (VT.getScalarSizeInBits() == 16 && !Subtarget.hasBWI())
21995 return false;
21997 if (Subtarget.hasAVX512())
21998 return true;
22000 bool LShift = VT.is128BitVector() || VT.is256BitVector();
22001 bool AShift = LShift && VT != MVT::v2i64 && VT != MVT::v4i64;
22002 return (Opcode == ISD::SRA) ? AShift : LShift;
22005 static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
22006 const X86Subtarget &Subtarget) {
22007 MVT VT = Op.getSimpleValueType();
22008 SDLoc dl(Op);
22009 SDValue R = Op.getOperand(0);
22010 SDValue Amt = Op.getOperand(1);
22012 unsigned X86Opc = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22013 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22015 auto ArithmeticShiftRight64 = [&](uint64_t ShiftAmt) {
22016 assert((VT == MVT::v2i64 || VT == MVT::v4i64) && "Unexpected SRA type");
22017 MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
22018 SDValue Ex = DAG.getBitcast(ExVT, R);
22020 // ashr(R, 63) === cmp_slt(R, 0)
22021 if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
22022 assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
22023 "Unsupported PCMPGT op");
22024 return DAG.getNode(X86ISD::PCMPGT, dl, VT,
22025 getZeroVector(VT, Subtarget, DAG, dl), R);
22028 if (ShiftAmt >= 32) {
22029 // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
22030 SDValue Upper =
22031 getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex, 31, DAG);
22032 SDValue Lower = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22033 ShiftAmt - 32, DAG);
22034 if (VT == MVT::v2i64)
22035 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {5, 1, 7, 3});
22036 if (VT == MVT::v4i64)
22037 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22038 {9, 1, 11, 3, 13, 5, 15, 7});
22040 // SRA upper i32, SHL whole i64 and select lower i32.
22041 SDValue Upper = getTargetVShiftByConstNode(X86ISD::VSRAI, dl, ExVT, Ex,
22042 ShiftAmt, DAG);
22043 SDValue Lower =
22044 getTargetVShiftByConstNode(X86ISD::VSRLI, dl, VT, R, ShiftAmt, DAG);
22045 Lower = DAG.getBitcast(ExVT, Lower);
22046 if (VT == MVT::v2i64)
22047 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower, {4, 1, 6, 3});
22048 if (VT == MVT::v4i64)
22049 Ex = DAG.getVectorShuffle(ExVT, dl, Upper, Lower,
22050 {8, 1, 10, 3, 12, 5, 14, 7});
22052 return DAG.getBitcast(VT, Ex);
22055 // Optimize shl/srl/sra with constant shift amount.
22056 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22057 if (auto *ShiftConst = BVAmt->getConstantSplatNode()) {
22058 uint64_t ShiftAmt = ShiftConst->getZExtValue();
22060 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22061 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22063 // i64 SRA needs to be performed as partial shifts.
22064 if (((!Subtarget.hasXOP() && VT == MVT::v2i64) ||
22065 (Subtarget.hasInt256() && VT == MVT::v4i64)) &&
22066 Op.getOpcode() == ISD::SRA)
22067 return ArithmeticShiftRight64(ShiftAmt);
22069 if (VT == MVT::v16i8 ||
22070 (Subtarget.hasInt256() && VT == MVT::v32i8) ||
22071 VT == MVT::v64i8) {
22072 unsigned NumElts = VT.getVectorNumElements();
22073 MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
22075 // Simple i8 add case
22076 if (Op.getOpcode() == ISD::SHL && ShiftAmt == 1)
22077 return DAG.getNode(ISD::ADD, dl, VT, R, R);
22079 // ashr(R, 7) === cmp_slt(R, 0)
22080 if (Op.getOpcode() == ISD::SRA && ShiftAmt == 7) {
22081 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, dl);
22082 if (VT.is512BitVector()) {
22083 assert(VT == MVT::v64i8 && "Unexpected element type!");
22084 SDValue CMP = DAG.getNode(X86ISD::PCMPGTM, dl, MVT::v64i1, Zeros, R);
22085 return DAG.getNode(ISD::SIGN_EXTEND, dl, VT, CMP);
22087 return DAG.getNode(X86ISD::PCMPGT, dl, VT, Zeros, R);
22090 // XOP can shift v16i8 directly instead of as shift v8i16 + mask.
22091 if (VT == MVT::v16i8 && Subtarget.hasXOP())
22094 if (Op.getOpcode() == ISD::SHL) {
22095 // Make a large shift.
22096 SDValue SHL = getTargetVShiftByConstNode(X86ISD::VSHLI, dl, ShiftVT,
22098 SHL = DAG.getBitcast(VT, SHL);
22099 // Zero out the rightmost bits.
22100 return DAG.getNode(ISD::AND, dl, VT, SHL,
22101 DAG.getConstant(uint8_t(-1U << ShiftAmt), dl, VT));
22103 if (Op.getOpcode() == ISD::SRL) {
22104 // Make a large shift.
22105 SDValue SRL = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ShiftVT,
22107 SRL = DAG.getBitcast(VT, SRL);
22108 // Zero out the leftmost bits.
22109 return DAG.getNode(ISD::AND, dl, VT, SRL,
22110 DAG.getConstant(uint8_t(-1U) >> ShiftAmt, dl, VT));
22112 if (Op.getOpcode() == ISD::SRA) {
22113 // ashr(R, Amt) === sub(xor(lshr(R, Amt), Mask), Mask)
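// Illustrative i8 case with ShiftAmt = 2: Mask = 128 >> 2 = 0x20. For
// R = 0xE0 (-32), lshr gives 0x38, xor with the mask gives 0x18, and
// subtracting the mask gives 0xF8 (-8), matching ashr(-32, 2).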
22114 SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22116 SDValue Mask = DAG.getConstant(128 >> ShiftAmt, dl, VT);
22117 Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
22118 Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
22121 llvm_unreachable("Unknown shift opcode.");
22126 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22127 // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
22128 if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
22129 (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
22130 (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
22132 // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
22133 unsigned SubVectorScale = 1;
22134 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
22135 SubVectorScale =
22136 Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
22137 Amt = Amt.getOperand(0);
22140 // Peek through any splat that was introduced for i64 shift vectorization.
22141 int SplatIndex = -1;
22142 if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
22143 if (SVN->isSplat()) {
22144 SplatIndex = SVN->getSplatIndex();
22145 Amt = Amt.getOperand(0);
22146 assert(SplatIndex < (int)VT.getVectorNumElements() &&
22147 "Splat shuffle referencing second operand");
22150 if (Amt.getOpcode() != ISD::BITCAST ||
22151 Amt.getOperand(0).getOpcode() != ISD::BUILD_VECTOR)
22154 Amt = Amt.getOperand(0);
22155 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22156 (SubVectorScale * VT.getVectorNumElements());
22157 unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
22158 uint64_t ShiftAmt = 0;
22159 unsigned BaseOp = (SplatIndex < 0 ? 0 : SplatIndex * Ratio);
22160 for (unsigned i = 0; i != Ratio; ++i) {
22161 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + BaseOp));
22165 ShiftAmt |= C->getZExtValue() << (i * (1 << (6 - RatioInLog2)));
22168 // Check remaining shift amounts (if not a splat).
22169 if (SplatIndex < 0) {
22170 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22171 uint64_t ShAmt = 0;
22172 for (unsigned j = 0; j != Ratio; ++j) {
22173 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Amt.getOperand(i + j));
22177 ShAmt |= C->getZExtValue() << (j * (1 << (6 - RatioInLog2)));
22179 if (ShAmt != ShiftAmt)
22184 if (SupportedVectorShiftWithImm(VT, Subtarget, Op.getOpcode()))
22185 return getTargetVShiftByConstNode(X86Opc, dl, VT, R, ShiftAmt, DAG);
22187 if (Op.getOpcode() == ISD::SRA)
22188 return ArithmeticShiftRight64(ShiftAmt);
22194 static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
22195 const X86Subtarget &Subtarget) {
22196 MVT VT = Op.getSimpleValueType();
22197 SDLoc dl(Op);
22198 SDValue R = Op.getOperand(0);
22199 SDValue Amt = Op.getOperand(1);
22201 unsigned X86OpcI = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHLI :
22202 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRLI : X86ISD::VSRAI;
22204 unsigned X86OpcV = (Op.getOpcode() == ISD::SHL) ? X86ISD::VSHL :
22205 (Op.getOpcode() == ISD::SRL) ? X86ISD::VSRL : X86ISD::VSRA;
22207 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode())) {
22209 MVT EltVT = VT.getVectorElementType();
22211 if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Amt)) {
22212 // Check if this build_vector node is doing a splat.
22213 // If so, then set BaseShAmt equal to the splat value.
22214 BaseShAmt = BV->getSplatValue();
22215 if (BaseShAmt && BaseShAmt.isUndef())
22216 BaseShAmt = SDValue();
22218 if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR)
22219 Amt = Amt.getOperand(0);
22221 ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt);
22222 if (SVN && SVN->isSplat()) {
22223 unsigned SplatIdx = (unsigned)SVN->getSplatIndex();
22224 SDValue InVec = Amt.getOperand(0);
22225 if (InVec.getOpcode() == ISD::BUILD_VECTOR) {
22226 assert((SplatIdx < InVec.getSimpleValueType().getVectorNumElements()) &&
22227 "Unexpected shuffle index found!");
22228 BaseShAmt = InVec.getOperand(SplatIdx);
22229 } else if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT) {
22230 if (ConstantSDNode *C =
22231 dyn_cast<ConstantSDNode>(InVec.getOperand(2))) {
22232 if (C->getZExtValue() == SplatIdx)
22233 BaseShAmt = InVec.getOperand(1);
22238 // Avoid introducing an extract element from a shuffle.
22239 BaseShAmt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InVec,
22240 DAG.getIntPtrConstant(SplatIdx, dl));
22244 if (BaseShAmt.getNode()) {
22245 assert(EltVT.bitsLE(MVT::i64) && "Unexpected element type!");
22246 if (EltVT != MVT::i64 && EltVT.bitsGT(MVT::i32))
22247 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, BaseShAmt);
22248 else if (EltVT.bitsLT(MVT::i32))
22249 BaseShAmt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, BaseShAmt);
22251 return getTargetVShiftNode(X86OpcI, dl, VT, R, BaseShAmt, Subtarget, DAG);
22255 // Special case in 32-bit mode, where i64 is expanded into high and low parts.
22256 if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
22257 Amt.getOpcode() == ISD::BITCAST &&
22258 Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
22259 Amt = Amt.getOperand(0);
22260 unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
22261 VT.getVectorNumElements();
22262 std::vector<SDValue> Vals(Ratio);
22263 for (unsigned i = 0; i != Ratio; ++i)
22264 Vals[i] = Amt.getOperand(i);
22265 for (unsigned i = Ratio; i != Amt.getNumOperands(); i += Ratio) {
22266 for (unsigned j = 0; j != Ratio; ++j)
22267 if (Vals[j] != Amt.getOperand(i + j))
22271 if (SupportedVectorShiftWithBaseAmnt(VT, Subtarget, Op.getOpcode()))
22272 return DAG.getNode(X86OpcV, dl, VT, R, Op.getOperand(1));
22277 static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
22278 SelectionDAG &DAG) {
22279 MVT VT = Op.getSimpleValueType();
22280 SDLoc dl(Op);
22281 SDValue R = Op.getOperand(0);
22282 SDValue Amt = Op.getOperand(1);
22283 bool ConstantAmt = ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22285 assert(VT.isVector() && "Custom lowering only for vector shifts!");
22286 assert(Subtarget.hasSSE2() && "Only custom lower when we have SSE2!");
22288 if (SDValue V = LowerScalarImmediateShift(Op, DAG, Subtarget))
22291 if (SDValue V = LowerScalarVariableShift(Op, DAG, Subtarget))
22294 if (SupportedVectorVarShift(VT, Subtarget, Op.getOpcode()))
22297 // XOP has 128-bit variable logical/arithmetic shifts.
22298 // +ve/-ve Amt = shift left/right.
22299 if (Subtarget.hasXOP() &&
22300 (VT == MVT::v2i64 || VT == MVT::v4i32 ||
22301 VT == MVT::v8i16 || VT == MVT::v16i8)) {
22302 if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SRA) {
22303 SDValue Zero = getZeroVector(VT, Subtarget, DAG, dl);
22304 Amt = DAG.getNode(ISD::SUB, dl, VT, Zero, Amt);
22306 if (Op.getOpcode() == ISD::SHL || Op.getOpcode() == ISD::SRL)
22307 return DAG.getNode(X86ISD::VPSHL, dl, VT, R, Amt);
22308 if (Op.getOpcode() == ISD::SRA)
22309 return DAG.getNode(X86ISD::VPSHA, dl, VT, R, Amt);
22312 // 2i64 vector logical shifts can efficiently avoid scalarization - do the
22313 // shifts per-lane and then shuffle the partial results back together.
22314 if (VT == MVT::v2i64 && Op.getOpcode() != ISD::SRA) {
22315 // Splat the shift amounts so the scalar shifts above will catch it.
22316 SDValue Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {0, 0});
22317 SDValue Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Amt, {1, 1});
22318 SDValue R0 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt0);
22319 SDValue R1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Amt1);
22320 return DAG.getVectorShuffle(VT, dl, R0, R1, {0, 3});
22323 // i64 vector arithmetic shift can be emulated with the transform:
22324 // M = lshr(SIGN_MASK, Amt)
22325 // ashr(R, Amt) === sub(xor(lshr(R, Amt), M), M)
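// (Why this works: the logical shift brings the original sign bit down to the
// position marked by M; the XOR clears it and the SUB re-propagates it through
// all higher bits, which is exactly the arithmetic-shift result.)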
22326 if ((VT == MVT::v2i64 || (VT == MVT::v4i64 && Subtarget.hasInt256())) &&
22327 Op.getOpcode() == ISD::SRA) {
22328 SDValue S = DAG.getConstant(APInt::getSignMask(64), dl, VT);
22329 SDValue M = DAG.getNode(ISD::SRL, dl, VT, S, Amt);
22330 R = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
22331 R = DAG.getNode(ISD::XOR, dl, VT, R, M);
22332 R = DAG.getNode(ISD::SUB, dl, VT, R, M);
22336 // If possible, lower this packed shift into a vector multiply instead of
22337 // expanding it into a sequence of scalar shifts.
22338 // Do this only if the vector shift count is a constant build_vector.
22339 if (ConstantAmt && Op.getOpcode() == ISD::SHL &&
22340 (VT == MVT::v8i16 || VT == MVT::v4i32 ||
22341 (Subtarget.hasInt256() && VT == MVT::v16i16))) {
22342 SmallVector<SDValue, 8> Elts;
22343 MVT SVT = VT.getVectorElementType();
22344 unsigned SVTBits = SVT.getSizeInBits();
22345 APInt One(SVTBits, 1);
22346 unsigned NumElems = VT.getVectorNumElements();
22348 for (unsigned i=0; i !=NumElems; ++i) {
22349 SDValue Op = Amt->getOperand(i);
22350 if (Op->isUndef()) {
22351 Elts.push_back(Op);
22355 ConstantSDNode *ND = cast<ConstantSDNode>(Op);
22356 APInt C(SVTBits, ND->getAPIntValue().getZExtValue());
22357 uint64_t ShAmt = C.getZExtValue();
22358 if (ShAmt >= SVTBits) {
22359 Elts.push_back(DAG.getUNDEF(SVT));
22362 Elts.push_back(DAG.getConstant(One.shl(ShAmt), dl, SVT));
22364 SDValue BV = DAG.getBuildVector(VT, dl, Elts);
22365 return DAG.getNode(ISD::MUL, dl, VT, R, BV);
22368 // Lower SHL with variable shift amount.
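// (The sequence below builds the per-lane power-of-two multiplier in floating
// point: the shift amount lands in the exponent field, adding 0x3f800000
// (the bit pattern of 1.0f) biases it, so each lane bitcasts to 2^Amt, which
// FP_TO_SINT converts back for the final multiply.)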
22369 if (VT == MVT::v4i32 && Op->getOpcode() == ISD::SHL) {
22370 Op = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(23, dl, VT));
22372 Op = DAG.getNode(ISD::ADD, dl, VT, Op,
22373 DAG.getConstant(0x3f800000U, dl, VT));
22374 Op = DAG.getBitcast(MVT::v4f32, Op);
22375 Op = DAG.getNode(ISD::FP_TO_SINT, dl, VT, Op);
22376 return DAG.getNode(ISD::MUL, dl, VT, Op, R);
22377 }
22379 // If possible, lower this shift as a sequence of two shifts by
22380 // constant plus a MOVSS/MOVSD/PBLEND instead of scalarizing it.
22382 // (v4i32 (srl A, (build_vector < X, Y, Y, Y>)))
22384 // Could be rewritten as:
22385 // (v4i32 (MOVSS (srl A, <Y,Y,Y,Y>), (srl A, <X,X,X,X>)))
22387 // The advantage is that the two shifts from the example would be
22388 // lowered as X86ISD::VSRLI nodes. This would be cheaper than scalarizing
22389 // the vector shift into four scalar shifts plus four pairs of vector
22391 if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
22392 unsigned TargetOpcode = X86ISD::MOVSS;
22393 bool CanBeSimplified;
22394 // The splat value for the first packed shift (the 'X' from the example).
22395 SDValue Amt1 = Amt->getOperand(0);
22396 // The splat value for the second packed shift (the 'Y' from the example).
22397 SDValue Amt2 = (VT == MVT::v4i32) ? Amt->getOperand(1) : Amt->getOperand(2);
22399 // See if it is possible to replace this node with a sequence of
22400 // two shifts followed by a MOVSS/MOVSD/PBLEND.
22401 if (VT == MVT::v4i32) {
22402 // Check if it is legal to use a MOVSS.
22403 CanBeSimplified = Amt2 == Amt->getOperand(2) &&
22404 Amt2 == Amt->getOperand(3);
22405 if (!CanBeSimplified) {
22406 // Otherwise, check if we can still simplify this node using a MOVSD.
22407 CanBeSimplified = Amt1 == Amt->getOperand(1) &&
22408 Amt->getOperand(2) == Amt->getOperand(3);
22409 TargetOpcode = X86ISD::MOVSD;
22410 Amt2 = Amt->getOperand(2);
22413 // Do similar checks for the case where the machine value type
22415 CanBeSimplified = Amt1 == Amt->getOperand(1);
22416 for (unsigned i=3; i != 8 && CanBeSimplified; ++i)
22417 CanBeSimplified = Amt2 == Amt->getOperand(i);
22419 if (!CanBeSimplified) {
22420 TargetOpcode = X86ISD::MOVSD;
22421 CanBeSimplified = true;
22422 Amt2 = Amt->getOperand(4);
22423 for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
22424 CanBeSimplified = Amt1 == Amt->getOperand(i);
22425 for (unsigned j=4; j != 8 && CanBeSimplified; ++j)
22426 CanBeSimplified = Amt2 == Amt->getOperand(j);
22430 if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
22431 isa<ConstantSDNode>(Amt2)) {
22432 // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
22433 MVT CastVT = MVT::v4i32;
22435 DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
22436 SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
22438 DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
22439 SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
22440 SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
22441 SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
22442 if (TargetOpcode == X86ISD::MOVSD)
22443 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22444 BitCast2, {0, 1, 6, 7}));
22445 return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
22446 BitCast2, {0, 5, 6, 7}));
22450 // v4i32 Non Uniform Shifts.
22451 // If the shift amount is constant we can shift each lane using the SSE2
22452 // immediate shifts, else we need to zero-extend each lane to the lower i64
22453 // and shift using the SSE2 variable shifts.
22454 // The separate results can then be blended together.
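// (Sketch: with amounts <4,5,6,7>, four copies of R are shifted by 4, 5, 6
// and 7 respectively, and the shuffles below pick lane 0 from the first
// result, lane 1 from the second, and so on.)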
22455 if (VT == MVT::v4i32) {
22456 unsigned Opc = Op.getOpcode();
22457 SDValue Amt0, Amt1, Amt2, Amt3;
22459 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {0, 0, 0, 0});
22460 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {1, 1, 1, 1});
22461 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {2, 2, 2, 2});
22462 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, DAG.getUNDEF(VT), {3, 3, 3, 3});
22464 // ISD::SHL is handled above but we include it here for completeness.
22467 llvm_unreachable("Unknown target vector shift node");
22469 Opc = X86ISD::VSHL;
22472 Opc = X86ISD::VSRL;
22475 Opc = X86ISD::VSRA;
22478 // The SSE2 shifts use the lower i64 as the same shift amount for
22479 // all lanes and the upper i64 is ignored. These shuffle masks
22480 // optimally zero-extend each lane on SSE2/SSE41/AVX targets.
22481 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22482 Amt0 = DAG.getVectorShuffle(VT, dl, Amt, Z, {0, 4, -1, -1});
22483 Amt1 = DAG.getVectorShuffle(VT, dl, Amt, Z, {1, 5, -1, -1});
22484 Amt2 = DAG.getVectorShuffle(VT, dl, Amt, Z, {2, 6, -1, -1});
22485 Amt3 = DAG.getVectorShuffle(VT, dl, Amt, Z, {3, 7, -1, -1});
22488 SDValue R0 = DAG.getNode(Opc, dl, VT, R, Amt0);
22489 SDValue R1 = DAG.getNode(Opc, dl, VT, R, Amt1);
22490 SDValue R2 = DAG.getNode(Opc, dl, VT, R, Amt2);
22491 SDValue R3 = DAG.getNode(Opc, dl, VT, R, Amt3);
22492 SDValue R02 = DAG.getVectorShuffle(VT, dl, R0, R2, {0, -1, 6, -1});
22493 SDValue R13 = DAG.getVectorShuffle(VT, dl, R1, R3, {-1, 1, -1, 7});
22494 return DAG.getVectorShuffle(VT, dl, R02, R13, {0, 5, 2, 7});
22497 // It's worth extending once and using the vXi16/vXi32 shifts for smaller
22498 // types, but without AVX512 the extra overheads to get from vXi8 to vXi32
22499 // make the existing SSE solution better.
22500 if ((Subtarget.hasInt256() && VT == MVT::v8i16) ||
22501 (Subtarget.hasAVX512() && VT == MVT::v16i16) ||
22502 (Subtarget.hasAVX512() && VT == MVT::v16i8) ||
22503 (Subtarget.hasBWI() && VT == MVT::v32i8)) {
22504 MVT EvtSVT = (VT == MVT::v32i8 ? MVT::i16 : MVT::i32);
22505 MVT ExtVT = MVT::getVectorVT(EvtSVT, VT.getVectorNumElements());
22507 Op.getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
22508 R = DAG.getNode(ExtOpc, dl, ExtVT, R);
22509 Amt = DAG.getNode(ISD::ANY_EXTEND, dl, ExtVT, Amt);
22510 return DAG.getNode(ISD::TRUNCATE, dl, VT,
22511 DAG.getNode(Op.getOpcode(), dl, ExtVT, R, Amt));
22514 if (VT == MVT::v16i8 ||
22515 (VT == MVT::v32i8 && Subtarget.hasInt256() && !Subtarget.hasXOP()) ||
22516 (VT == MVT::v64i8 && Subtarget.hasBWI())) {
22517 MVT ExtVT = MVT::getVectorVT(MVT::i16, VT.getVectorNumElements() / 2);
22518 unsigned ShiftOpcode = Op->getOpcode();
22520 auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
22521 if (VT.is512BitVector()) {
22522 // On AVX512BW targets we make use of the fact that VSELECT lowers
22523 // to a masked blend which selects bytes based just on the sign bit
22524 // extracted to a mask.
22525 MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
22526 V0 = DAG.getBitcast(VT, V0);
22527 V1 = DAG.getBitcast(VT, V1);
22528 Sel = DAG.getBitcast(VT, Sel);
22529 Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel);
22530 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22531 } else if (Subtarget.hasSSE41()) {
22532 // On SSE41 targets we make use of the fact that VSELECT lowers
22533 // to PBLENDVB which selects bytes based just on the sign bit.
22534 V0 = DAG.getBitcast(VT, V0);
22535 V1 = DAG.getBitcast(VT, V1);
22536 Sel = DAG.getBitcast(VT, Sel);
22537 return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
22539 // On pre-SSE41 targets we test for the sign bit by comparing to
22540 // zero - a negative value will set all bits of the lanes to true
22541 // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering.
22542 SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl);
22543 SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel);
22544 return DAG.getSelect(dl, SelVT, C, V0, V1);
22547 // Turn 'a' into a mask suitable for VSELECT: a = a << 5;
22548 // We can safely do this using i16 shifts as we're only interested in
22549 // the 3 lower bits of each byte.
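// Concretely, an i8 shift amount only uses bits [2:0]: after the shift left
// by 5, bit 2 sits in each byte's sign bit, so the first blend applies a
// shift by 4 exactly where that bit was set; doubling 'a' then exposes bit 1
// for the shift-by-2 step and bit 0 for the final shift-by-1 step.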
22550 Amt = DAG.getBitcast(ExtVT, Amt);
22551 Amt = DAG.getNode(ISD::SHL, dl, ExtVT, Amt, DAG.getConstant(5, dl, ExtVT));
22552 Amt = DAG.getBitcast(VT, Amt);
22554 if (Op->getOpcode() == ISD::SHL || Op->getOpcode() == ISD::SRL) {
22555 // r = VSELECT(r, shift(r, 4), a);
22557 DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22558 R = SignBitSelect(VT, Amt, M, R);
22561 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22563 // r = VSELECT(r, shift(r, 2), a);
22564 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22565 R = SignBitSelect(VT, Amt, M, R);
22568 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22570 // return VSELECT(r, shift(r, 1), a);
22571 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22572 R = SignBitSelect(VT, Amt, M, R);
22576 if (Op->getOpcode() == ISD::SRA) {
22577 // For SRA we need to unpack each byte to the higher byte of an i16 vector
22578 // so we can correctly sign extend. We don't care what happens to the
22579 // lower byte.
22580 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), Amt);
22581 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), Amt);
22582 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, DAG.getUNDEF(VT), R);
22583 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, DAG.getUNDEF(VT), R);
22584 ALo = DAG.getBitcast(ExtVT, ALo);
22585 AHi = DAG.getBitcast(ExtVT, AHi);
22586 RLo = DAG.getBitcast(ExtVT, RLo);
22587 RHi = DAG.getBitcast(ExtVT, RHi);
22589 // r = VSELECT(r, shift(r, 4), a);
22590 SDValue MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22591 DAG.getConstant(4, dl, ExtVT));
22592 SDValue MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22593 DAG.getConstant(4, dl, ExtVT));
22594 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22595 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22598 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22599 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22601 // r = VSELECT(r, shift(r, 2), a);
22602 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22603 DAG.getConstant(2, dl, ExtVT));
22604 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22605 DAG.getConstant(2, dl, ExtVT));
22606 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22607 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22610 ALo = DAG.getNode(ISD::ADD, dl, ExtVT, ALo, ALo);
22611 AHi = DAG.getNode(ISD::ADD, dl, ExtVT, AHi, AHi);
22613 // r = VSELECT(r, shift(r, 1), a);
22614 MLo = DAG.getNode(ShiftOpcode, dl, ExtVT, RLo,
22615 DAG.getConstant(1, dl, ExtVT));
22616 MHi = DAG.getNode(ShiftOpcode, dl, ExtVT, RHi,
22617 DAG.getConstant(1, dl, ExtVT));
22618 RLo = SignBitSelect(ExtVT, ALo, MLo, RLo);
22619 RHi = SignBitSelect(ExtVT, AHi, MHi, RHi);
22621 // Logical shift the result back to the lower byte, leaving a zero upper
22622 // byte, meaning that we can safely pack with PACKUSWB.
22625 DAG.getNode(ISD::SRL, dl, ExtVT, RLo, DAG.getConstant(8, dl, ExtVT));
22627 DAG.getNode(ISD::SRL, dl, ExtVT, RHi, DAG.getConstant(8, dl, ExtVT));
22628 return DAG.getNode(X86ISD::PACKUS, dl, VT, RLo, RHi);
22632 if (Subtarget.hasInt256() && !Subtarget.hasXOP() && VT == MVT::v16i16) {
22633 MVT ExtVT = MVT::v8i32;
22634 SDValue Z = getZeroVector(VT, Subtarget, DAG, dl);
22635 SDValue ALo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Amt, Z);
22636 SDValue AHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Amt, Z);
22637 SDValue RLo = DAG.getNode(X86ISD::UNPCKL, dl, VT, Z, R);
22638 SDValue RHi = DAG.getNode(X86ISD::UNPCKH, dl, VT, Z, R);
22639 ALo = DAG.getBitcast(ExtVT, ALo);
22640 AHi = DAG.getBitcast(ExtVT, AHi);
22641 RLo = DAG.getBitcast(ExtVT, RLo);
22642 RHi = DAG.getBitcast(ExtVT, RHi);
22643 SDValue Lo = DAG.getNode(Op.getOpcode(), dl, ExtVT, RLo, ALo);
22644 SDValue Hi = DAG.getNode(Op.getOpcode(), dl, ExtVT, RHi, AHi);
22645 Lo = DAG.getNode(ISD::SRL, dl, ExtVT, Lo, DAG.getConstant(16, dl, ExtVT));
22646 Hi = DAG.getNode(ISD::SRL, dl, ExtVT, Hi, DAG.getConstant(16, dl, ExtVT));
22647 return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
22650 if (VT == MVT::v8i16) {
22651 unsigned ShiftOpcode = Op->getOpcode();
22653 // If we have a constant shift amount, the non-SSE41 path is best as
22654 // avoiding bitcasts make it easier to constant fold and reduce to PBLENDW.
22655 bool UseSSE41 = Subtarget.hasSSE41() &&
22656 !ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
22658 auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
22659 // On SSE41 targets we make use of the fact that VSELECT lowers
22660 // to PBLENDVB which selects bytes based just on the sign bit.
22662 MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
22663 V0 = DAG.getBitcast(ExtVT, V0);
22664 V1 = DAG.getBitcast(ExtVT, V1);
22665 Sel = DAG.getBitcast(ExtVT, Sel);
22666 return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
22668 // On pre-SSE41 targets we splat the sign bit - a negative value will
22669 // set all bits of the lanes to true and VSELECT uses that in
22670 // its OR(AND(V0,C),AND(V1,~C)) lowering.
22672 DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT));
22673 return DAG.getSelect(dl, VT, C, V0, V1);
22676 // Turn 'a' into a mask suitable for VSELECT: a = a << 12;
22678 // On SSE41 targets we need to replicate the shift mask in both
22679 // bytes for PBLENDVB.
22682 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(4, dl, VT)),
22683 DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT)));
22685 Amt = DAG.getNode(ISD::SHL, dl, VT, Amt, DAG.getConstant(12, dl, VT));
22688 // r = VSELECT(r, shift(r, 8), a);
22689 SDValue M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(8, dl, VT));
22690 R = SignBitSelect(Amt, M, R);
22693 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22695 // r = VSELECT(r, shift(r, 4), a);
22696 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(4, dl, VT));
22697 R = SignBitSelect(Amt, M, R);
22700 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22702 // r = VSELECT(r, shift(r, 2), a);
22703 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(2, dl, VT));
22704 R = SignBitSelect(Amt, M, R);
22707 Amt = DAG.getNode(ISD::ADD, dl, VT, Amt, Amt);
22709 // return VSELECT(r, shift(r, 1), a);
22710 M = DAG.getNode(ShiftOpcode, dl, VT, R, DAG.getConstant(1, dl, VT));
22711 R = SignBitSelect(Amt, M, R);
22715 // Decompose 256-bit shifts into smaller 128-bit shifts.
22716 if (VT.is256BitVector())
22717 return Lower256IntArith(Op, DAG);
22722 static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
22723 SelectionDAG &DAG) {
22724 MVT VT = Op.getSimpleValueType();
22725 SDLoc DL(Op);
22726 SDValue R = Op.getOperand(0);
22727 SDValue Amt = Op.getOperand(1);
22728 unsigned Opcode = Op.getOpcode();
22729 unsigned EltSizeInBits = VT.getScalarSizeInBits();
22731 if (Subtarget.hasAVX512()) {
22732 // Attempt to rotate by immediate.
22733 APInt UndefElts;
22734 SmallVector<APInt, 16> EltBits;
22735 if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) {
22736 if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) {
22737 return EltBits[0] == V;
22738 })) {
22739 unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
22740 uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits);
22741 return DAG.getNode(Op, DL, VT, R,
22742 DAG.getConstant(RotateAmt, DL, MVT::i8));
22746 // Else, fall-back on VPROLV/VPRORV.
22750 assert(VT.isVector() && "Custom lowering only for vector rotates!");
22751 assert(Subtarget.hasXOP() && "XOP support required for vector rotates!");
22752 assert((Opcode == ISD::ROTL) && "Only ROTL supported");
22754 // XOP has 128-bit vector variable + immediate rotates.
22755 // +ve/-ve Amt = rotate left/right.
22757 // Split 256-bit integers.
22758 if (VT.is256BitVector())
22759 return Lower256IntArith(Op, DAG);
22761 assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
22763 // Attempt to rotate by immediate.
22764 if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) {
22765 if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
22766 uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
22767 assert(RotateAmt < EltSizeInBits && "Rotation out of range");
22768 return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
22769 DAG.getConstant(RotateAmt, DL, MVT::i8));
22773 // Use general rotate by variable (per-element).
22774 return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
22777 static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
22778 // Lower the "add/sub/mul with overflow" instruction into a regular ins plus
22779 // a "setcc" instruction that checks the overflow flag. The "brcond" lowering
22780 // looks for this combo and may remove the "setcc" instruction if the "setcc"
22781 // has only one use.
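// For example, "uaddo x, y" becomes an X86ISD::ADD that also produces EFLAGS,
// with the overflow result materialized as a SETCC on X86::COND_B (the carry
// flag set by the add).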
22782 SDNode *N = Op.getNode();
22783 SDValue LHS = N->getOperand(0);
22784 SDValue RHS = N->getOperand(1);
22785 unsigned BaseOp = 0;
22786 X86::CondCode Cond;
22787 SDLoc DL(Op);
22788 switch (Op.getOpcode()) {
22789 default: llvm_unreachable("Unknown ovf instruction!");
22791 // A subtract of one will be selected as a INC. Note that INC doesn't
22792 // set CF, so we can't do this for UADDO.
22793 if (isOneConstant(RHS)) {
22794 BaseOp = X86ISD::INC;
22795 Cond = X86::COND_O;
22798 BaseOp = X86ISD::ADD;
22799 Cond = X86::COND_O;
22802 BaseOp = X86ISD::ADD;
22803 Cond = X86::COND_B;
22806 // A subtract of one will be selected as a DEC. Note that DEC doesn't
22807 // set CF, so we can't do this for USUBO.
22808 if (isOneConstant(RHS)) {
22809 BaseOp = X86ISD::DEC;
22810 Cond = X86::COND_O;
22813 BaseOp = X86ISD::SUB;
22814 Cond = X86::COND_O;
22817 BaseOp = X86ISD::SUB;
22818 Cond = X86::COND_B;
22821 BaseOp = N->getValueType(0) == MVT::i8 ? X86ISD::SMUL8 : X86ISD::SMUL;
22822 Cond = X86::COND_O;
22824 case ISD::UMULO: { // i64, i8 = umulo lhs, rhs --> i64, i64, i32 umul lhs,rhs
22825 if (N->getValueType(0) == MVT::i8) {
22826 BaseOp = X86ISD::UMUL8;
22827 Cond = X86::COND_O;
22830 SDVTList VTs = DAG.getVTList(N->getValueType(0), N->getValueType(0),
22832 SDValue Sum = DAG.getNode(X86ISD::UMUL, DL, VTs, LHS, RHS);
22834 SDValue SetCC = getSETCC(X86::COND_O, SDValue(Sum.getNode(), 2), DL, DAG);
22836 if (N->getValueType(1) == MVT::i1)
22837 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22839 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22843 // Also sets EFLAGS.
22844 SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
22845 SDValue Sum = DAG.getNode(BaseOp, DL, VTs, LHS, RHS);
22847 SDValue SetCC = getSETCC(Cond, SDValue(Sum.getNode(), 1), DL, DAG);
22849 if (N->getValueType(1) == MVT::i1)
22850 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
22852 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
22855 /// Returns true if the operand type is exactly twice the native width, and
22856 /// the corresponding cmpxchg8b or cmpxchg16b instruction is available.
22857 /// Used to know whether to use cmpxchg8/16b when expanding atomic operations
22858 /// (otherwise we leave them alone to become __sync_fetch_and_... calls).
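/// For example, i64 atomics on 32-bit x86 are twice the native width and are
/// expanded with cmpxchg8b, while i128 atomics on x86-64 use cmpxchg16b when
/// the subtarget provides it.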
22859 bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
22860 unsigned OpWidth = MemType->getPrimitiveSizeInBits();
22863 return !Subtarget.is64Bit(); // FIXME this should be Subtarget.hasCmpxchg8b
22864 else if (OpWidth == 128)
22865 return Subtarget.hasCmpxchg16b();
22870 bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
22871 return needsCmpXchgNb(SI->getValueOperand()->getType());
22874 // Note: this turns large loads into lock cmpxchg8b/16b.
22875 // FIXME: On 32 bits x86, fild/movq might be faster than lock cmpxchg8b.
22876 TargetLowering::AtomicExpansionKind
22877 X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
22878 auto PTy = cast<PointerType>(LI->getPointerOperandType());
22879 return needsCmpXchgNb(PTy->getElementType()) ? AtomicExpansionKind::CmpXChg
22880 : AtomicExpansionKind::None;
22883 TargetLowering::AtomicExpansionKind
22884 X86TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
22885 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22886 Type *MemType = AI->getType();
22888 // If the operand is too big, we must see if cmpxchg8/16b is available
22889 // and default to library calls otherwise.
22890 if (MemType->getPrimitiveSizeInBits() > NativeWidth) {
22891 return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
22892 : AtomicExpansionKind::None;
22895 AtomicRMWInst::BinOp Op = AI->getOperation();
22898 llvm_unreachable("Unknown atomic operation");
22899 case AtomicRMWInst::Xchg:
22900 case AtomicRMWInst::Add:
22901 case AtomicRMWInst::Sub:
22902 // It's better to use xadd, xsub or xchg for these in all cases.
22903 return AtomicExpansionKind::None;
22904 case AtomicRMWInst::Or:
22905 case AtomicRMWInst::And:
22906 case AtomicRMWInst::Xor:
22907 // If the atomicrmw's result isn't actually used, we can just add a "lock"
22908 // prefix to a normal instruction for these operations.
22909 return !AI->use_empty() ? AtomicExpansionKind::CmpXChg
22910 : AtomicExpansionKind::None;
22911 case AtomicRMWInst::Nand:
22912 case AtomicRMWInst::Max:
22913 case AtomicRMWInst::Min:
22914 case AtomicRMWInst::UMax:
22915 case AtomicRMWInst::UMin:
22916 // These always require a non-trivial set of data operations on x86. We must
22917 // use a cmpxchg loop.
22918 return AtomicExpansionKind::CmpXChg;
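// (Illustrative sketch, not lifted from AtomicExpand itself: for these
//  operations the expansion is a compare-exchange loop of roughly the form
//    loop:
//      old          = load from ptr
//      new          = op(old, val)        ; e.g. nand: ~(old & val)
//      old, success = cmpxchg ptr, old, new
//      br success ? done : loop
//  which is why no single LOCK-prefixed instruction suffices here.)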
22923 X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
22924 unsigned NativeWidth = Subtarget.is64Bit() ? 64 : 32;
22925 Type *MemType = AI->getType();
22926 // Accesses larger than the native width are turned into cmpxchg/libcalls, so
22927 // there is no benefit in turning such RMWs into loads, and it is actually
22928 // harmful as it introduces an mfence.
22929 if (MemType->getPrimitiveSizeInBits() > NativeWidth)
22932 auto Builder = IRBuilder<>(AI);
22933 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
22934 auto SSID = AI->getSyncScopeID();
22935 // We must restrict the ordering to avoid generating loads with Release or
22936 // ReleaseAcquire orderings.
22937 auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering());
22938 auto Ptr = AI->getPointerOperand();
22940 // Before the load we need a fence. Here is an example lifted from
22941 // http://www.hpl.hp.com/techreports/2012/HPL-2012-68.pdf showing why a fence
22944 // x.store(1, relaxed);
22945 // r1 = y.fetch_add(0, release);
22947 // y.fetch_add(42, acquire);
22948 // r2 = x.load(relaxed);
22949 // r1 = r2 = 0 is impossible, but becomes possible if the idempotent rmw is
22950 // lowered to just a load without a fence. An mfence flushes the store buffer,
22951 // making the optimization clearly correct.
22952 // FIXME: the fence is required if isReleaseOrStronger(Order), but it is not
22953 // clear whether it is needed otherwise; we might be able to be more aggressive
22954 // on relaxed idempotent rmw. In practice, they do not look useful, so we don't
22955 // try to be especially clever.
22956 if (SSID == SyncScope::SingleThread)
22957 // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at
22958 // the IR level, so we must wrap it in an intrinsic.
22961 if (!Subtarget.hasMFence())
22962 // FIXME: it might make sense to use a locked operation here but on a
22963 // different cache-line to prevent cache-line bouncing. In practice it
22964 // is probably a small win, and x86 processors without mfence are rare
22965 // enough that we do not bother.
22969 llvm::Intrinsic::getDeclaration(M, Intrinsic::x86_sse2_mfence);
22970 Builder.CreateCall(MFence, {});
22972 // Finally we can emit the atomic load.
22973 LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr,
22974 AI->getType()->getPrimitiveSizeInBits());
22975 Loaded->setAtomic(Order, SSID);
22976 AI->replaceAllUsesWith(Loaded);
22977 AI->eraseFromParent();
22981 static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget,
22982 SelectionDAG &DAG) {
22984 AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>(
22985 cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue());
22986 SyncScope::ID FenceSSID = static_cast<SyncScope::ID>(
22987 cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
22989 // The only fence that needs an instruction is a sequentially-consistent
22990 // cross-thread fence.
22991 if (FenceOrdering == AtomicOrdering::SequentiallyConsistent &&
22992 FenceSSID == SyncScope::System) {
22993 if (Subtarget.hasMFence())
22994 return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0));
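// On targets without MFENCE (pre-SSE2), fall back to a LOCK-prefixed no-op
// read-modify-write of the top of the stack ("lock or dword ptr [esp], 0");
// any locked instruction acts as a full memory barrier on x86.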
22996 SDValue Chain = Op.getOperand(0);
22997 SDValue Zero = DAG.getConstant(0, dl, MVT::i32);
22999 DAG.getRegister(X86::ESP, MVT::i32), // Base
23000 DAG.getTargetConstant(1, dl, MVT::i8), // Scale
23001 DAG.getRegister(0, MVT::i32), // Index
23002 DAG.getTargetConstant(0, dl, MVT::i32), // Disp
23003 DAG.getRegister(0, MVT::i32), // Segment.
23007 SDNode *Res = DAG.getMachineNode(X86::OR32mrLocked, dl, MVT::Other, Ops);
23008 return SDValue(Res, 0);
23011 // MEMBARRIER is a compiler barrier; it codegens to a no-op.
23012 return DAG.getNode(X86ISD::MEMBARRIER, dl, MVT::Other, Op.getOperand(0));
23015 static SDValue LowerCMP_SWAP(SDValue Op, const X86Subtarget &Subtarget,
23016 SelectionDAG &DAG) {
23017 MVT T = Op.getSimpleValueType();
23021 switch(T.SimpleTy) {
23022 default: llvm_unreachable("Invalid value type!");
23023 case MVT::i8: Reg = X86::AL; size = 1; break;
23024 case MVT::i16: Reg = X86::AX; size = 2; break;
23025 case MVT::i32: Reg = X86::EAX; size = 4; break;
23027 assert(Subtarget.is64Bit() && "Node not type legal!");
23028 Reg = X86::RAX; size = 8;
23031 SDValue cpIn = DAG.getCopyToReg(Op.getOperand(0), DL, Reg,
23032 Op.getOperand(2), SDValue());
23033 SDValue Ops[] = { cpIn.getValue(0),
23036 DAG.getTargetConstant(size, DL, MVT::i8),
23037 cpIn.getValue(1) };
23038 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
23039 MachineMemOperand *MMO = cast<AtomicSDNode>(Op)->getMemOperand();
23040 SDValue Result = DAG.getMemIntrinsicNode(X86ISD::LCMPXCHG_DAG, DL, Tys,
23044 DAG.getCopyFromReg(Result.getValue(0), DL, Reg, T, Result.getValue(1));
23045 SDValue EFLAGS = DAG.getCopyFromReg(cpOut.getValue(1), DL, X86::EFLAGS,
23046 MVT::i32, cpOut.getValue(2));
23047 SDValue Success = getSETCC(X86::COND_E, EFLAGS, DL, DAG);
23049 DAG.ReplaceAllUsesOfValueWith(Op.getValue(0), cpOut);
23050 DAG.ReplaceAllUsesOfValueWith(Op.getValue(1), Success);
23051 DAG.ReplaceAllUsesOfValueWith(Op.getValue(2), EFLAGS.getValue(1));
23055 static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
23056 SelectionDAG &DAG) {
23057 MVT SrcVT = Op.getOperand(0).getSimpleValueType();
23058 MVT DstVT = Op.getSimpleValueType();
23060 if (SrcVT == MVT::v2i32 || SrcVT == MVT::v4i16 || SrcVT == MVT::v8i8 ||
23061 SrcVT == MVT::i64) {
23062 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
23063 if (DstVT != MVT::f64)
23064 // This conversion needs to be expanded.
23067 SDValue Op0 = Op->getOperand(0);
23068 SmallVector<SDValue, 16> Elts;
23072 if (SrcVT.isVector()) {
23073 NumElts = SrcVT.getVectorNumElements();
23074 SVT = SrcVT.getVectorElementType();
23076 // Widen the input vector in the case of MVT::v2i32.
23077 // Example: from MVT::v2i32 to MVT::v4i32.
23078 for (unsigned i = 0, e = NumElts; i != e; ++i)
23079 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT, Op0,
23080 DAG.getIntPtrConstant(i, dl)));
23082 assert(SrcVT == MVT::i64 && !Subtarget.is64Bit() &&
23083 "Unexpected source type in LowerBITCAST");
23084 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23085 DAG.getIntPtrConstant(0, dl)));
23086 Elts.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, Op0,
23087 DAG.getIntPtrConstant(1, dl)));
23091 // Explicitly mark the extra elements as Undef.
23092 Elts.append(NumElts, DAG.getUNDEF(SVT));
23094 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
23095 SDValue BV = DAG.getBuildVector(NewVT, dl, Elts);
23096 SDValue ToV2F64 = DAG.getBitcast(MVT::v2f64, BV);
23097 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, ToV2F64,
23098 DAG.getIntPtrConstant(0, dl));
23101 assert(Subtarget.is64Bit() && !Subtarget.hasSSE2() &&
23102 Subtarget.hasMMX() && "Unexpected custom BITCAST");
23103 assert((DstVT == MVT::i64 ||
23104 (DstVT.isVector() && DstVT.getSizeInBits()==64)) &&
23105 "Unexpected custom BITCAST");
23106 // i64 <=> MMX conversions are Legal.
23107 if (SrcVT==MVT::i64 && DstVT.isVector())
23109 if (DstVT==MVT::i64 && SrcVT.isVector())
23111 // MMX <=> MMX conversions are Legal.
23112 if (SrcVT.isVector() && DstVT.isVector())
23114 // All other conversions need to be expanded.
23118 /// Compute the horizontal sum of bytes in V for the elements of VT.
23120 /// Requires V to be a byte vector and VT to be an integer vector type with
23121 /// wider elements than V's type. The width of the elements of VT determines
23122 /// how many bytes of V are summed horizontally to produce each element of the result.
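/// (For example, with VT = v2i64 each result element is the sum of 8
/// consecutive bytes of V.)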
23124 static SDValue LowerHorizontalByteSum(SDValue V, MVT VT,
23125 const X86Subtarget &Subtarget,
23126 SelectionDAG &DAG) {
23128 MVT ByteVecVT = V.getSimpleValueType();
23129 MVT EltVT = VT.getVectorElementType();
23130 assert(ByteVecVT.getVectorElementType() == MVT::i8 &&
23131 "Expected value to have byte element type.");
23132 assert(EltVT != MVT::i8 &&
23133 "Horizontal byte sum only makes sense for wider elements!");
23134 unsigned VecSize = VT.getSizeInBits();
23135 assert(ByteVecVT.getSizeInBits() == VecSize && "Cannot change vector size!");
23137 // The PSADBW instruction horizontally adds all bytes and leaves the result in
23138 // i64 chunks, so it directly computes the pop count for v2i64 and v4i64.
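// (Worked example, for illustration: if the 16 bytes of V hold the per-byte
//  counts {1,2,0,1,3,0,0,1, 2,2,1,0,0,0,4,1}, PSADBW against zero produces the
//  two i64 lanes {8, 10}, i.e. the pop counts of the two 64-bit elements.)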
23139 if (EltVT == MVT::i64) {
23140 SDValue Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23141 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23142 V = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT, V, Zeros);
23143 return DAG.getBitcast(VT, V);
23146 if (EltVT == MVT::i32) {
23147 // We unpack the low half and high half into i32s interleaved with zeros so
23148 // that we can use PSADBW to horizontally sum them. The most useful part of
23149 // this is that it lines up the results of two PSADBW instructions to be
23150 // two v2i64 vectors which concatenated are the 4 population counts. We can
23151 // then use PACKUSWB to shrink and concatenate them into a v4i32 again.
23152 SDValue Zeros = getZeroVector(VT, Subtarget, DAG, DL);
23153 SDValue V32 = DAG.getBitcast(VT, V);
23154 SDValue Low = DAG.getNode(X86ISD::UNPCKL, DL, VT, V32, Zeros);
23155 SDValue High = DAG.getNode(X86ISD::UNPCKH, DL, VT, V32, Zeros);
23157 // Do the horizontal sums into two v2i64s.
23158 Zeros = getZeroVector(ByteVecVT, Subtarget, DAG, DL);
23159 MVT SadVecVT = MVT::getVectorVT(MVT::i64, VecSize / 64);
23160 Low = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23161 DAG.getBitcast(ByteVecVT, Low), Zeros);
23162 High = DAG.getNode(X86ISD::PSADBW, DL, SadVecVT,
23163 DAG.getBitcast(ByteVecVT, High), Zeros);
23165 // Merge them together.
23166 MVT ShortVecVT = MVT::getVectorVT(MVT::i16, VecSize / 16);
23167 V = DAG.getNode(X86ISD::PACKUS, DL, ByteVecVT,
23168 DAG.getBitcast(ShortVecVT, Low),
23169 DAG.getBitcast(ShortVecVT, High));
23171 return DAG.getBitcast(VT, V);
23174 // The only element type left is i16.
23175 assert(EltVT == MVT::i16 && "Unknown how to handle type");
23177 // To obtain the pop count for each i16 element, starting from the pop count for
23178 // its i8 halves, shift the i16s left by 8, sum as i8s, and then shift as i16s
23179 // right by 8. It is important to shift as i16s because an i8 vector shift isn't
23180 // directly supported.
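// (Worked example: an i16 lane whose bytes hold the counts hi=3/lo=2, i.e.
//  0x0302, becomes 0x0200 after the shl, 0x0502 after the byte-wise add, and
//  0x0005 = 5 after the srl, which is the i16 pop count.)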
23181 SDValue ShifterV = DAG.getConstant(8, DL, VT);
23182 SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23183 V = DAG.getNode(ISD::ADD, DL, ByteVecVT, DAG.getBitcast(ByteVecVT, Shl),
23184 DAG.getBitcast(ByteVecVT, V));
23185 return DAG.getNode(ISD::SRL, DL, VT, DAG.getBitcast(VT, V), ShifterV);
23188 static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
23189 const X86Subtarget &Subtarget,
23190 SelectionDAG &DAG) {
23191 MVT VT = Op.getSimpleValueType();
23192 MVT EltVT = VT.getVectorElementType();
23193 unsigned VecSize = VT.getSizeInBits();
23195 // Implement a lookup table in register by using an algorithm based on:
23196 // http://wm.ite.pl/articles/sse-popcount.html
23198 // The general idea is that every nibble of every byte in the input vector is an
23199 // index into an in-register pre-computed pop count table. We then split the
23200 // input vector into two new ones: (1) a vector with only the shifted-right
23201 // higher nibbles for each byte and (2) a vector with the lower nibbles (and the
23202 // higher ones masked out) for each byte. PSHUFB is used separately with both
23203 // to index the in-register table. Next, both are added and the result is an
23204 // i8 vector where each element contains the pop count for its input byte.
23206 // To obtain the pop count for elements != i8, we follow up with the same
23207 // approach and use additional tricks as described below.
23209 const int LUT[16] = {/* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
23210 /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
23211 /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
23212 /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4};
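// (Worked example: for the input byte 0xA7 = 0b10100111, the high nibble 0xA
//  maps to LUT[0xA] = 2 and the low nibble 0x7 maps to LUT[0x7] = 3, so the
//  summed result is 5 = popcount(0xA7).)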
23214 int NumByteElts = VecSize / 8;
23215 MVT ByteVecVT = MVT::getVectorVT(MVT::i8, NumByteElts);
23216 SDValue In = DAG.getBitcast(ByteVecVT, Op);
23217 SmallVector<SDValue, 64> LUTVec;
23218 for (int i = 0; i < NumByteElts; ++i)
23219 LUTVec.push_back(DAG.getConstant(LUT[i % 16], DL, MVT::i8));
23220 SDValue InRegLUT = DAG.getBuildVector(ByteVecVT, DL, LUTVec);
23221 SDValue M0F = DAG.getConstant(0x0F, DL, ByteVecVT);
23224 SDValue FourV = DAG.getConstant(4, DL, ByteVecVT);
23225 SDValue HighNibbles = DAG.getNode(ISD::SRL, DL, ByteVecVT, In, FourV);
23228 SDValue LowNibbles = DAG.getNode(ISD::AND, DL, ByteVecVT, In, M0F);
23230 // The input vector is used as the shuffle mask that indexes elements into the
23231 // LUT. After counting low and high nibbles, add the two vectors to obtain the
23232 // final pop count per i8 element.
23233 SDValue HighPopCnt =
23234 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, HighNibbles);
23235 SDValue LowPopCnt =
23236 DAG.getNode(X86ISD::PSHUFB, DL, ByteVecVT, InRegLUT, LowNibbles);
23237 SDValue PopCnt = DAG.getNode(ISD::ADD, DL, ByteVecVT, HighPopCnt, LowPopCnt);
23239 if (EltVT == MVT::i8)
23242 return LowerHorizontalByteSum(PopCnt, VT, Subtarget, DAG);
23245 static SDValue LowerVectorCTPOPBitmath(SDValue Op, const SDLoc &DL,
23246 const X86Subtarget &Subtarget,
23247 SelectionDAG &DAG) {
23248 MVT VT = Op.getSimpleValueType();
23249 assert(VT.is128BitVector() &&
23250 "Only 128-bit vector bitmath lowering supported.");
23252 int VecSize = VT.getSizeInBits();
23253 MVT EltVT = VT.getVectorElementType();
23254 int Len = EltVT.getSizeInBits();
23256 // This is the vectorized version of the "best" algorithm from
23257 // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
23258 // with a minor tweak to use a series of adds + shifts instead of vector
23259 // multiplications. Implemented for all integer vector types. We only use
23260 // this when we don't have SSSE3 which allows a LUT-based lowering that is
23261 // much faster, even faster than using native popcnt instructions.
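// (For reference, a scalar sketch of the same algorithm on a 32-bit value:
//    v = v - ((v >> 1) & 0x55555555);
//    v = (v & 0x33333333) + ((v >> 2) & 0x33333333);
//    v = (v + (v >> 4)) & 0x0F0F0F0F;
//  after which each byte of v holds its own pop count. The code below performs
//  the same steps per vector element using shifts, ANDs and ADDs.)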
23263 auto GetShift = [&](unsigned OpCode, SDValue V, int Shifter) {
23264 MVT VT = V.getSimpleValueType();
23265 SDValue ShifterV = DAG.getConstant(Shifter, DL, VT);
23266 return DAG.getNode(OpCode, DL, VT, V, ShifterV);
23268 auto GetMask = [&](SDValue V, APInt Mask) {
23269 MVT VT = V.getSimpleValueType();
23270 SDValue MaskV = DAG.getConstant(Mask, DL, VT);
23271 return DAG.getNode(ISD::AND, DL, VT, V, MaskV);
23274 // We don't want to incur the implicit masks required to SRL vNi8 vectors on
23275 // x86, so set the SRL type to have elements at least i16 wide. This is
23276 // correct because all of our SRLs are followed immediately by a mask anyway
23277 // that handles any bits that sneak into the high bits of the byte elements.
23278 MVT SrlVT = Len > 8 ? VT : MVT::getVectorVT(MVT::i16, VecSize / 16);
23282 // v = v - ((v >> 1) & 0x55555555...)
23284 DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 1));
23285 SDValue And = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x55)));
23286 V = DAG.getNode(ISD::SUB, DL, VT, V, And);
23288 // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
23289 SDValue AndLHS = GetMask(V, APInt::getSplat(Len, APInt(8, 0x33)));
23290 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 2));
23291 SDValue AndRHS = GetMask(Srl, APInt::getSplat(Len, APInt(8, 0x33)));
23292 V = DAG.getNode(ISD::ADD, DL, VT, AndLHS, AndRHS);
23294 // v = (v + (v >> 4)) & 0x0F0F0F0F...
23295 Srl = DAG.getBitcast(VT, GetShift(ISD::SRL, DAG.getBitcast(SrlVT, V), 4));
23296 SDValue Add = DAG.getNode(ISD::ADD, DL, VT, V, Srl);
23297 V = GetMask(Add, APInt::getSplat(Len, APInt(8, 0x0F)));
23299 // At this point, V contains the byte-wise population count, and we are
23300 // merely doing a horizontal sum if necessary to get the wider element
23302 if (EltVT == MVT::i8)
23305 return LowerHorizontalByteSum(
23306 DAG.getBitcast(MVT::getVectorVT(MVT::i8, VecSize / 8), V), VT, Subtarget,
23310 // Please ensure that any codegen change from LowerVectorCTPOP is reflected in
23311 // updated cost models in X86TTIImpl::getIntrinsicInstrCost.
23312 static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23313 SelectionDAG &DAG) {
23314 MVT VT = Op.getSimpleValueType();
23315 assert((VT.is512BitVector() || VT.is256BitVector() || VT.is128BitVector()) &&
23316 "Unknown CTPOP type to handle");
23317 SDLoc DL(Op.getNode());
23318 SDValue Op0 = Op.getOperand(0);
23320 // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
23321 if (Subtarget.hasVPOPCNTDQ()) {
23322 if (VT == MVT::v8i16) {
23323 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
23324 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
23325 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23327 if (VT == MVT::v16i8 || VT == MVT::v16i16) {
23328 Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
23329 Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
23330 return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
23334 if (!Subtarget.hasSSSE3()) {
23335 // We can't use the fast LUT approach, so fall back on vectorized bitmath.
23336 assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!");
23337 return LowerVectorCTPOPBitmath(Op0, DL, Subtarget, DAG);
23340 // Decompose 256-bit ops into smaller 128-bit ops.
23341 if (VT.is256BitVector() && !Subtarget.hasInt256())
23342 return Lower256IntUnary(Op, DAG);
23344 // Decompose 512-bit ops into smaller 256-bit ops.
23345 if (VT.is512BitVector() && !Subtarget.hasBWI())
23346 return Lower512IntUnary(Op, DAG);
23348 return LowerVectorCTPOPInRegLUT(Op0, DL, Subtarget, DAG);
23351 static SDValue LowerCTPOP(SDValue Op, const X86Subtarget &Subtarget,
23352 SelectionDAG &DAG) {
23353 assert(Op.getSimpleValueType().isVector() &&
23354 "We only do custom lowering for vector population count.");
23355 return LowerVectorCTPOP(Op, Subtarget, DAG);
23358 static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
23359 MVT VT = Op.getSimpleValueType();
23360 SDValue In = Op.getOperand(0);
23363 // For scalars, it's still beneficial to transfer to/from the SIMD unit to
23364 // perform the BITREVERSE.
23365 if (!VT.isVector()) {
23366 MVT VecVT = MVT::getVectorVT(VT, 128 / VT.getSizeInBits());
23367 SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, In);
23368 Res = DAG.getNode(ISD::BITREVERSE, DL, VecVT, Res);
23369 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Res,
23370 DAG.getIntPtrConstant(0, DL));
23373 int NumElts = VT.getVectorNumElements();
23374 int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
23376 // Decompose 256-bit ops into smaller 128-bit ops.
23377 if (VT.is256BitVector())
23378 return Lower256IntUnary(Op, DAG);
23380 assert(VT.is128BitVector() &&
23381 "Only 128-bit vector bitreverse lowering supported.");
23383 // VPPERM reverses the bits of a byte with the permute Op (2 << 5), and we
23384 // perform the BSWAP in the shuffle.
23385 // It's best to shuffle using the second operand, as this will implicitly allow
23386 // memory folding for multiple vectors.
23387 SmallVector<SDValue, 16> MaskElts;
23388 for (int i = 0; i != NumElts; ++i) {
23389 for (int j = ScalarSizeInBytes - 1; j >= 0; --j) {
23390 int SourceByte = 16 + (i * ScalarSizeInBytes) + j;
23391 int PermuteByte = SourceByte | (2 << 5);
23392 MaskElts.push_back(DAG.getConstant(PermuteByte, DL, MVT::i8));
23396 SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, MaskElts);
23397 SDValue Res = DAG.getBitcast(MVT::v16i8, In);
23398 Res = DAG.getNode(X86ISD::VPPERM, DL, MVT::v16i8, DAG.getUNDEF(MVT::v16i8),
23400 return DAG.getBitcast(VT, Res);
23403 static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
23404 SelectionDAG &DAG) {
23405 if (Subtarget.hasXOP())
23406 return LowerBITREVERSE_XOP(Op, DAG);
23408 assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
23410 MVT VT = Op.getSimpleValueType();
23411 SDValue In = Op.getOperand(0);
23414 unsigned NumElts = VT.getVectorNumElements();
23415 assert(VT.getScalarType() == MVT::i8 &&
23416 "Only byte vector BITREVERSE supported");
23418 // Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
23419 if (VT.is256BitVector() && !Subtarget.hasInt256())
23420 return Lower256IntUnary(Op, DAG);
23422 // Perform BITREVERSE using PSHUFB lookups. Each byte is split into
23423 // two nibbles, and a PSHUFB lookup is used to find the bit-reverse of each
23424 // 0-15 value (moved to the other nibble).
23425 SDValue NibbleMask = DAG.getConstant(0xF, DL, VT);
23426 SDValue Lo = DAG.getNode(ISD::AND, DL, VT, In, NibbleMask);
23427 SDValue Hi = DAG.getNode(ISD::SRL, DL, VT, In, DAG.getConstant(4, DL, VT));
23429 const int LoLUT[16] = {
23430 /* 0 */ 0x00, /* 1 */ 0x80, /* 2 */ 0x40, /* 3 */ 0xC0,
23431 /* 4 */ 0x20, /* 5 */ 0xA0, /* 6 */ 0x60, /* 7 */ 0xE0,
23432 /* 8 */ 0x10, /* 9 */ 0x90, /* a */ 0x50, /* b */ 0xD0,
23433 /* c */ 0x30, /* d */ 0xB0, /* e */ 0x70, /* f */ 0xF0};
23434 const int HiLUT[16] = {
23435 /* 0 */ 0x00, /* 1 */ 0x08, /* 2 */ 0x04, /* 3 */ 0x0C,
23436 /* 4 */ 0x02, /* 5 */ 0x0A, /* 6 */ 0x06, /* 7 */ 0x0E,
23437 /* 8 */ 0x01, /* 9 */ 0x09, /* a */ 0x05, /* b */ 0x0D,
23438 /* c */ 0x03, /* d */ 0x0B, /* e */ 0x07, /* f */ 0x0F};
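// (Worked example: for the input byte 0x2C = 0b00101100, the low nibble 0xC
//  maps to LoLUT[0xC] = 0x30 and the high nibble 0x2 maps to HiLUT[0x2] = 0x04;
//  OR-ing them gives 0x34 = 0b00110100, the bit-reversed byte.)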
23440 SmallVector<SDValue, 16> LoMaskElts, HiMaskElts;
23441 for (unsigned i = 0; i < NumElts; ++i) {
23442 LoMaskElts.push_back(DAG.getConstant(LoLUT[i % 16], DL, MVT::i8));
23443 HiMaskElts.push_back(DAG.getConstant(HiLUT[i % 16], DL, MVT::i8));
23446 SDValue LoMask = DAG.getBuildVector(VT, DL, LoMaskElts);
23447 SDValue HiMask = DAG.getBuildVector(VT, DL, HiMaskElts);
23448 Lo = DAG.getNode(X86ISD::PSHUFB, DL, VT, LoMask, Lo);
23449 Hi = DAG.getNode(X86ISD::PSHUFB, DL, VT, HiMask, Hi);
23450 return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
23453 static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
23454 unsigned NewOpc = 0;
23455 switch (N->getOpcode()) {
23456 case ISD::ATOMIC_LOAD_ADD:
23457 NewOpc = X86ISD::LADD;
23459 case ISD::ATOMIC_LOAD_SUB:
23460 NewOpc = X86ISD::LSUB;
23462 case ISD::ATOMIC_LOAD_OR:
23463 NewOpc = X86ISD::LOR;
23465 case ISD::ATOMIC_LOAD_XOR:
23466 NewOpc = X86ISD::LXOR;
23468 case ISD::ATOMIC_LOAD_AND:
23469 NewOpc = X86ISD::LAND;
23472 llvm_unreachable("Unknown ATOMIC_LOAD_ opcode");
23475 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
23476 return DAG.getMemIntrinsicNode(
23477 NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
23478 {N->getOperand(0), N->getOperand(1), N->getOperand(2)},
23479 /*MemVT=*/N->getSimpleValueType(0), MMO);
23482 /// Lower atomic_load_ops into LOCK-prefixed operations.
23483 static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
23484 const X86Subtarget &Subtarget) {
23485 SDValue Chain = N->getOperand(0);
23486 SDValue LHS = N->getOperand(1);
23487 SDValue RHS = N->getOperand(2);
23488 unsigned Opc = N->getOpcode();
23489 MVT VT = N->getSimpleValueType(0);
23492 // We can lower atomic_load_add into LXADD. However, any other atomicrmw op
23493 // can only be lowered when the result is unused. They should have already
23494 // been transformed into a cmpxchg loop in AtomicExpand.
23495 if (N->hasAnyUseOfValue(0)) {
23496 // Handle (atomic_load_sub p, v) as (atomic_load_add p, -v), to be able to
23497 // select LXADD if LOCK_SUB can't be selected.
23498 if (Opc == ISD::ATOMIC_LOAD_SUB) {
23499 AtomicSDNode *AN = cast<AtomicSDNode>(N.getNode());
23500 RHS = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), RHS);
23501 return DAG.getAtomic(ISD::ATOMIC_LOAD_ADD, DL, VT, Chain, LHS,
23502 RHS, AN->getMemOperand());
23504 assert(Opc == ISD::ATOMIC_LOAD_ADD &&
23505 "Used AtomicRMW ops other than Add should have been expanded!");
23509 SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
23510 // RAUW the chain, but don't worry about the result, as it's unused.
23511 assert(!N->hasAnyUseOfValue(0));
23512 DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
23516 static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) {
23517 SDNode *Node = Op.getNode();
23519 EVT VT = cast<AtomicSDNode>(Node)->getMemoryVT();
23521 // Convert seq_cst store -> xchg
23522 // Convert wide store -> swap (-> cmpxchg8b/cmpxchg16b)
23523 // FIXME: On 32-bit, store -> fist or movq would be more efficient
23524 // (The only way to get a 16-byte store is cmpxchg16b)
23525 // FIXME: 16-byte ATOMIC_SWAP isn't actually hooked up at the moment.
23526 if (cast<AtomicSDNode>(Node)->getOrdering() ==
23527 AtomicOrdering::SequentiallyConsistent ||
23528 !DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
23529 SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
23530 cast<AtomicSDNode>(Node)->getMemoryVT(),
23531 Node->getOperand(0),
23532 Node->getOperand(1), Node->getOperand(2),
23533 cast<AtomicSDNode>(Node)->getMemOperand());
23534 return Swap.getValue(1);
23536 // Other atomic stores have a simple pattern.
23540 static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
23541 SDNode *N = Op.getNode();
23542 MVT VT = N->getSimpleValueType(0);
23544 // Let legalize expand this if it isn't a legal type yet.
23545 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
23548 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
23551 // Set the carry flag.
23552 SDValue Carry = Op.getOperand(2);
23553 EVT CarryVT = Carry.getValueType();
23554 APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
23555 Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
23556 Carry, DAG.getConstant(NegOne, DL, CarryVT));
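// (Adding all-ones to the incoming carry materializes it in EFLAGS: a carry of
//  1 wraps to 0 and sets CF, while a carry of 0 leaves CF clear, so the
//  ADC/SBB below can consume Carry.getValue(1) as its carry-in.)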
23558 unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
23559 SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
23560 Op.getOperand(1), Carry.getValue(1));
23562 SDValue SetCC = getSETCC(X86::COND_B, Sum.getValue(1), DL, DAG);
23563 if (N->getValueType(1) == MVT::i1)
23564 SetCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
23566 return DAG.getNode(ISD::MERGE_VALUES, DL, N->getVTList(), Sum, SetCC);
23569 static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
23570 SelectionDAG &DAG) {
23571 assert(Subtarget.isTargetDarwin() && Subtarget.is64Bit());
23573 // For MacOSX, we want to call an alternative entry point: __sincos_stret,
23574 // which returns the values as { float, float } (in XMM0) or
23575 // { double, double } (which is returned in XMM0, XMM1).
23577 SDValue Arg = Op.getOperand(0);
23578 EVT ArgVT = Arg.getValueType();
23579 Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
23581 TargetLowering::ArgListTy Args;
23582 TargetLowering::ArgListEntry Entry;
23586 Entry.IsSExt = false;
23587 Entry.IsZExt = false;
23588 Args.push_back(Entry);
23590 bool isF64 = ArgVT == MVT::f64;
23591 // Only optimize x86_64 for now. i386 is a bit messy. For f32,
23592 // the small struct {f32, f32} is returned in (eax, edx). For f64,
23593 // the results are returned via SRet in memory.
23594 const char *LibcallName = isF64 ? "__sincos_stret" : "__sincosf_stret";
23595 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
23597 DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
23599 Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
23600 : (Type *)VectorType::get(ArgTy, 4);
23602 TargetLowering::CallLoweringInfo CLI(DAG);
23603 CLI.setDebugLoc(dl)
23604 .setChain(DAG.getEntryNode())
23605 .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
23607 std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
23610 // Returned in xmm0 and xmm1.
23611 return CallResult.first;
23613 // Returned in bits 0:31 and 32:63 of xmm0.
23614 SDValue SinVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23615 CallResult.first, DAG.getIntPtrConstant(0, dl));
23616 SDValue CosVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ArgVT,
23617 CallResult.first, DAG.getIntPtrConstant(1, dl));
23618 SDVTList Tys = DAG.getVTList(ArgVT, ArgVT);
23619 return DAG.getNode(ISD::MERGE_VALUES, dl, Tys, SinVal, CosVal);
23622 /// Widen a vector input to a vector of NVT. The
23623 /// input vector must have the same element type as NVT.
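/// For example, widening a v2i32 value to v4i32 appends two undef (or zero,
/// when FillWithZeroes is set) elements after the original two.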
23624 static SDValue ExtendToType(SDValue InOp, MVT NVT, SelectionDAG &DAG,
23625 bool FillWithZeroes = false) {
23626 // Check if InOp already has the right width.
23627 MVT InVT = InOp.getSimpleValueType();
23631 if (InOp.isUndef())
23632 return DAG.getUNDEF(NVT);
23634 assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
23635 "input and widen element type must match");
23637 unsigned InNumElts = InVT.getVectorNumElements();
23638 unsigned WidenNumElts = NVT.getVectorNumElements();
23639 assert(WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0 &&
23640 "Unexpected request for vector widening");
23643 if (InOp.getOpcode() == ISD::CONCAT_VECTORS &&
23644 InOp.getNumOperands() == 2) {
23645 SDValue N1 = InOp.getOperand(1);
23646 if ((ISD::isBuildVectorAllZeros(N1.getNode()) && FillWithZeroes) ||
23648 InOp = InOp.getOperand(0);
23649 InVT = InOp.getSimpleValueType();
23650 InNumElts = InVT.getVectorNumElements();
23653 if (ISD::isBuildVectorOfConstantSDNodes(InOp.getNode()) ||
23654 ISD::isBuildVectorOfConstantFPSDNodes(InOp.getNode())) {
23655 SmallVector<SDValue, 16> Ops;
23656 for (unsigned i = 0; i < InNumElts; ++i)
23657 Ops.push_back(InOp.getOperand(i));
23659 EVT EltVT = InOp.getOperand(0).getValueType();
23661 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
23662 DAG.getUNDEF(EltVT);
23663 for (unsigned i = 0; i < WidenNumElts - InNumElts; ++i)
23664 Ops.push_back(FillVal);
23665 return DAG.getBuildVector(NVT, dl, Ops);
23667 SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, NVT) :
23669 return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NVT, FillVal,
23670 InOp, DAG.getIntPtrConstant(0, dl));
23673 static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
23674 SelectionDAG &DAG) {
23675 assert(Subtarget.hasAVX512() &&
23676 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23678 // X86 scatter kills the mask register, so its type should be added to
23679 // the list of return values.
23680 // If the "scatter" has 2 return values, it is already handled.
23681 if (Op.getNode()->getNumValues() == 2)
23684 MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
23685 SDValue Src = N->getValue();
23686 MVT VT = Src.getSimpleValueType();
23687 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
23690 SDValue NewScatter;
23691 SDValue Index = N->getIndex();
23692 SDValue Mask = N->getMask();
23693 SDValue Chain = N->getChain();
23694 SDValue BasePtr = N->getBasePtr();
23695 MVT MemVT = N->getMemoryVT().getSimpleVT();
23696 MVT IndexVT = Index.getSimpleValueType();
23697 MVT MaskVT = Mask.getSimpleValueType();
23699 if (MemVT.getScalarSizeInBits() < VT.getScalarSizeInBits()) {
23700 // The v2i32 value was promoted to v2i64.
23701 // Now we "redo" the type legalizer's work and widen the original
23702 // v2i32 value to v4i32. The original v2i32 is retrieved from v2i64
23704 assert((MemVT == MVT::v2i32 && VT == MVT::v2i64) &&
23705 "Unexpected memory type");
23706 int ShuffleMask[] = {0, 2, -1, -1};
23707 Src = DAG.getVectorShuffle(MVT::v4i32, dl, DAG.getBitcast(MVT::v4i32, Src),
23708 DAG.getUNDEF(MVT::v4i32), ShuffleMask);
23709 // Now we have 4 elements instead of 2.
23710 // Expand the index.
23711 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), 4);
23712 Index = ExtendToType(Index, NewIndexVT, DAG);
23714 // Expand the mask with zeroes
23715 // Mask may be <2 x i64> or <2 x i1> at this moment
23716 assert((MaskVT == MVT::v2i1 || MaskVT == MVT::v2i64) &&
23717 "Unexpected mask type");
23718 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), 4);
23719 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23723 unsigned NumElts = VT.getVectorNumElements();
23724 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23725 !Index.getSimpleValueType().is512BitVector()) {
23726 // AVX512F supports only 512-bit vectors. Either the data or the index should
23727 // be 512 bits wide. If both the index and data are currently 256-bit, but
23728 // the vector contains 8 elements, we just sign-extend the index
23729 if (IndexVT == MVT::v8i32)
23730 // Just extend index
23731 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23733 // The minimal number of elts in scatter is 8
23736 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23737 // Use original index here, do not modify the index twice
23738 Index = ExtendToType(N->getIndex(), NewIndexVT, DAG);
23739 if (IndexVT.getScalarType() == MVT::i32)
23740 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23743 // At this point we have promoted mask operand
23744 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23745 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23746 // Use the original mask here, do not modify the mask twice
23747 Mask = ExtendToType(N->getMask(), ExtMaskVT, DAG, true);
23749 // The value that should be stored
23750 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23751 Src = ExtendToType(Src, NewVT, DAG);
23754 // If the mask is "wide" at this point, truncate it to an i1 vector
23755 MVT BitMaskVT = MVT::getVectorVT(MVT::i1, NumElts);
23756 Mask = DAG.getNode(ISD::TRUNCATE, dl, BitMaskVT, Mask);
23758 // The mask is killed by scatter, add it to the values
23759 SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
23760 SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
23761 NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
23762 N->getMemOperand());
23763 DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
23764 return SDValue(NewScatter.getNode(), 1);
23767 static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
23768 SelectionDAG &DAG) {
23770 MaskedLoadSDNode *N = cast<MaskedLoadSDNode>(Op.getNode());
23771 MVT VT = Op.getSimpleValueType();
23772 MVT ScalarVT = VT.getScalarType();
23773 SDValue Mask = N->getMask();
23776 assert((!N->isExpandingLoad() || Subtarget.hasAVX512()) &&
23777 "Expanding masked load is supported on AVX-512 target only!");
23779 assert((!N->isExpandingLoad() || ScalarVT.getSizeInBits() >= 32) &&
23780 "Expanding masked load is supported for 32 and 64-bit types only!");
23782 // 4x32, 4x64 and 2x64 vectors of non-expanding loads are legal regardless of
23783 // VLX. These types for exp-loads are handled here.
23784 if (!N->isExpandingLoad() && VT.getVectorNumElements() <= 4)
23787 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23788 "Cannot lower masked load op.");
23790 assert((ScalarVT.getSizeInBits() >= 32 ||
23791 (Subtarget.hasBWI() &&
23792 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23793 "Unsupported masked load op.");
23795 // This operation is legal for targets with VLX, but without
23796 // VLX the vector should be widened to 512 bits
23797 unsigned NumEltsInWideVec = 512 / VT.getScalarSizeInBits();
23798 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23799 SDValue Src0 = N->getSrc0();
23800 Src0 = ExtendToType(Src0, WideDataVT, DAG);
23802 // Mask element has to be i1.
23803 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23804 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23805 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23807 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23809 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23810 if (MaskEltTy != MVT::i1)
23811 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23812 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23813 SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
23814 N->getBasePtr(), Mask, Src0,
23815 N->getMemoryVT(), N->getMemOperand(),
23816 N->getExtensionType(),
23817 N->isExpandingLoad());
23819 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23820 NewLoad.getValue(0),
23821 DAG.getIntPtrConstant(0, dl));
23822 SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
23823 return DAG.getMergeValues(RetOps, dl);
23826 static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
23827 SelectionDAG &DAG) {
23828 MaskedStoreSDNode *N = cast<MaskedStoreSDNode>(Op.getNode());
23829 SDValue DataToStore = N->getValue();
23830 MVT VT = DataToStore.getSimpleValueType();
23831 MVT ScalarVT = VT.getScalarType();
23832 SDValue Mask = N->getMask();
23835 assert((!N->isCompressingStore() || Subtarget.hasAVX512()) &&
23836 "Expanding masked load is supported on AVX-512 target only!");
23838 assert((!N->isCompressingStore() || ScalarVT.getSizeInBits() >= 32) &&
23839 "Expanding masked load is supported for 32 and 64-bit types only!");
23841 // 4x32 and 2x64 vectors of non-compressing stores are legal regardless of VLX.
23842 if (!N->isCompressingStore() && VT.getVectorNumElements() <= 4)
23845 assert(Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
23846 "Cannot lower masked store op.");
23848 assert((ScalarVT.getSizeInBits() >= 32 ||
23849 (Subtarget.hasBWI() &&
23850 (ScalarVT == MVT::i8 || ScalarVT == MVT::i16))) &&
23851 "Unsupported masked store op.");
23853 // This operation is legal for targets with VLX, but without
23854 // VLX the vector should be widened to 512 bits
23855 unsigned NumEltsInWideVec = 512/VT.getScalarSizeInBits();
23856 MVT WideDataVT = MVT::getVectorVT(ScalarVT, NumEltsInWideVec);
23858 // Mask element has to be i1.
23859 MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
23860 assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
23861 "We handle 4x32, 4x64 and 2x64 vectors only in this case");
23863 MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
23865 DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
23866 Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
23867 if (MaskEltTy != MVT::i1)
23868 Mask = DAG.getNode(ISD::TRUNCATE, dl,
23869 MVT::getVectorVT(MVT::i1, NumEltsInWideVec), Mask);
23870 return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
23871 Mask, N->getMemoryVT(), N->getMemOperand(),
23872 N->isTruncatingStore(), N->isCompressingStore());
23875 static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
23876 SelectionDAG &DAG) {
23877 assert(Subtarget.hasAVX512() &&
23878 "MGATHER/MSCATTER are supported on AVX-512 arch only");
23880 MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
23882 MVT VT = Op.getSimpleValueType();
23883 SDValue Index = N->getIndex();
23884 SDValue Mask = N->getMask();
23885 SDValue Src0 = N->getValue();
23886 MVT IndexVT = Index.getSimpleValueType();
23887 MVT MaskVT = Mask.getSimpleValueType();
23889 unsigned NumElts = VT.getVectorNumElements();
23890 assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
23892 if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
23893 !Index.getSimpleValueType().is512BitVector()) {
23894 // AVX512F supports only 512-bit vectors. Either the data or the index should
23895 // be 512 bits wide. If both the index and data are currently 256-bit, but
23896 // the vector contains 8 elements, we just sign-extend the index
23897 if (NumElts == 8) {
23898 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23899 SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
23900 N->getOperand(3), Index };
23901 DAG.UpdateNodeOperands(N, Ops);
23905 // Minimal number of elements in Gather
23908 MVT NewIndexVT = MVT::getVectorVT(IndexVT.getScalarType(), NumElts);
23909 Index = ExtendToType(Index, NewIndexVT, DAG);
23910 if (IndexVT.getScalarType() == MVT::i32)
23911 Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
23914 MVT MaskBitVT = MVT::getVectorVT(MVT::i1, NumElts);
23915 // At this point we have promoted mask operand
23916 assert(MaskVT.getScalarSizeInBits() >= 32 && "unexpected mask type");
23917 MVT ExtMaskVT = MVT::getVectorVT(MaskVT.getScalarType(), NumElts);
23918 Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
23919 Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
23921 // The pass-through value
23922 MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
23923 Src0 = ExtendToType(Src0, NewVT, DAG);
23925 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23926 SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
23927 N->getMemoryVT(), dl, Ops,
23928 N->getMemOperand());
23929 SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
23930 NewGather.getValue(0),
23931 DAG.getIntPtrConstant(0, dl));
23932 SDValue RetOps[] = {Exract, NewGather.getValue(1)};
23933 return DAG.getMergeValues(RetOps, dl);
23935 if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
23936 // There is a special case where the return type v2i32 is illegal and
23937 // the type legalizer extended it to v2i64. Without this conversion we end up
23938 // with VPGATHERQQ (reading q-words from memory) instead of VPGATHERQD.
23939 // In order to avoid this situation, we'll build an X86 specific Gather node
23940 // with index v2i64 and value type v4i32.
23941 assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
23942 "Unexpected type in masked gather");
23943 Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
23944 DAG.getBitcast(MVT::v4i32, Src0),
23945 DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
23946 // The mask should match the destination type. Extending the mask with zeroes
23947 // is not necessary since the instruction itself reads only two values from memory.
23949 Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
23950 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23951 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23952 DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
23953 N->getMemOperand());
23955 SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
23956 NewGather.getValue(0), DAG);
23957 SDValue RetOps[] = { Sext, NewGather.getValue(1) };
23958 return DAG.getMergeValues(RetOps, dl);
23960 if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
23961 // This transformation is for optimization only.
23962 // The type legalizer extended the mask and index to a 4-element vector
23963 // in order to match the requirements of the common gather node - same
23964 // vector width of index and value. The X86 gather node allows a mismatch
23965 // of vector widths in order to select a more optimal instruction at instruction selection time.
23967 assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
23968 "Unexpected type in masked gather");
23969 if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
23970 ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
23971 Index.getOpcode() == ISD::CONCAT_VECTORS &&
23972 Index.getOperand(1).isUndef()) {
23973 Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
23974 Index = Index.getOperand(0);
23977 SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
23978 SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
23979 DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
23980 N->getMemOperand());
23982 SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
23983 return DAG.getMergeValues(RetOps, dl);
23989 SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
23990 SelectionDAG &DAG) const {
23991 // TODO: Eventually, the lowering of these nodes should be informed by or
23992 // deferred to the GC strategy for the function in which they appear. For
23993 // now, however, they must be lowered to something. Since they are logically
23994 // no-ops in the case of a null GC strategy (or a GC strategy which does not
23995 // require special handling for these nodes), lower them as literal NOOPs for the time being.
23997 SmallVector<SDValue, 2> Ops;
23999 Ops.push_back(Op.getOperand(0));
24000 if (Op->getGluedNode())
24001 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24004 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24005 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24010 SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
24011 SelectionDAG &DAG) const {
24012 // TODO: Eventually, the lowering of these nodes should be informed by or
24013 // deferred to the GC strategy for the function in which they appear. For
24014 // now, however, they must be lowered to something. Since they are logically
24015 // no-ops in the case of a null GC strategy (or a GC strategy which does not
24016 // require special handling for these nodes), lower them as literal NOOPs for the time being.
24018 SmallVector<SDValue, 2> Ops;
24020 Ops.push_back(Op.getOperand(0));
24021 if (Op->getGluedNode())
24022 Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
24025 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
24026 SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
24031 /// Provide custom lowering hooks for some operations.
24032 SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
24033 switch (Op.getOpcode()) {
24034 default: llvm_unreachable("Should not custom lower this!");
24035 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, Subtarget, DAG);
24036 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
24037 return LowerCMP_SWAP(Op, Subtarget, DAG);
24038 case ISD::CTPOP: return LowerCTPOP(Op, Subtarget, DAG);
24039 case ISD::ATOMIC_LOAD_ADD:
24040 case ISD::ATOMIC_LOAD_SUB:
24041 case ISD::ATOMIC_LOAD_OR:
24042 case ISD::ATOMIC_LOAD_XOR:
24043 case ISD::ATOMIC_LOAD_AND: return lowerAtomicArith(Op, DAG, Subtarget);
24044 case ISD::ATOMIC_STORE: return LowerATOMIC_STORE(Op, DAG);
24045 case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
24046 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
24047 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
24048 case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
24049 case ISD::VSELECT: return LowerVSELECT(Op, DAG);
24050 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
24051 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
24052 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
24053 case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
24054 case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
24055 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
24056 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
24057 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
24058 case ISD::ExternalSymbol: return LowerExternalSymbol(Op, DAG);
24059 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
24060 case ISD::SHL_PARTS:
24061 case ISD::SRA_PARTS:
24062 case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
24063 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
24064 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
24065 case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
24066 case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
24067 case ISD::SIGN_EXTEND: return LowerSIGN_EXTEND(Op, Subtarget, DAG);
24068 case ISD::ANY_EXTEND: return LowerANY_EXTEND(Op, Subtarget, DAG);
24069 case ISD::ZERO_EXTEND_VECTOR_INREG:
24070 case ISD::SIGN_EXTEND_VECTOR_INREG:
24071 return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
24072 case ISD::FP_TO_SINT:
24073 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
24074 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
24075 case ISD::LOAD: return LowerExtendedLoad(Op, Subtarget, DAG);
24077 case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
24078 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
24079 case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
24080 case ISD::SETCC: return LowerSETCC(Op, DAG);
24081 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
24082 case ISD::SELECT: return LowerSELECT(Op, DAG);
24083 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
24084 case ISD::JumpTable: return LowerJumpTable(Op, DAG);
24085 case ISD::VASTART: return LowerVASTART(Op, DAG);
24086 case ISD::VAARG: return LowerVAARG(Op, DAG);
24087 case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
24088 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
24089 case ISD::INTRINSIC_VOID:
24090 case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
24091 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
24092 case ISD::ADDROFRETURNADDR: return LowerADDROFRETURNADDR(Op, DAG);
24093 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
24094 case ISD::FRAME_TO_ARGS_OFFSET:
24095 return LowerFRAME_TO_ARGS_OFFSET(Op, DAG);
24096 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
24097 case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG);
24098 case ISD::EH_SJLJ_SETJMP: return lowerEH_SJLJ_SETJMP(Op, DAG);
24099 case ISD::EH_SJLJ_LONGJMP: return lowerEH_SJLJ_LONGJMP(Op, DAG);
24100 case ISD::EH_SJLJ_SETUP_DISPATCH:
24101 return lowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
24102 case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
24103 case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
24104 case ISD::FLT_ROUNDS_: return LowerFLT_ROUNDS_(Op, DAG);
24106 case ISD::CTLZ_ZERO_UNDEF: return LowerCTLZ(Op, Subtarget, DAG);
24108 case ISD::CTTZ_ZERO_UNDEF: return LowerCTTZ(Op, DAG);
24109 case ISD::MUL: return LowerMUL(Op, Subtarget, DAG);
24111 case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG);
24112 case ISD::UMUL_LOHI:
24113 case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG);
24115 case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG);
24118 case ISD::SHL: return LowerShift(Op, Subtarget, DAG);
24124 case ISD::UMULO: return LowerXALUO(Op, DAG);
24125 case ISD::READCYCLECOUNTER: return LowerREADCYCLECOUNTER(Op, Subtarget,DAG);
24126 case ISD::BITCAST: return LowerBITCAST(Op, Subtarget, DAG);
24127 case ISD::ADDCARRY:
24128 case ISD::SUBCARRY: return LowerADDSUBCARRY(Op, DAG);
24130 case ISD::SUB: return LowerADD_SUB(Op, DAG);
24134 case ISD::UMIN: return LowerMINMAX(Op, DAG);
24135 case ISD::ABS: return LowerABS(Op, DAG);
24136 case ISD::FSINCOS: return LowerFSINCOS(Op, Subtarget, DAG);
24137 case ISD::MLOAD: return LowerMLOAD(Op, Subtarget, DAG);
24138 case ISD::MSTORE: return LowerMSTORE(Op, Subtarget, DAG);
24139 case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
24140 case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
24141 case ISD::GC_TRANSITION_START:
24142 return LowerGC_TRANSITION_START(Op, DAG);
24143 case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
24144 case ISD::STORE: return LowerTruncatingStore(Op, Subtarget, DAG);
24148 /// Places new result values for the node in Results (their number
24149 /// and types must exactly match those of the original return values of
24150 /// the node), or leaves Results empty, which indicates that the node is not
24151 /// to be custom lowered after all.
24152 void X86TargetLowering::LowerOperationWrapper(SDNode *N,
24153 SmallVectorImpl<SDValue> &Results,
24154 SelectionDAG &DAG) const {
24155 SDValue Res = LowerOperation(SDValue(N, 0), DAG);
24157 if (!Res.getNode())
24160 assert((N->getNumValues() <= Res->getNumValues()) &&
24161 "Lowering returned the wrong number of results!");
24163 // Places new result values based on the N result number.
24164 // In some cases (LowerSINT_TO_FP, for example) Res has more result values
24165 // than the original node; the chain should be dropped (last value).
24166 for (unsigned I = 0, E = N->getNumValues(); I != E; ++I)
24167 Results.push_back(Res.getValue(I));
24170 /// Replace a node with an illegal result type with a new node built out of custom code.
24172 void X86TargetLowering::ReplaceNodeResults(SDNode *N,
24173 SmallVectorImpl<SDValue>&Results,
24174 SelectionDAG &DAG) const {
24176 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
24177 switch (N->getOpcode()) {
24179 llvm_unreachable("Do not know how to custom type legalize this operation!");
24180 case X86ISD::AVG: {
24181 // Legalize types for X86ISD::AVG by expanding vectors.
24182 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24184 auto InVT = N->getValueType(0);
24185 auto InVTSize = InVT.getSizeInBits();
24186 const unsigned RegSize =
24187 (InVTSize > 128) ? ((InVTSize > 256) ? 512 : 256) : 128;
24188 assert((Subtarget.hasBWI() || RegSize < 512) &&
24189 "512-bit vector requires AVX512BW");
24190 assert((Subtarget.hasAVX2() || RegSize < 256) &&
24191 "256-bit vector requires AVX2");
24193 auto ElemVT = InVT.getVectorElementType();
24194 auto RegVT = EVT::getVectorVT(*DAG.getContext(), ElemVT,
24195 RegSize / ElemVT.getSizeInBits());
24196 assert(RegSize % InVT.getSizeInBits() == 0);
24197 unsigned NumConcat = RegSize / InVT.getSizeInBits();
24199 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getUNDEF(InVT));
24200 Ops[0] = N->getOperand(0);
24201 SDValue InVec0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24202 Ops[0] = N->getOperand(1);
24203 SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
24205 SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
24206 Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
24207 DAG.getIntPtrConstant(0, dl)));
24210 // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
24211 case X86ISD::FMINC:
24213 case X86ISD::FMAXC:
24214 case X86ISD::FMAX: {
24215 EVT VT = N->getValueType(0);
24216 assert(VT == MVT::v2f32 && "Unexpected type (!= v2f32) on FMIN/FMAX.");
24217 SDValue UNDEF = DAG.getUNDEF(VT);
24218 SDValue LHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24219 N->getOperand(0), UNDEF);
24220 SDValue RHS = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
24221 N->getOperand(1), UNDEF);
24222 Results.push_back(DAG.getNode(N->getOpcode(), dl, MVT::v4f32, LHS, RHS));
24230 case ISD::UDIVREM: {
24231 SDValue V = LowerWin64_i128OP(SDValue(N,0), DAG);
24232 Results.push_back(V);
24235 case ISD::FP_TO_SINT:
24236 case ISD::FP_TO_UINT: {
24237 bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
24239 if (N->getValueType(0) == MVT::v2i32) {
24240 assert((IsSigned || Subtarget.hasAVX512()) &&
24241 "Can only handle signed conversion without AVX512");
24242 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24243 SDValue Src = N->getOperand(0);
24244 if (Src.getValueType() == MVT::v2f64) {
24245 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24246 SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
24247 : X86ISD::CVTTP2UI,
24248 dl, MVT::v4i32, Src);
24249 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24250 Results.push_back(Res);
24253 if (Src.getValueType() == MVT::v2f32) {
24254 SDValue Idx = DAG.getIntPtrConstant(0, dl);
24255 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
24256 DAG.getUNDEF(MVT::v2f32));
24257 Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
24258 : ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
24259 Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
24260 Results.push_back(Res);
24264 // The FP_TO_INTHelper below only handles f32/f64/f80 scalar inputs,
24265 // so early out here.
24269 std::pair<SDValue,SDValue> Vals =
24270 FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, /*IsReplace=*/ true);
24271 SDValue FIST = Vals.first, StackSlot = Vals.second;
24272 if (FIST.getNode()) {
24273 EVT VT = N->getValueType(0);
24274 // Return a load from the stack slot.
24275 if (StackSlot.getNode())
24277 DAG.getLoad(VT, dl, FIST, StackSlot, MachinePointerInfo()));
24279 Results.push_back(FIST);
24283 case ISD::SINT_TO_FP: {
24284 assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
24285 SDValue Src = N->getOperand(0);
24286 if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
24288 Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
24291 case ISD::UINT_TO_FP: {
24292 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24293 EVT VT = N->getValueType(0);
24294 if (VT != MVT::v2f32)
24296 SDValue Src = N->getOperand(0);
24297 EVT SrcVT = Src.getValueType();
24298 if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
24299 Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
24302 if (SrcVT != MVT::v2i32)
24304 SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
24306 DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
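// 0x4330000000000000 is the double 2^52. OR-ing the zero-extended i32 lanes
// into the low mantissa bits of 2^52 and then subtracting 2^52 reconstructs
// each integer exactly as a double (the usual uint32->double bias trick).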
24307 SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
24308 DAG.getBitcast(MVT::v2i64, VBias));
24309 Or = DAG.getBitcast(MVT::v2f64, Or);
24310 // TODO: Are there any fast-math-flags to propagate here?
24311 SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
24312 Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
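// For illustration, the sequence above is the classic unsigned-to-double bias
// trick: OR-ing a zero-extended 32-bit value into the mantissa of 2^52
// (bit pattern 0x4330000000000000) yields exactly 2^52 + x, so subtracting the
// bias recovers x as an f64, which VFPROUND then narrows to f32. A hypothetical
// scalar equivalent (assuming IEEE-754 doubles, <cstdint> and <cstring>):
//   double U32ToF64(uint32_t X) {
//     uint64_t Bits = 0x4330000000000000ULL | X; // 2^52 + X, represented exactly
//     double D;
//     std::memcpy(&D, &Bits, sizeof(D));
//     return D - 4503599627370496.0;             // subtract 2^52
//   }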
24315 case ISD::FP_ROUND: {
24316 if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
24318 SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
24319 Results.push_back(V);
24322 case ISD::FP_EXTEND: {
24323 // Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
24324 // No other ValueType for FP_EXTEND should reach this point.
24325 assert(N->getValueType(0) == MVT::v2f32 &&
24326 "Do not know how to legalize this Node");
24329 case ISD::INTRINSIC_W_CHAIN: {
24330 unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
24332 default : llvm_unreachable("Do not know how to custom type "
24333 "legalize this intrinsic operation!");
24334 case Intrinsic::x86_rdtsc:
24335 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24337 case Intrinsic::x86_rdtscp:
24338 return getReadTimeStampCounter(N, dl, X86ISD::RDTSCP_DAG, DAG, Subtarget,
24340 case Intrinsic::x86_rdpmc:
24341 return getReadPerformanceCounter(N, dl, DAG, Subtarget, Results);
24343 case Intrinsic::x86_xgetbv:
24344 return getExtendedControlRegister(N, dl, DAG, Subtarget, Results);
24347 case ISD::INTRINSIC_WO_CHAIN: {
24348 if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
24349 Results.push_back(V);
24352 case ISD::READCYCLECOUNTER: {
24353 return getReadTimeStampCounter(N, dl, X86ISD::RDTSC_DAG, DAG, Subtarget,
24356 case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
24357 EVT T = N->getValueType(0);
24358 assert((T == MVT::i64 || T == MVT::i128) && "can only expand cmpxchg pair");
24359 bool Regs64bit = T == MVT::i128;
24360 MVT HalfT = Regs64bit ? MVT::i64 : MVT::i32;
24361 SDValue cpInL, cpInH;
24362 cpInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24363 DAG.getConstant(0, dl, HalfT));
24364 cpInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(2),
24365 DAG.getConstant(1, dl, HalfT));
24366 cpInL = DAG.getCopyToReg(N->getOperand(0), dl,
24367 Regs64bit ? X86::RAX : X86::EAX,
24369 cpInH = DAG.getCopyToReg(cpInL.getValue(0), dl,
24370 Regs64bit ? X86::RDX : X86::EDX,
24371 cpInH, cpInL.getValue(1));
24372 SDValue swapInL, swapInH;
24373 swapInL = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24374 DAG.getConstant(0, dl, HalfT));
24375 swapInH = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, HalfT, N->getOperand(3),
24376 DAG.getConstant(1, dl, HalfT));
24378 DAG.getCopyToReg(cpInH.getValue(0), dl, Regs64bit ? X86::RCX : X86::ECX,
24379 swapInH, cpInH.getValue(1));
24380 // If the current function needs the base pointer, RBX,
24381 // we shouldn't use cmpxchg directly: the lowering of that
24382 // instruction clobbers RBX, and since RBX is then a reserved
24383 // register, the register allocator will not ensure that its
24384 // value is properly saved and restored around this
24385 // live-range.
24386 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
24388 SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
24389 unsigned BasePtr = TRI->getBaseRegister();
24390 MachineMemOperand *MMO = cast<AtomicSDNode>(N)->getMemOperand();
24391 if (TRI->hasBasePointer(DAG.getMachineFunction()) &&
24392 (BasePtr == X86::RBX || BasePtr == X86::EBX)) {
24393 // ISel prefers the LCMPXCHG64 variant.
24394 // If the assert below fires, that is no longer the case,
24395 // and we need to teach LCMPXCHG8_SAVE_EBX_DAG how to save RBX,
24396 // not just EBX. This is a matter of accepting i64 input for that
24397 // pseudo, and restoring into a register of the right width
24398 // in the expand pseudo. Everything else should just work.
24399 assert(((Regs64bit == (BasePtr == X86::RBX)) || BasePtr == X86::EBX) &&
24400 "Saving only half of the RBX");
24401 unsigned Opcode = Regs64bit ? X86ISD::LCMPXCHG16_SAVE_RBX_DAG
24402 : X86ISD::LCMPXCHG8_SAVE_EBX_DAG;
24403 SDValue RBXSave = DAG.getCopyFromReg(swapInH.getValue(0), dl,
24404 Regs64bit ? X86::RBX : X86::EBX,
24405 HalfT, swapInH.getValue(1));
24406 SDValue Ops[] = {/*Chain*/ RBXSave.getValue(1), N->getOperand(1), swapInL,
24408 /*Glue*/ RBXSave.getValue(2)};
24409 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24412 Regs64bit ? X86ISD::LCMPXCHG16_DAG : X86ISD::LCMPXCHG8_DAG;
24413 swapInL = DAG.getCopyToReg(swapInH.getValue(0), dl,
24414 Regs64bit ? X86::RBX : X86::EBX, swapInL,
24415 swapInH.getValue(1));
24416 SDValue Ops[] = {swapInL.getValue(0), N->getOperand(1),
24417 swapInL.getValue(1)};
24418 Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, T, MMO);
24420 SDValue cpOutL = DAG.getCopyFromReg(Result.getValue(0), dl,
24421 Regs64bit ? X86::RAX : X86::EAX,
24422 HalfT, Result.getValue(1));
24423 SDValue cpOutH = DAG.getCopyFromReg(cpOutL.getValue(1), dl,
24424 Regs64bit ? X86::RDX : X86::EDX,
24425 HalfT, cpOutL.getValue(2));
24426 SDValue OpsF[] = { cpOutL.getValue(0), cpOutH.getValue(0)};
24428 SDValue EFLAGS = DAG.getCopyFromReg(cpOutH.getValue(1), dl, X86::EFLAGS,
24429 MVT::i32, cpOutH.getValue(2));
24430 SDValue Success = getSETCC(X86::COND_E, EFLAGS, dl, DAG);
24431 Success = DAG.getZExtOrTrunc(Success, dl, N->getValueType(1));
24433 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, T, OpsF));
24434 Results.push_back(Success);
24435 Results.push_back(EFLAGS.getValue(1));
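// For reference, this expansion follows the cmpxchg8b/cmpxchg16b convention:
// the expected value is passed in EDX:EAX (RDX:RAX for i128), the replacement
// value in ECX:EBX (RCX:RBX), the old value comes back in EDX:EAX (RDX:RAX),
// and ZF is set on success, which is what the COND_E SETCC above tests.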
24438 case ISD::ATOMIC_SWAP:
24439 case ISD::ATOMIC_LOAD_ADD:
24440 case ISD::ATOMIC_LOAD_SUB:
24441 case ISD::ATOMIC_LOAD_AND:
24442 case ISD::ATOMIC_LOAD_OR:
24443 case ISD::ATOMIC_LOAD_XOR:
24444 case ISD::ATOMIC_LOAD_NAND:
24445 case ISD::ATOMIC_LOAD_MIN:
24446 case ISD::ATOMIC_LOAD_MAX:
24447 case ISD::ATOMIC_LOAD_UMIN:
24448 case ISD::ATOMIC_LOAD_UMAX:
24449 case ISD::ATOMIC_LOAD: {
24450 // Delegate to generic TypeLegalization. Situations we can really handle
24451 // should have already been dealt with by AtomicExpandPass.cpp.
24454 case ISD::BITCAST: {
24455 assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
24456 EVT DstVT = N->getValueType(0);
24457 EVT SrcVT = N->getOperand(0)->getValueType(0);
24459 if (SrcVT != MVT::f64 ||
24460 (DstVT != MVT::v2i32 && DstVT != MVT::v4i16 && DstVT != MVT::v8i8))
24463 unsigned NumElts = DstVT.getVectorNumElements();
24464 EVT SVT = DstVT.getVectorElementType();
24465 EVT WiderVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumElts * 2);
24466 SDValue Expanded = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
24467 MVT::v2f64, N->getOperand(0));
24468 SDValue ToVecInt = DAG.getBitcast(WiderVT, Expanded);
24470 if (ExperimentalVectorWideningLegalization) {
24471 // If we are legalizing vectors by widening, we already have the desired
24472 // legal vector type, so just return it.
24473 Results.push_back(ToVecInt);
24477 SmallVector<SDValue, 8> Elts;
24478 for (unsigned i = 0, e = NumElts; i != e; ++i)
24479 Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SVT,
24480 ToVecInt, DAG.getIntPtrConstant(i, dl)));
24482 Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
24487 const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
24488 switch ((X86ISD::NodeType)Opcode) {
24489 case X86ISD::FIRST_NUMBER: break;
24490 case X86ISD::BSF: return "X86ISD::BSF";
24491 case X86ISD::BSR: return "X86ISD::BSR";
24492 case X86ISD::SHLD: return "X86ISD::SHLD";
24493 case X86ISD::SHRD: return "X86ISD::SHRD";
24494 case X86ISD::FAND: return "X86ISD::FAND";
24495 case X86ISD::FANDN: return "X86ISD::FANDN";
24496 case X86ISD::FOR: return "X86ISD::FOR";
24497 case X86ISD::FXOR: return "X86ISD::FXOR";
24498 case X86ISD::FILD: return "X86ISD::FILD";
24499 case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
24500 case X86ISD::FP_TO_INT16_IN_MEM: return "X86ISD::FP_TO_INT16_IN_MEM";
24501 case X86ISD::FP_TO_INT32_IN_MEM: return "X86ISD::FP_TO_INT32_IN_MEM";
24502 case X86ISD::FP_TO_INT64_IN_MEM: return "X86ISD::FP_TO_INT64_IN_MEM";
24503 case X86ISD::FLD: return "X86ISD::FLD";
24504 case X86ISD::FST: return "X86ISD::FST";
24505 case X86ISD::CALL: return "X86ISD::CALL";
24506 case X86ISD::RDTSC_DAG: return "X86ISD::RDTSC_DAG";
24507 case X86ISD::RDTSCP_DAG: return "X86ISD::RDTSCP_DAG";
24508 case X86ISD::RDPMC_DAG: return "X86ISD::RDPMC_DAG";
24509 case X86ISD::BT: return "X86ISD::BT";
24510 case X86ISD::CMP: return "X86ISD::CMP";
24511 case X86ISD::COMI: return "X86ISD::COMI";
24512 case X86ISD::UCOMI: return "X86ISD::UCOMI";
24513 case X86ISD::CMPM: return "X86ISD::CMPM";
24514 case X86ISD::CMPMU: return "X86ISD::CMPMU";
24515 case X86ISD::CMPM_RND: return "X86ISD::CMPM_RND";
24516 case X86ISD::SETCC: return "X86ISD::SETCC";
24517 case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
24518 case X86ISD::FSETCC: return "X86ISD::FSETCC";
24519 case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
24520 case X86ISD::FSETCCM_RND: return "X86ISD::FSETCCM_RND";
24521 case X86ISD::CMOV: return "X86ISD::CMOV";
24522 case X86ISD::BRCOND: return "X86ISD::BRCOND";
24523 case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
24524 case X86ISD::IRET: return "X86ISD::IRET";
24525 case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
24526 case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
24527 case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
24528 case X86ISD::Wrapper: return "X86ISD::Wrapper";
24529 case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
24530 case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
24531 case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
24532 case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
24533 case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
24534 case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
24535 case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
24536 case X86ISD::PINSRB: return "X86ISD::PINSRB";
24537 case X86ISD::PINSRW: return "X86ISD::PINSRW";
24538 case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
24539 case X86ISD::ANDNP: return "X86ISD::ANDNP";
24540 case X86ISD::BLENDI: return "X86ISD::BLENDI";
24541 case X86ISD::SHRUNKBLEND: return "X86ISD::SHRUNKBLEND";
24542 case X86ISD::ADDUS: return "X86ISD::ADDUS";
24543 case X86ISD::SUBUS: return "X86ISD::SUBUS";
24544 case X86ISD::HADD: return "X86ISD::HADD";
24545 case X86ISD::HSUB: return "X86ISD::HSUB";
24546 case X86ISD::FHADD: return "X86ISD::FHADD";
24547 case X86ISD::FHSUB: return "X86ISD::FHSUB";
24548 case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
24549 case X86ISD::FMAX: return "X86ISD::FMAX";
24550 case X86ISD::FMAXS: return "X86ISD::FMAXS";
24551 case X86ISD::FMAX_RND: return "X86ISD::FMAX_RND";
24552 case X86ISD::FMAXS_RND: return "X86ISD::FMAXS_RND";
24553 case X86ISD::FMIN: return "X86ISD::FMIN";
24554 case X86ISD::FMINS: return "X86ISD::FMINS";
24555 case X86ISD::FMIN_RND: return "X86ISD::FMIN_RND";
24556 case X86ISD::FMINS_RND: return "X86ISD::FMINS_RND";
24557 case X86ISD::FMAXC: return "X86ISD::FMAXC";
24558 case X86ISD::FMINC: return "X86ISD::FMINC";
24559 case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
24560 case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
24561 case X86ISD::FRCP: return "X86ISD::FRCP";
24562 case X86ISD::FRCPS: return "X86ISD::FRCPS";
24563 case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
24564 case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
24565 case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
24566 case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
24567 case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
24568 case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
24569 case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
24570 case X86ISD::EH_SJLJ_SETUP_DISPATCH:
24571 return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
24572 case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
24573 case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
24574 case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
24575 case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
24576 case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
24577 case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
24578 case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
24579 case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
24580 return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
24581 case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
24582 return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
24583 case X86ISD::LADD: return "X86ISD::LADD";
24584 case X86ISD::LSUB: return "X86ISD::LSUB";
24585 case X86ISD::LOR: return "X86ISD::LOR";
24586 case X86ISD::LXOR: return "X86ISD::LXOR";
24587 case X86ISD::LAND: return "X86ISD::LAND";
24588 case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
24589 case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
24590 case X86ISD::VZEXT: return "X86ISD::VZEXT";
24591 case X86ISD::VSEXT: return "X86ISD::VSEXT";
24592 case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
24593 case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
24594 case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
24595 case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
24596 case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
24597 case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
24598 case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
24599 case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
24600 case X86ISD::VFPEXT_RND: return "X86ISD::VFPEXT_RND";
24601 case X86ISD::VFPEXTS_RND: return "X86ISD::VFPEXTS_RND";
24602 case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
24603 case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
24604 case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
24605 case X86ISD::CVT2MASK: return "X86ISD::CVT2MASK";
24606 case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
24607 case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
24608 case X86ISD::VSHL: return "X86ISD::VSHL";
24609 case X86ISD::VSRL: return "X86ISD::VSRL";
24610 case X86ISD::VSRA: return "X86ISD::VSRA";
24611 case X86ISD::VSHLI: return "X86ISD::VSHLI";
24612 case X86ISD::VSRLI: return "X86ISD::VSRLI";
24613 case X86ISD::VSRAI: return "X86ISD::VSRAI";
24614 case X86ISD::VSRAV: return "X86ISD::VSRAV";
24615 case X86ISD::VROTLI: return "X86ISD::VROTLI";
24616 case X86ISD::VROTRI: return "X86ISD::VROTRI";
24617 case X86ISD::VPPERM: return "X86ISD::VPPERM";
24618 case X86ISD::CMPP: return "X86ISD::CMPP";
24619 case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
24620 case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
24621 case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
24622 case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
24623 case X86ISD::ADD: return "X86ISD::ADD";
24624 case X86ISD::SUB: return "X86ISD::SUB";
24625 case X86ISD::ADC: return "X86ISD::ADC";
24626 case X86ISD::SBB: return "X86ISD::SBB";
24627 case X86ISD::SMUL: return "X86ISD::SMUL";
24628 case X86ISD::UMUL: return "X86ISD::UMUL";
24629 case X86ISD::SMUL8: return "X86ISD::SMUL8";
24630 case X86ISD::UMUL8: return "X86ISD::UMUL8";
24631 case X86ISD::SDIVREM8_SEXT_HREG: return "X86ISD::SDIVREM8_SEXT_HREG";
24632 case X86ISD::UDIVREM8_ZEXT_HREG: return "X86ISD::UDIVREM8_ZEXT_HREG";
24633 case X86ISD::INC: return "X86ISD::INC";
24634 case X86ISD::DEC: return "X86ISD::DEC";
24635 case X86ISD::OR: return "X86ISD::OR";
24636 case X86ISD::XOR: return "X86ISD::XOR";
24637 case X86ISD::AND: return "X86ISD::AND";
24638 case X86ISD::BEXTR: return "X86ISD::BEXTR";
24639 case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
24640 case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
24641 case X86ISD::PTEST: return "X86ISD::PTEST";
24642 case X86ISD::TESTP: return "X86ISD::TESTP";
24643 case X86ISD::TESTM: return "X86ISD::TESTM";
24644 case X86ISD::TESTNM: return "X86ISD::TESTNM";
24645 case X86ISD::KORTEST: return "X86ISD::KORTEST";
24646 case X86ISD::KTEST: return "X86ISD::KTEST";
24647 case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
24648 case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
24649 case X86ISD::PACKSS: return "X86ISD::PACKSS";
24650 case X86ISD::PACKUS: return "X86ISD::PACKUS";
24651 case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
24652 case X86ISD::VALIGN: return "X86ISD::VALIGN";
24653 case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
24654 case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
24655 case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
24656 case X86ISD::SHUFP: return "X86ISD::SHUFP";
24657 case X86ISD::SHUF128: return "X86ISD::SHUF128";
24658 case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
24659 case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
24660 case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
24661 case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
24662 case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
24663 case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
24664 case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
24665 case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
24666 case X86ISD::MOVSD: return "X86ISD::MOVSD";
24667 case X86ISD::MOVSS: return "X86ISD::MOVSS";
24668 case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
24669 case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
24670 case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
24671 case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
24672 case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
24673 case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
24674 case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
24675 case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
24676 case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
24677 case X86ISD::VPERMV: return "X86ISD::VPERMV";
24678 case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
24679 case X86ISD::VPERMIV3: return "X86ISD::VPERMIV3";
24680 case X86ISD::VPERMI: return "X86ISD::VPERMI";
24681 case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
24682 case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
24683 case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
24684 case X86ISD::VRANGE: return "X86ISD::VRANGE";
24685 case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
24686 case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
24687 case X86ISD::PSADBW: return "X86ISD::PSADBW";
24688 case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
24689 case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
24690 case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
24691 case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
24692 case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
24693 case X86ISD::MFENCE: return "X86ISD::MFENCE";
24694 case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
24695 case X86ISD::SAHF: return "X86ISD::SAHF";
24696 case X86ISD::RDRAND: return "X86ISD::RDRAND";
24697 case X86ISD::RDSEED: return "X86ISD::RDSEED";
24698 case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
24699 case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
24700 case X86ISD::VPROT: return "X86ISD::VPROT";
24701 case X86ISD::VPROTI: return "X86ISD::VPROTI";
24702 case X86ISD::VPSHA: return "X86ISD::VPSHA";
24703 case X86ISD::VPSHL: return "X86ISD::VPSHL";
24704 case X86ISD::VPCOM: return "X86ISD::VPCOM";
24705 case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
24706 case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
24707 case X86ISD::FMADD: return "X86ISD::FMADD";
24708 case X86ISD::FMSUB: return "X86ISD::FMSUB";
24709 case X86ISD::FNMADD: return "X86ISD::FNMADD";
24710 case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
24711 case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
24712 case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
24713 case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
24714 case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
24715 case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
24716 case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
24717 case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
24718 case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
24719 case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
24720 case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
24721 case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
24722 case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
24723 case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
24724 case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
24725 case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
24726 case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
24727 case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
24728 case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
24729 case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
24730 case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
24731 case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
24732 case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
24733 case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
24734 case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
24735 case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
24736 case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
24737 case X86ISD::XTEST: return "X86ISD::XTEST";
24738 case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
24739 case X86ISD::EXPAND: return "X86ISD::EXPAND";
24740 case X86ISD::SELECT: return "X86ISD::SELECT";
24741 case X86ISD::SELECTS: return "X86ISD::SELECTS";
24742 case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
24743 case X86ISD::RCP28: return "X86ISD::RCP28";
24744 case X86ISD::RCP28S: return "X86ISD::RCP28S";
24745 case X86ISD::EXP2: return "X86ISD::EXP2";
24746 case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
24747 case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
24748 case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
24749 case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
24750 case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
24751 case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
24752 case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
24753 case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
24754 case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
24755 case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
24756 case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
24757 case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
24758 case X86ISD::FGETEXP_RND: return "X86ISD::FGETEXP_RND";
24759 case X86ISD::FGETEXPS_RND: return "X86ISD::FGETEXPS_RND";
24760 case X86ISD::SCALEF: return "X86ISD::SCALEF";
24761 case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
24762 case X86ISD::ADDS: return "X86ISD::ADDS";
24763 case X86ISD::SUBS: return "X86ISD::SUBS";
24764 case X86ISD::AVG: return "X86ISD::AVG";
24765 case X86ISD::MULHRS: return "X86ISD::MULHRS";
24766 case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
24767 case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
24768 case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
24769 case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
24770 case X86ISD::CVTTP2SI_RND: return "X86ISD::CVTTP2SI_RND";
24771 case X86ISD::CVTTP2UI_RND: return "X86ISD::CVTTP2UI_RND";
24772 case X86ISD::CVTTS2SI_RND: return "X86ISD::CVTTS2SI_RND";
24773 case X86ISD::CVTTS2UI_RND: return "X86ISD::CVTTS2UI_RND";
24774 case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
24775 case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
24776 case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
24777 case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
24778 case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
24779 case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
24780 case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
24781 case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
24782 case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
24783 case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
24784 case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
24785 case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
24786 case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
24787 case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
24788 case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
24789 case X86ISD::LWPINS: return "X86ISD::LWPINS";
24790 case X86ISD::MGATHER: return "X86ISD::MGATHER";
24795 /// Return true if the addressing mode represented by AM is legal for this
24796 /// target, for a load/store of the specified type.
24797 bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
24798 const AddrMode &AM, Type *Ty,
24799 unsigned AS) const {
24800 // X86 supports extremely general addressing modes.
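// A single memory operand may combine a base register, a scaled index and a
// 32-bit displacement, e.g. (roughly) movl 0x12345678(%rbx,%rcx,4), %eax.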
24801 CodeModel::Model M = getTargetMachine().getCodeModel();
24803 // X86 allows a sign-extended 32-bit immediate field as a displacement.
24804 if (!X86::isOffsetSuitableForCodeModel(AM.BaseOffs, M, AM.BaseGV != nullptr))
24808 unsigned GVFlags = Subtarget.classifyGlobalReference(AM.BaseGV);
24810 // If a reference to this global requires an extra load, we can't fold it.
24811 if (isGlobalStubReference(GVFlags))
24814 // If BaseGV requires a register for the PIC base, we cannot also have a
24815 // BaseReg specified.
24816 if (AM.HasBaseReg && isGlobalRelativeToPICBase(GVFlags))
24819 // If lower 4G is not available, then we must use rip-relative addressing.
24820 if ((M != CodeModel::Small || isPositionIndependent()) &&
24821 Subtarget.is64Bit() && (AM.BaseOffs || AM.Scale > 1))
24825 switch (AM.Scale) {
24831 // These scales always work.
24836 // These scales are formed with basereg+scalereg. Only accept if there is no base register yet.
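// (Scales of 3, 5 and 9 are only expressible as lea (%reg,%reg,2|4|8), which
// reuses the base-register slot for the same register, so they cannot coexist
// with a separate base register.)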
24841 default: // Other stuff never works.
24848 bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
24849 unsigned Bits = Ty->getScalarSizeInBits();
24851 // 8-bit shifts are always expensive, but versions with a scalar amount aren't
24852 // particularly cheaper than those without.
24856 // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
24857 // variable shifts just as cheap as scalar ones.
24858 if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
24861 // Otherwise, it's significantly cheaper to shift by a scalar amount than by a
24862 // fully general vector.
24866 bool X86TargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
24867 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24869 unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
24870 unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
24871 return NumBits1 > NumBits2;
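// (For example, truncating i64 to i32 is free: the result is simply the 32-bit
// subregister, so no instruction needs to be emitted.)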
24874 bool X86TargetLowering::allowTruncateForTailCall(Type *Ty1, Type *Ty2) const {
24875 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
24878 if (!isTypeLegal(EVT::getEVT(Ty1)))
24881 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
24883 // Assuming the caller doesn't have a zeroext or signext return parameter,
24884 // truncation all the way down to i1 is valid.
24888 bool X86TargetLowering::isLegalICmpImmediate(int64_t Imm) const {
24889 return isInt<32>(Imm);
24892 bool X86TargetLowering::isLegalAddImmediate(int64_t Imm) const {
24893 // Can also use sub to handle negated immediates.
24894 return isInt<32>(Imm);
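// (For example, addq $0x7fffffff, %rax encodes the constant as a sign-extended
// imm32, whereas adding 0x100000000 would first require materializing the
// constant into a scratch register, e.g. with movabsq.)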
24897 bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
24898 if (!VT1.isInteger() || !VT2.isInteger())
24900 unsigned NumBits1 = VT1.getSizeInBits();
24901 unsigned NumBits2 = VT2.getSizeInBits();
24902 return NumBits1 > NumBits2;
24905 bool X86TargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
24906 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24907 return Ty1->isIntegerTy(32) && Ty2->isIntegerTy(64) && Subtarget.is64Bit();
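// (For example, movl %edi, %eax already clears bits 63:32 of %rax, so no
// separate zero-extension instruction is needed for an i32 -> i64 zext.)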
24910 bool X86TargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
24911 // x86-64 implicitly zero-extends 32-bit results in 64-bit registers.
24912 return VT1 == MVT::i32 && VT2 == MVT::i64 && Subtarget.is64Bit();
24915 bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
24916 EVT VT1 = Val.getValueType();
24917 if (isZExtFree(VT1, VT2))
24920 if (Val.getOpcode() != ISD::LOAD)
24923 if (!VT1.isSimple() || !VT1.isInteger() ||
24924 !VT2.isSimple() || !VT2.isInteger())
24927 switch (VT1.getSimpleVT().SimpleTy) {
24932 // X86 has 8, 16, and 32-bit zero-extending loads.
24939 bool X86TargetLowering::isVectorLoadExtDesirable(SDValue) const { return true; }
24942 X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
24943 if (!Subtarget.hasAnyFMA())
24946 VT = VT.getScalarType();
24948 if (!VT.isSimple())
24951 switch (VT.getSimpleVT().SimpleTy) {
24962 bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
24963 // i16 instructions are longer (0x66 prefix) and potentially slower.
24964 return !(VT1 == MVT::i32 && VT2 == MVT::i16);
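// (For example, addw $1, %ax (66 83 C0 01) is one byte longer than
// addl $1, %eax (83 C0 01), and 16-bit forms with 16-bit immediates can hit
// length-changing-prefix stalls on some microarchitectures.)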
24967 /// Targets can use this to indicate that they only support *some*
24968 /// VECTOR_SHUFFLE operations, those with specific masks.
24969 /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
24970 /// are assumed to be legal.
24972 X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
24974 if (!VT.isSimple())
24977 // Not for i1 vectors
24978 if (VT.getSimpleVT().getScalarType() == MVT::i1)
24981 // Very little shuffling can be done for 64-bit vectors right now.
24982 if (VT.getSimpleVT().getSizeInBits() == 64)
24985 // We only care that the types being shuffled are legal. The lowering can
24986 // handle any possible shuffle mask that results.
24987 return isTypeLegal(VT.getSimpleVT());
24991 X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
24993 // Just delegate to the generic legality, clear masks aren't special.
24994 return isShuffleMaskLegal(Mask, VT);
24997 //===----------------------------------------------------------------------===//
24998 // X86 Scheduler Hooks
24999 //===----------------------------------------------------------------------===//
25001 /// Utility function to emit xbegin specifying the start of an RTM region.
25002 static MachineBasicBlock *emitXBegin(MachineInstr &MI, MachineBasicBlock *MBB,
25003 const TargetInstrInfo *TII) {
25004 DebugLoc DL = MI.getDebugLoc();
25006 const BasicBlock *BB = MBB->getBasicBlock();
25007 MachineFunction::iterator I = ++MBB->getIterator();
25009 // For the v = xbegin(), we generate a diamond: mainMBB sets s0 = -1,
25018 // fallMBB materializes eax = # XABORT_DEF and copies it into s1, and
25022 // sinkMBB merges the results with v = phi(s0/mainMBB, s1/fallMBB).
25024 MachineBasicBlock *thisMBB = MBB;
25025 MachineFunction *MF = MBB->getParent();
25026 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
25027 MachineBasicBlock *fallMBB = MF->CreateMachineBasicBlock(BB);
25028 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
25029 MF->insert(I, mainMBB);
25030 MF->insert(I, fallMBB);
25031 MF->insert(I, sinkMBB);
25033 // Transfer the remainder of BB and its successor edges to sinkMBB.
25034 sinkMBB->splice(sinkMBB->begin(), MBB,
25035 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25036 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
25038 MachineRegisterInfo &MRI = MF->getRegInfo();
25039 unsigned DstReg = MI.getOperand(0).getReg();
25040 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
25041 unsigned mainDstReg = MRI.createVirtualRegister(RC);
25042 unsigned fallDstReg = MRI.createVirtualRegister(RC);
25046 // # fallthrough to mainMBB
25047 // # on abort, branch to fallMBB
25048 BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(fallMBB);
25049 thisMBB->addSuccessor(mainMBB);
25050 thisMBB->addSuccessor(fallMBB);
25053 // mainDstReg := -1
25054 BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), mainDstReg).addImm(-1);
25055 BuildMI(mainMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
25056 mainMBB->addSuccessor(sinkMBB);
25059 // ; pseudo instruction to model hardware's definition from XABORT
25060 // EAX := XABORT_DEF
25061 // fallDstReg := EAX
25062 BuildMI(fallMBB, DL, TII->get(X86::XABORT_DEF));
25063 BuildMI(fallMBB, DL, TII->get(TargetOpcode::COPY), fallDstReg)
25065 fallMBB->addSuccessor(sinkMBB);
25068 // DstReg := phi(mainDstReg/mainBB, fallDstReg/fallBB)
25069 BuildMI(*sinkMBB, sinkMBB->begin(), DL, TII->get(X86::PHI), DstReg)
25070 .addReg(mainDstReg).addMBB(mainMBB)
25071 .addReg(fallDstReg).addMBB(fallMBB);
25073 MI.eraseFromParent();
25077 // FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
25078 // or XMM0_V32I8 in AVX all of this code can be replaced with that
25079 // in the .td file.
25080 static MachineBasicBlock *emitPCMPSTRM(MachineInstr &MI, MachineBasicBlock *BB,
25081 const TargetInstrInfo *TII) {
25083 switch (MI.getOpcode()) {
25084 default: llvm_unreachable("illegal opcode!");
25085 case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
25086 case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
25087 case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
25088 case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
25089 case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
25090 case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
25091 case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
25092 case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
25095 DebugLoc dl = MI.getDebugLoc();
25096 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25098 unsigned NumArgs = MI.getNumOperands();
25099 for (unsigned i = 1; i < NumArgs; ++i) {
25100 MachineOperand &Op = MI.getOperand(i);
25101 if (!(Op.isReg() && Op.isImplicit()))
25104 if (MI.hasOneMemOperand())
25105 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25107 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25108 .addReg(X86::XMM0);
25110 MI.eraseFromParent();
25114 // FIXME: Custom handling because TableGen doesn't support multiple implicit
25115 // defs in an instruction pattern
25116 static MachineBasicBlock *emitPCMPSTRI(MachineInstr &MI, MachineBasicBlock *BB,
25117 const TargetInstrInfo *TII) {
25119 switch (MI.getOpcode()) {
25120 default: llvm_unreachable("illegal opcode!");
25121 case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
25122 case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
25123 case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
25124 case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
25125 case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
25126 case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
25127 case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
25128 case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
25131 DebugLoc dl = MI.getDebugLoc();
25132 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
25134 unsigned NumArgs = MI.getNumOperands(); // remove the results
25135 for (unsigned i = 1; i < NumArgs; ++i) {
25136 MachineOperand &Op = MI.getOperand(i);
25137 if (!(Op.isReg() && Op.isImplicit()))
25140 if (MI.hasOneMemOperand())
25141 MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
25143 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25146 MI.eraseFromParent();
25150 static MachineBasicBlock *emitWRPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25151 const X86Subtarget &Subtarget) {
25152 DebugLoc dl = MI.getDebugLoc();
25153 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25155 // insert input VAL into EAX
25156 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EAX)
25157 .addReg(MI.getOperand(0).getReg());
25158 // insert zero to ECX
25159 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25161 // insert zero to EDX
25162 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::EDX);
25164 // insert WRPKRU instruction
25165 BuildMI(*BB, MI, dl, TII->get(X86::WRPKRUr));
25167 MI.eraseFromParent(); // The pseudo is gone now.
25171 static MachineBasicBlock *emitRDPKRU(MachineInstr &MI, MachineBasicBlock *BB,
25172 const X86Subtarget &Subtarget) {
25173 DebugLoc dl = MI.getDebugLoc();
25174 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25176 // insert zero to ECX
25177 BuildMI(*BB, MI, dl, TII->get(X86::MOV32r0), X86::ECX);
25179 // insert RDPKRU instruction
25180 BuildMI(*BB, MI, dl, TII->get(X86::RDPKRUr));
25181 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
25184 MI.eraseFromParent(); // The pseudo is gone now.
25188 static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
25189 const X86Subtarget &Subtarget,
25191 DebugLoc dl = MI.getDebugLoc();
25192 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25193 // Address into RAX/EAX, other two args into ECX, EDX.
25194 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25195 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25196 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25197 for (int i = 0; i < X86::AddrNumOperands; ++i)
25198 MIB.add(MI.getOperand(i));
25200 unsigned ValOps = X86::AddrNumOperands;
25201 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::ECX)
25202 .addReg(MI.getOperand(ValOps).getReg());
25203 BuildMI(*BB, MI, dl, TII->get(TargetOpcode::COPY), X86::EDX)
25204 .addReg(MI.getOperand(ValOps + 1).getReg());
25206 // The instruction doesn't actually take any operands though.
25207 BuildMI(*BB, MI, dl, TII->get(Opc));
25209 MI.eraseFromParent(); // The pseudo is gone now.
25213 static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
25214 const X86Subtarget &Subtarget) {
25215 DebugLoc dl = MI->getDebugLoc();
25216 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25217 // Address into RAX/EAX
25218 unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
25219 unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
25220 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
25221 for (int i = 0; i < X86::AddrNumOperands; ++i)
25222 MIB.add(MI->getOperand(i));
25224 // The instruction doesn't actually take any operands though.
25225 BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
25227 MI->eraseFromParent(); // The pseudo is gone now.
25233 MachineBasicBlock *
25234 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
25235 MachineBasicBlock *MBB) const {
25236 // Emit va_arg instruction on X86-64.
25238 // Operands to this pseudo-instruction:
25239 // 0 ) Output : destination address (reg)
25240 // 1-5) Input : va_list address (addr, i64mem)
25241 // 6 ) ArgSize : Size (in bytes) of vararg type
25242 // 7 ) ArgMode : 0=overflow only, 1=use gp_offset, 2=use fp_offset
25243 // 8 ) Align : Alignment of type
25244 // 9 ) EFLAGS (implicit-def)
25246 assert(MI.getNumOperands() == 10 && "VAARG_64 should have 10 operands!");
25247 static_assert(X86::AddrNumOperands == 5,
25248 "VAARG_64 assumes 5 address operands");
25250 unsigned DestReg = MI.getOperand(0).getReg();
25251 MachineOperand &Base = MI.getOperand(1);
25252 MachineOperand &Scale = MI.getOperand(2);
25253 MachineOperand &Index = MI.getOperand(3);
25254 MachineOperand &Disp = MI.getOperand(4);
25255 MachineOperand &Segment = MI.getOperand(5);
25256 unsigned ArgSize = MI.getOperand(6).getImm();
25257 unsigned ArgMode = MI.getOperand(7).getImm();
25258 unsigned Align = MI.getOperand(8).getImm();
25260 // Memory Reference
25261 assert(MI.hasOneMemOperand() && "Expected VAARG_64 to have one memoperand");
25262 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
25263 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
25265 // Machine Information
25266 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25267 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
25268 const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64);
25269 const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32);
25270 DebugLoc DL = MI.getDebugLoc();
25272 // struct va_list {
//   i32 gp_offset
//   i32 fp_offset
25275 // i64 overflow_area (address)
25276 // i64 reg_save_area (address)
25278 // sizeof(va_list) = 24
25279 // alignment(va_list) = 8
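// Per the System V x86-64 ABI, gp_offset is at byte offset 0, fp_offset at 4,
// overflow_arg_area at 8 and reg_save_area at 16; that is why the offset
// loads/stores below address the va_list with displacement 0 or 4.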
25281 unsigned TotalNumIntRegs = 6;
25282 unsigned TotalNumXMMRegs = 8;
25283 bool UseGPOffset = (ArgMode == 1);
25284 bool UseFPOffset = (ArgMode == 2);
25285 unsigned MaxOffset = TotalNumIntRegs * 8 +
25286 (UseFPOffset ? TotalNumXMMRegs * 16 : 0);
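// (That is, 6 * 8 = 48 bytes of integer registers, plus 8 * 16 = 128 bytes of
// XMM registers when fp_offset is in use, for a MaxOffset of 176.)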
25288 // Align ArgSize to a multiple of 8.
25289 unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
25290 bool NeedsAlign = (Align > 8);
25292 MachineBasicBlock *thisMBB = MBB;
25293 MachineBasicBlock *overflowMBB;
25294 MachineBasicBlock *offsetMBB;
25295 MachineBasicBlock *endMBB;
25297 unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB
25298 unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB
25299 unsigned OffsetReg = 0;
25301 if (!UseGPOffset && !UseFPOffset) {
25302 // If we only pull from the overflow region, we don't create a branch.
25303 // We don't need to alter control flow.
25304 OffsetDestReg = 0; // unused
25305 OverflowDestReg = DestReg;
25307 offsetMBB = nullptr;
25308 overflowMBB = thisMBB;
25311 // First emit code to check if gp_offset (or fp_offset) is below the bound.
25312 // If so, pull the argument from reg_save_area. (branch to offsetMBB)
25313 // If not, pull from overflow_area. (branch to overflowMBB)
25318 // thisMBB branches to either offsetMBB or overflowMBB; both then fall into endMBB.
25323 // Registers for the PHI in endMBB
25324 OffsetDestReg = MRI.createVirtualRegister(AddrRegClass);
25325 OverflowDestReg = MRI.createVirtualRegister(AddrRegClass);
25327 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25328 MachineFunction *MF = MBB->getParent();
25329 overflowMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25330 offsetMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25331 endMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25333 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25335 // Insert the new basic blocks
25336 MF->insert(MBBIter, offsetMBB);
25337 MF->insert(MBBIter, overflowMBB);
25338 MF->insert(MBBIter, endMBB);
25340 // Transfer the remainder of MBB and its successor edges to endMBB.
25341 endMBB->splice(endMBB->begin(), thisMBB,
25342 std::next(MachineBasicBlock::iterator(MI)), thisMBB->end());
25343 endMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
25345 // Make offsetMBB and overflowMBB successors of thisMBB
25346 thisMBB->addSuccessor(offsetMBB);
25347 thisMBB->addSuccessor(overflowMBB);
25349 // endMBB is a successor of both offsetMBB and overflowMBB
25350 offsetMBB->addSuccessor(endMBB);
25351 overflowMBB->addSuccessor(endMBB);
25353 // Load the offset value into a register
25354 OffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25355 BuildMI(thisMBB, DL, TII->get(X86::MOV32rm), OffsetReg)
25359 .addDisp(Disp, UseFPOffset ? 4 : 0)
25361 .setMemRefs(MMOBegin, MMOEnd);
25363 // Check if there is enough room left to pull this argument.
25364 BuildMI(thisMBB, DL, TII->get(X86::CMP32ri))
25366 .addImm(MaxOffset + 8 - ArgSizeA8);
25368 // Branch to "overflowMBB" if offset >= max
25369 // Fall through to "offsetMBB" otherwise
25370 BuildMI(thisMBB, DL, TII->get(X86::GetCondBranchFromCond(X86::COND_AE)))
25371 .addMBB(overflowMBB);
25374 // In offsetMBB, emit code to use the reg_save_area.
25376 assert(OffsetReg != 0);
25378 // Read the reg_save_area address.
25379 unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass);
25380 BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg)
25386 .setMemRefs(MMOBegin, MMOEnd);
25388 // Zero-extend the offset
25389 unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass);
25390 BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64)
25393 .addImm(X86::sub_32bit);
25395 // Add the offset to the reg_save_area to get the final address.
25396 BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg)
25397 .addReg(OffsetReg64)
25398 .addReg(RegSaveReg);
25400 // Compute the offset for the next argument
25401 unsigned NextOffsetReg = MRI.createVirtualRegister(OffsetRegClass);
25402 BuildMI(offsetMBB, DL, TII->get(X86::ADD32ri), NextOffsetReg)
25404 .addImm(UseFPOffset ? 16 : 8);
25406 // Store it back into the va_list.
25407 BuildMI(offsetMBB, DL, TII->get(X86::MOV32mr))
25411 .addDisp(Disp, UseFPOffset ? 4 : 0)
25413 .addReg(NextOffsetReg)
25414 .setMemRefs(MMOBegin, MMOEnd);
25417 BuildMI(offsetMBB, DL, TII->get(X86::JMP_1))
25422 // Emit code to use overflow area
25425 // Load the overflow_area address into a register.
25426 unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass);
25427 BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg)
25433 .setMemRefs(MMOBegin, MMOEnd);
25435 // If we need to align it, do so. Otherwise, just copy the address
25436 // to OverflowDestReg.
25438 // Align the overflow address
25439 assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
25440 unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass);
25442 // aligned_addr = (addr + (align-1)) & ~(align-1)
25443 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
25444 .addReg(OverflowAddrReg)
25447 BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
25449 .addImm(~(uint64_t)(Align-1));
25451 BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
25452 .addReg(OverflowAddrReg);
25455 // Compute the next overflow address after this argument.
25456 // (the overflow address should be kept 8-byte aligned)
25457 unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass);
25458 BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg)
25459 .addReg(OverflowDestReg)
25460 .addImm(ArgSizeA8);
25462 // Store the new overflow address.
25463 BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr))
25469 .addReg(NextAddrReg)
25470 .setMemRefs(MMOBegin, MMOEnd);
25472 // If we branched, emit the PHI to the front of endMBB.
25474 BuildMI(*endMBB, endMBB->begin(), DL,
25475 TII->get(X86::PHI), DestReg)
25476 .addReg(OffsetDestReg).addMBB(offsetMBB)
25477 .addReg(OverflowDestReg).addMBB(overflowMBB);
25480 // Erase the pseudo instruction
25481 MI.eraseFromParent();
25486 MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
25487 MachineInstr &MI, MachineBasicBlock *MBB) const {
25488 // Emit code to save XMM registers to the stack. The ABI says that the
25489 // number of registers to save is given in %al, so it's theoretically
25490 // possible to do an indirect jump trick to avoid saving all of them;
25491 // however, this code takes a simpler approach and just executes all
25492 // of the stores if %al is non-zero. It's less code, and it's probably
25493 // easier on the hardware branch predictor, and stores aren't all that
25494 // expensive anyway.
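// (Under the System V x86-64 ABI, %al carries an upper bound on the number of
// vector registers used by a varargs call; e.g. printf("%f", x) is called with
// %al = 1, while printf("%d", n) may pass %al = 0 and skip these stores.)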
25496 // Create the new basic blocks. One block contains all the XMM stores,
25497 // and one block is the final destination regardless of whether any
25498 // stores were performed.
25499 const BasicBlock *LLVM_BB = MBB->getBasicBlock();
25500 MachineFunction *F = MBB->getParent();
25501 MachineFunction::iterator MBBIter = ++MBB->getIterator();
25502 MachineBasicBlock *XMMSaveMBB = F->CreateMachineBasicBlock(LLVM_BB);
25503 MachineBasicBlock *EndMBB = F->CreateMachineBasicBlock(LLVM_BB);
25504 F->insert(MBBIter, XMMSaveMBB);
25505 F->insert(MBBIter, EndMBB);
25507 // Transfer the remainder of MBB and its successor edges to EndMBB.
25508 EndMBB->splice(EndMBB->begin(), MBB,
25509 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
25510 EndMBB->transferSuccessorsAndUpdatePHIs(MBB);
25512 // The original block will now fall through to the XMM save block.
25513 MBB->addSuccessor(XMMSaveMBB);
25514 // The XMMSaveMBB will fall through to the end block.
25515 XMMSaveMBB->addSuccessor(EndMBB);
25517 // Now add the instructions.
25518 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25519 DebugLoc DL = MI.getDebugLoc();
25521 unsigned CountReg = MI.getOperand(0).getReg();
25522 int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
25523 int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
25525 if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
25526 // If %al is 0, branch around the XMM save block.
25527 BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
25528 BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
25529 MBB->addSuccessor(EndMBB);
25532 // Make sure the last operand is EFLAGS, which gets clobbered by the branch
25533 // that was just emitted, but clearly shouldn't be "saved".
25534 assert((MI.getNumOperands() <= 3 ||
25535 !MI.getOperand(MI.getNumOperands() - 1).isReg() ||
25536 MI.getOperand(MI.getNumOperands() - 1).getReg() == X86::EFLAGS) &&
25537 "Expected last argument to be EFLAGS");
25538 unsigned MOVOpc = Subtarget.hasFp256() ? X86::VMOVAPSmr : X86::MOVAPSmr;
25539 // In the XMM save block, save all the XMM argument registers.
25540 for (int i = 3, e = MI.getNumOperands() - 1; i != e; ++i) {
25541 int64_t Offset = (i - 3) * 16 + VarArgsFPOffset;
25542 MachineMemOperand *MMO = F->getMachineMemOperand(
25543 MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
25544 MachineMemOperand::MOStore,
25545 /*Size=*/16, /*Align=*/16);
25546 BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
25547 .addFrameIndex(RegSaveFrameIndex)
25548 .addImm(/*Scale=*/1)
25549 .addReg(/*IndexReg=*/0)
25550 .addImm(/*Disp=*/Offset)
25551 .addReg(/*Segment=*/0)
25552 .addReg(MI.getOperand(i).getReg())
25553 .addMemOperand(MMO);
25556 MI.eraseFromParent(); // The pseudo instruction is gone now.
25561 // The EFLAGS operand of SelectItr might be missing a kill marker
25562 // because there were multiple uses of EFLAGS, and ISel didn't know
25563 // which to mark. Figure out whether SelectItr should have had a
25564 // kill marker, and set it if it should. Returns the correct kill marker value.
25566 static bool checkAndUpdateEFLAGSKill(MachineBasicBlock::iterator SelectItr,
25567 MachineBasicBlock* BB,
25568 const TargetRegisterInfo* TRI) {
25569 // Scan forward through BB for a use/def of EFLAGS.
25570 MachineBasicBlock::iterator miI(std::next(SelectItr));
25571 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
25572 const MachineInstr& mi = *miI;
25573 if (mi.readsRegister(X86::EFLAGS))
25575 if (mi.definesRegister(X86::EFLAGS))
25576 break; // Should have kill-flag - update below.
25579 // If we hit the end of the block, check whether EFLAGS is live into a
25581 if (miI == BB->end()) {
25582 for (MachineBasicBlock::succ_iterator sItr = BB->succ_begin(),
25583 sEnd = BB->succ_end();
25584 sItr != sEnd; ++sItr) {
25585 MachineBasicBlock* succ = *sItr;
25586 if (succ->isLiveIn(X86::EFLAGS))
25591 // We found a def, or hit the end of the basic block and EFLAGS wasn't live
25592 // out. SelectMI should have a kill flag on EFLAGS.
25593 SelectItr->addRegisterKilled(X86::EFLAGS, TRI);
25597 // Return true if it is OK for this CMOV pseudo-opcode to be cascaded
25598 // together with other CMOV pseudo-opcodes into a single basic-block with
25599 // a conditional jump around it.
25600 static bool isCMOVPseudo(MachineInstr &MI) {
25601 switch (MI.getOpcode()) {
25602 case X86::CMOV_FR32:
25603 case X86::CMOV_FR64:
25604 case X86::CMOV_GR8:
25605 case X86::CMOV_GR16:
25606 case X86::CMOV_GR32:
25607 case X86::CMOV_RFP32:
25608 case X86::CMOV_RFP64:
25609 case X86::CMOV_RFP80:
25610 case X86::CMOV_V2F64:
25611 case X86::CMOV_V2I64:
25612 case X86::CMOV_V4F32:
25613 case X86::CMOV_V4F64:
25614 case X86::CMOV_V4I64:
25615 case X86::CMOV_V16F32:
25616 case X86::CMOV_V8F32:
25617 case X86::CMOV_V8F64:
25618 case X86::CMOV_V8I64:
25619 case X86::CMOV_V8I1:
25620 case X86::CMOV_V16I1:
25621 case X86::CMOV_V32I1:
25622 case X86::CMOV_V64I1:
25630 MachineBasicBlock *
25631 X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
25632 MachineBasicBlock *BB) const {
25633 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25634 DebugLoc DL = MI.getDebugLoc();
25636 // To "insert" a SELECT_CC instruction, we actually have to insert the
25637 // diamond control-flow pattern. The incoming instruction knows the
25638 // destination vreg to set, the condition code register to branch on, the
25639 // true/false values to select between, and a branch opcode to use.
25640 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25641 MachineFunction::iterator It = ++BB->getIterator();
25646 // cmpTY ccX, r1, r2
25648 // fallthrough --> copy0MBB
25649 MachineBasicBlock *thisMBB = BB;
25650 MachineFunction *F = BB->getParent();
25652 // This code lowers all pseudo-CMOV instructions. Generally it lowers these
25653 // as described above, by inserting a BB, and then making a PHI at the join
25654 // point to select the true and false operands of the CMOV in the PHI.
25656 // The code also handles two different cases of multiple CMOV opcodes
25660 // In this case, there are multiple CMOVs in a row, all of which are based on
25661 // the same condition setting (or the exact opposite condition setting).
25662 // In this case we can lower all the CMOVs using a single inserted BB, and
25663 // then make a number of PHIs at the join point to model the CMOVs. The only
25664 // trickiness here is that in a case like:
25666 // t2 = CMOV cond1 t1, f1
25667 // t3 = CMOV cond1 t2, f2
25669 // when rewriting this into PHIs, we have to perform some renaming on the
25670 // temps since you cannot have a PHI operand refer to a PHI result earlier
25671 // in the same block. The "simple" but wrong lowering would be:
25673 // t2 = PHI t1(BB1), f1(BB2)
25674 // t3 = PHI t2(BB1), f2(BB2)
25676 // but clearly t2 is not defined in BB1, so that is incorrect. The proper
25677 // renaming is to note that on the path through BB1, t2 is really just a
25678 // copy of t1, and do that renaming, properly generating:
25680 // t2 = PHI t1(BB1), f1(BB2)
25681 // t3 = PHI t1(BB1), f2(BB2)
25683 // Case 2, we lower cascaded CMOVs such as
25685 // (CMOV (CMOV F, T, cc1), T, cc2)
25687 // to two successive branches. For that, we look for another CMOV as the
25688 // following instruction.
25690 // Without this, we would add a PHI between the two jumps, which ends up
25691 // creating a few copies all around. For instance, for
25693 // (sitofp (zext (fcmp une)))
25695 // we would generate:
25697 // ucomiss %xmm1, %xmm0
25698 // movss <1.0f>, %xmm0
25699 // movaps %xmm0, %xmm1
25701 // xorps %xmm1, %xmm1
25704 // movaps %xmm1, %xmm0
25708 // because this custom-inserter would have generated:
25720 // A: X = ...; Y = ...
25722 // C: Z = PHI [X, A], [Y, B]
25724 // E: PHI [X, C], [Z, D]
25726 // If we lower both CMOVs in a single step, we can instead generate:
25738 // A: X = ...; Y = ...
25740 // E: PHI [X, A], [X, C], [Y, D]
25742 // Which, in our sitofp/fcmp example, gives us something like:
25744 // ucomiss %xmm1, %xmm0
25745 // movss <1.0f>, %xmm0
25748 // xorps %xmm0, %xmm0
25752 MachineInstr *CascadedCMOV = nullptr;
25753 MachineInstr *LastCMOV = &MI;
25754 X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
25755 X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
25756 MachineBasicBlock::iterator NextMIIt =
25757 std::next(MachineBasicBlock::iterator(MI));
25759 // Check for case 1, where there are multiple CMOVs with the same condition
25760 // first. Of the two cases of multiple CMOV lowerings, case 1 reduces the
25761 // number of jumps the most.
25763 if (isCMOVPseudo(MI)) {
25764 // See if we have a string of CMOVS with the same condition.
25765 while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
25766 (NextMIIt->getOperand(3).getImm() == CC ||
25767 NextMIIt->getOperand(3).getImm() == OppCC)) {
25768 LastCMOV = &*NextMIIt;
25773 // This checks for case 2, but only do this if we didn't already find
25774 // case 1, as indicated by LastCMOV == MI.
25775 if (LastCMOV == &MI && NextMIIt != BB->end() &&
25776 NextMIIt->getOpcode() == MI.getOpcode() &&
25777 NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
25778 NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
25779 NextMIIt->getOperand(1).isKill()) {
25780 CascadedCMOV = &*NextMIIt;
25783 MachineBasicBlock *jcc1MBB = nullptr;
25785 // If we have a cascaded CMOV, we lower it to two successive branches to
25786 // the same block. EFLAGS is used by both, so mark it as live in the second.
25787 if (CascadedCMOV) {
25788 jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
25789 F->insert(It, jcc1MBB);
25790 jcc1MBB->addLiveIn(X86::EFLAGS);
25793 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
25794 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
25795 F->insert(It, copy0MBB);
25796 F->insert(It, sinkMBB);
25798 // If the EFLAGS register isn't dead in the terminator, then claim that it's
25799 // live into the sink and copy blocks.
25800 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
25802 MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
25803 if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
25804 !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
25805 copy0MBB->addLiveIn(X86::EFLAGS);
25806 sinkMBB->addLiveIn(X86::EFLAGS);
25809 // Transfer the remainder of BB and its successor edges to sinkMBB.
25810 sinkMBB->splice(sinkMBB->begin(), BB,
25811 std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
25812 sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
25814 // Add the true and fallthrough blocks as its successors.
25815 if (CascadedCMOV) {
25816 // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
25817 BB->addSuccessor(jcc1MBB);
25819 // In that case, jcc1MBB will itself either fall through to copy0MBB or
25820 // jump to the sinkMBB.
25821 jcc1MBB->addSuccessor(copy0MBB);
25822 jcc1MBB->addSuccessor(sinkMBB);
25824 BB->addSuccessor(copy0MBB);
25827 // The true block target of the first (or only) branch is always sinkMBB.
25828 BB->addSuccessor(sinkMBB);
25830 // Create the conditional branch instruction.
25831 unsigned Opc = X86::GetCondBranchFromCond(CC);
25832 BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
25834 if (CascadedCMOV) {
25835 unsigned Opc2 = X86::GetCondBranchFromCond(
25836 (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
25837 BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
25841 // %FalseValue = ...
25842 // # fallthrough to sinkMBB
25843 copy0MBB->addSuccessor(sinkMBB);
25846 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
25848 MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
25849 MachineBasicBlock::iterator MIItEnd =
25850 std::next(MachineBasicBlock::iterator(LastCMOV));
25851 MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
25852 DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
25853 MachineInstrBuilder MIB;
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
// That also means that PHI construction must work forward from earlier to
// later, and that the code must maintain a mapping from each earlier PHI's
// destination register to the pair of registers that went into that PHI.
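// A hedged illustration (register names and the condition are made up, and
// the pseudo operands are written loosely): for a group of two CMOVs with
// the same condition,
//   %t0 = CMOV_GR32 %a, %b, <cc>
//   %t1 = CMOV_GR32 %t0, %c, <cc>
// the loop below emits, in sinkMBB,
//   %t0 = PHI [ %a, copy0MBB ], [ %b, thisMBB ]
//   %t1 = PHI [ %a, copy0MBB ], [ %c, thisMBB ]
// i.e. the second PHI uses %a, the false input recorded for %t0 in the
// rewrite table, rather than %t0 itself, because %t0 is only defined here
// in sinkMBB.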
25862 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
25863 unsigned DestReg = MIIt->getOperand(0).getReg();
25864 unsigned Op1Reg = MIIt->getOperand(1).getReg();
25865 unsigned Op2Reg = MIIt->getOperand(2).getReg();
25867 // If this CMOV we are generating is the opposite condition from
25868 // the jump we generated, then we have to swap the operands for the
25869 // PHI that is going to be generated.
25870 if (MIIt->getOperand(3).getImm() == OppCC)
25871 std::swap(Op1Reg, Op2Reg);
25873 if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
25874 Op1Reg = RegRewriteTable[Op1Reg].first;
25876 if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
25877 Op2Reg = RegRewriteTable[Op2Reg].second;
25879 MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
25880 TII->get(X86::PHI), DestReg)
25881 .addReg(Op1Reg).addMBB(copy0MBB)
25882 .addReg(Op2Reg).addMBB(thisMBB);
25884 // Add this PHI to the rewrite table.
25885 RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
25888 // If we have a cascaded CMOV, the second Jcc provides the same incoming
25889 // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
25890 if (CascadedCMOV) {
25891 MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
25892 // Copy the PHI result to the register defined by the second CMOV.
25893 BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
25894 DL, TII->get(TargetOpcode::COPY),
25895 CascadedCMOV->getOperand(0).getReg())
25896 .addReg(MI.getOperand(0).getReg());
25897 CascadedCMOV->eraseFromParent();
25900 // Now remove the CMOV(s).
25901 for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
25902 (MIIt++)->eraseFromParent();
25907 MachineBasicBlock *
25908 X86TargetLowering::EmitLoweredAtomicFP(MachineInstr &MI,
25909 MachineBasicBlock *BB) const {
// Combine the following atomic floating-point modification pattern:
//   a.store(reg OP a.load(acquire), release)
// Transform it into:
//   OPss (%gpr), %xmm
//   movss %xmm, (%gpr)
// or the 'sd' equivalents for 64-bit operations.
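// As a rough illustration, IR along the lines of
//   %old = load atomic float, float* %p acquire, align 4
//   %new = fadd float %old, %val
//   store atomic float %new, float* %p release, align 4
// has been matched into a RELEASE_FADD32mr pseudo earlier, and this
// inserter expands it into the two instructions sketched above.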
25917 switch (MI.getOpcode()) {
25918 default: llvm_unreachable("unexpected instr type for EmitLoweredAtomicFP");
25919 case X86::RELEASE_FADD32mr:
25920 FOp = X86::ADDSSrm;
25921 MOp = X86::MOVSSmr;
25923 case X86::RELEASE_FADD64mr:
25924 FOp = X86::ADDSDrm;
25925 MOp = X86::MOVSDmr;
25928 const X86InstrInfo *TII = Subtarget.getInstrInfo();
25929 DebugLoc DL = MI.getDebugLoc();
25930 MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
25931 unsigned ValOpIdx = X86::AddrNumOperands;
25932 unsigned VSrc = MI.getOperand(ValOpIdx).getReg();
25933 MachineInstrBuilder MIB =
25934 BuildMI(*BB, MI, DL, TII->get(FOp),
25935 MRI.createVirtualRegister(MRI.getRegClass(VSrc)))
25937 for (int i = 0; i < X86::AddrNumOperands; ++i) {
25938 MachineOperand &Operand = MI.getOperand(i);
25939 // Clear any kill flags on register operands as we'll create a second
25940 // instruction using the same address operands.
25941 if (Operand.isReg())
25942 Operand.setIsKill(false);
25945 MachineInstr *FOpMI = MIB;
25946 MIB = BuildMI(*BB, MI, DL, TII->get(MOp));
25947 for (int i = 0; i < X86::AddrNumOperands; ++i)
25948 MIB.add(MI.getOperand(i));
25949 MIB.addReg(FOpMI->getOperand(0).getReg(), RegState::Kill);
25950 MI.eraseFromParent(); // The pseudo instruction is gone now.
25954 MachineBasicBlock *
25955 X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
25956 MachineBasicBlock *BB) const {
25957 MachineFunction *MF = BB->getParent();
25958 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
25959 DebugLoc DL = MI.getDebugLoc();
25960 const BasicBlock *LLVM_BB = BB->getBasicBlock();
25962 assert(MF->shouldSplitStack());
25964 const bool Is64Bit = Subtarget.is64Bit();
25965 const bool IsLP64 = Subtarget.isTarget64BitLP64();
25967 const unsigned TlsReg = Is64Bit ? X86::FS : X86::GS;
25968 const unsigned TlsOffset = IsLP64 ? 0x70 : Is64Bit ? 0x40 : 0x30;
// BB:
//   ... [Till the alloca]
//   If stacklet is not large enough, jump to mallocMBB
//
// bumpMBB:
//   Allocate by subtracting from RSP
//   Jump to continueMBB
//
// mallocMBB:
//   Allocate by call to runtime
//
// continueMBB:
//   ...
//   [rest of original BB]
25986 MachineBasicBlock *mallocMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25987 MachineBasicBlock *bumpMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25988 MachineBasicBlock *continueMBB = MF->CreateMachineBasicBlock(LLVM_BB);
25990 MachineRegisterInfo &MRI = MF->getRegInfo();
25991 const TargetRegisterClass *AddrRegClass =
25992 getRegClassFor(getPointerTy(MF->getDataLayout()));
25994 unsigned mallocPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25995 bumpSPPtrVReg = MRI.createVirtualRegister(AddrRegClass),
25996 tmpSPVReg = MRI.createVirtualRegister(AddrRegClass),
25997 SPLimitVReg = MRI.createVirtualRegister(AddrRegClass),
25998 sizeVReg = MI.getOperand(1).getReg(),
26000 IsLP64 || Subtarget.isTargetNaCl64() ? X86::RSP : X86::ESP;
26002 MachineFunction::iterator MBBIter = ++BB->getIterator();
26004 MF->insert(MBBIter, bumpMBB);
26005 MF->insert(MBBIter, mallocMBB);
26006 MF->insert(MBBIter, continueMBB);
26008 continueMBB->splice(continueMBB->begin(), BB,
26009 std::next(MachineBasicBlock::iterator(MI)), BB->end());
26010 continueMBB->transferSuccessorsAndUpdatePHIs(BB);
26012 // Add code to the main basic block to check if the stack limit has been hit,
// and if so, jump to mallocMBB, otherwise fall through to bumpMBB.
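// On LP64 targets the check built below comes out roughly as (registers
// are schematic):
//   movq %rsp, %tmp
//   subq %size, %tmp          # candidate new stack pointer
//   cmpq %tmp, %fs:0x70       # compare the stacklet limit against it
//   jg   mallocMBB            # limit above the candidate SP: not enough room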
26014 BuildMI(BB, DL, TII->get(TargetOpcode::COPY), tmpSPVReg).addReg(physSPReg);
26015 BuildMI(BB, DL, TII->get(IsLP64 ? X86::SUB64rr:X86::SUB32rr), SPLimitVReg)
26016 .addReg(tmpSPVReg).addReg(sizeVReg);
26017 BuildMI(BB, DL, TII->get(IsLP64 ? X86::CMP64mr:X86::CMP32mr))
26018 .addReg(0).addImm(1).addReg(0).addImm(TlsOffset).addReg(TlsReg)
26019 .addReg(SPLimitVReg);
26020 BuildMI(BB, DL, TII->get(X86::JG_1)).addMBB(mallocMBB);
26022 // bumpMBB simply decreases the stack pointer, since we know the current
26023 // stacklet has enough space.
26024 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), physSPReg)
26025 .addReg(SPLimitVReg);
26026 BuildMI(bumpMBB, DL, TII->get(TargetOpcode::COPY), bumpSPPtrVReg)
26027 .addReg(SPLimitVReg);
26028 BuildMI(bumpMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26030 // Calls into a routine in libgcc to allocate more space from the heap.
26031 const uint32_t *RegMask =
26032 Subtarget.getRegisterInfo()->getCallPreservedMask(*MF, CallingConv::C);
26034 BuildMI(mallocMBB, DL, TII->get(X86::MOV64rr), X86::RDI)
26036 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26037 .addExternalSymbol("__morestack_allocate_stack_space")
26038 .addRegMask(RegMask)
26039 .addReg(X86::RDI, RegState::Implicit)
26040 .addReg(X86::RAX, RegState::ImplicitDefine);
26041 } else if (Is64Bit) {
26042 BuildMI(mallocMBB, DL, TII->get(X86::MOV32rr), X86::EDI)
26044 BuildMI(mallocMBB, DL, TII->get(X86::CALL64pcrel32))
26045 .addExternalSymbol("__morestack_allocate_stack_space")
26046 .addRegMask(RegMask)
26047 .addReg(X86::EDI, RegState::Implicit)
26048 .addReg(X86::EAX, RegState::ImplicitDefine);
26050 BuildMI(mallocMBB, DL, TII->get(X86::SUB32ri), physSPReg).addReg(physSPReg)
26052 BuildMI(mallocMBB, DL, TII->get(X86::PUSH32r)).addReg(sizeVReg);
26053 BuildMI(mallocMBB, DL, TII->get(X86::CALLpcrel32))
26054 .addExternalSymbol("__morestack_allocate_stack_space")
26055 .addRegMask(RegMask)
26056 .addReg(X86::EAX, RegState::ImplicitDefine);
26060 BuildMI(mallocMBB, DL, TII->get(X86::ADD32ri), physSPReg).addReg(physSPReg)
26063 BuildMI(mallocMBB, DL, TII->get(TargetOpcode::COPY), mallocPtrVReg)
26064 .addReg(IsLP64 ? X86::RAX : X86::EAX);
26065 BuildMI(mallocMBB, DL, TII->get(X86::JMP_1)).addMBB(continueMBB);
26067 // Set up the CFG correctly.
26068 BB->addSuccessor(bumpMBB);
26069 BB->addSuccessor(mallocMBB);
26070 mallocMBB->addSuccessor(continueMBB);
26071 bumpMBB->addSuccessor(continueMBB);
26073 // Take care of the PHI nodes.
26074 BuildMI(*continueMBB, continueMBB->begin(), DL, TII->get(X86::PHI),
26075 MI.getOperand(0).getReg())
26076 .addReg(mallocPtrVReg)
26078 .addReg(bumpSPPtrVReg)
26081 // Delete the original pseudo instruction.
26082 MI.eraseFromParent();
26085 return continueMBB;
26088 MachineBasicBlock *
26089 X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
26090 MachineBasicBlock *BB) const {
26091 MachineFunction *MF = BB->getParent();
26092 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26093 MachineBasicBlock *TargetMBB = MI.getOperand(0).getMBB();
26094 DebugLoc DL = MI.getDebugLoc();
26096 assert(!isAsynchronousEHPersonality(
26097 classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
26098 "SEH does not use catchret!");
26100 // Only 32-bit EH needs to worry about manually restoring stack pointers.
26101 if (!Subtarget.is32Bit())
26104 // C++ EH creates a new target block to hold the restore code, and wires up
26105 // the new block to the return destination with a normal JMP_4.
26106 MachineBasicBlock *RestoreMBB =
26107 MF->CreateMachineBasicBlock(BB->getBasicBlock());
26108 assert(BB->succ_size() == 1);
26109 MF->insert(std::next(BB->getIterator()), RestoreMBB);
26110 RestoreMBB->transferSuccessorsAndUpdatePHIs(BB);
26111 BB->addSuccessor(RestoreMBB);
26112 MI.getOperand(0).setMBB(RestoreMBB);
26114 auto RestoreMBBI = RestoreMBB->begin();
26115 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
26116 BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
26120 MachineBasicBlock *
26121 X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
26122 MachineBasicBlock *BB) const {
26123 MachineFunction *MF = BB->getParent();
26124 const Constant *PerFn = MF->getFunction()->getPersonalityFn();
26125 bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
26126 // Only 32-bit SEH requires special handling for catchpad.
26127 if (IsSEH && Subtarget.is32Bit()) {
26128 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26129 DebugLoc DL = MI.getDebugLoc();
26130 BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
26132 MI.eraseFromParent();
26136 MachineBasicBlock *
26137 X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
26138 MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
//   adjust_stackdown -> TLSADDR -> adjust_stackup.
// We need this because TLSADDR is lowered into a call inside MC, so
// without the two markers shrink-wrapping may push the prologue/epilogue
// past them.
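// A minimal sketch of the result on x86-64 (operands elided):
//   ADJCALLSTACKDOWN64 ...
//   TLS_addr64 ...
//   ADJCALLSTACKUP64 ...
// so the frame-lowering passes see an explicit call sequence around the
// eventual TLS runtime call (e.g. __tls_get_addr).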
26144 const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
26145 DebugLoc DL = MI.getDebugLoc();
26146 MachineFunction &MF = *BB->getParent();
26148 // Emit CALLSEQ_START right before the instruction.
26149 unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
26150 MachineInstrBuilder CallseqStart =
26151 BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0);
26152 BB->insert(MachineBasicBlock::iterator(MI), CallseqStart);
26154 // Emit CALLSEQ_END right after the instruction.
26155 // We don't call erase from parent because we want to keep the
26156 // original instruction around.
26157 unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
26158 MachineInstrBuilder CallseqEnd =
26159 BuildMI(MF, DL, TII.get(AdjStackUp)).addImm(0).addImm(0);
26160 BB->insertAfter(MachineBasicBlock::iterator(MI), CallseqEnd);
26165 MachineBasicBlock *
26166 X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
26167 MachineBasicBlock *BB) const {
26168 // This is pretty easy. We're taking the value that we received from
26169 // our load from the relocation, sticking it in either RDI (x86-64)
26170 // or EAX and doing an indirect call. The return value will then
26171 // be in the normal return register.
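// For the 64-bit case below this amounts to roughly (symbol name is
// schematic):
//   movq _var@TLVP(%rip), %rdi
//   callq *(%rdi)
// with the result in the usual return register; the 32-bit variants differ
// only in how the TLVP address is formed.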
26172 MachineFunction *F = BB->getParent();
26173 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26174 DebugLoc DL = MI.getDebugLoc();
26176 assert(Subtarget.isTargetDarwin() && "Darwin only instr emitted?");
26177 assert(MI.getOperand(3).isGlobal() && "This should be a global");
26179 // Get a register mask for the lowered call.
26180 // FIXME: The 32-bit calls have non-standard calling conventions. Use a
26181 // proper register mask.
26182 const uint32_t *RegMask =
26183 Subtarget.is64Bit() ?
26184 Subtarget.getRegisterInfo()->getDarwinTLSCallPreservedMask() :
26185 Subtarget.getRegisterInfo()->getCallPreservedMask(*F, CallingConv::C);
26186 if (Subtarget.is64Bit()) {
26187 MachineInstrBuilder MIB =
26188 BuildMI(*BB, MI, DL, TII->get(X86::MOV64rm), X86::RDI)
26192 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26193 MI.getOperand(3).getTargetFlags())
26195 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL64m));
26196 addDirectMem(MIB, X86::RDI);
26197 MIB.addReg(X86::RAX, RegState::ImplicitDefine).addRegMask(RegMask);
26198 } else if (!isPositionIndependent()) {
26199 MachineInstrBuilder MIB =
26200 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26204 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26205 MI.getOperand(3).getTargetFlags())
26207 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26208 addDirectMem(MIB, X86::EAX);
26209 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26211 MachineInstrBuilder MIB =
26212 BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), X86::EAX)
26213 .addReg(TII->getGlobalBaseReg(F))
26216 .addGlobalAddress(MI.getOperand(3).getGlobal(), 0,
26217 MI.getOperand(3).getTargetFlags())
26219 MIB = BuildMI(*BB, MI, DL, TII->get(X86::CALL32m));
26220 addDirectMem(MIB, X86::EAX);
26221 MIB.addReg(X86::EAX, RegState::ImplicitDefine).addRegMask(RegMask);
26224 MI.eraseFromParent(); // The pseudo instruction is gone now.
26228 MachineBasicBlock *
26229 X86TargetLowering::emitEHSjLjSetJmp(MachineInstr &MI,
26230 MachineBasicBlock *MBB) const {
26231 DebugLoc DL = MI.getDebugLoc();
26232 MachineFunction *MF = MBB->getParent();
26233 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26234 const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
26235 MachineRegisterInfo &MRI = MF->getRegInfo();
26237 const BasicBlock *BB = MBB->getBasicBlock();
26238 MachineFunction::iterator I = ++MBB->getIterator();
26240 // Memory Reference
26241 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26242 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26245 unsigned MemOpndSlot = 0;
26247 unsigned CurOp = 0;
26249 DstReg = MI.getOperand(CurOp++).getReg();
26250 const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
26251 assert(TRI->isTypeLegalForClass(*RC, MVT::i32) && "Invalid destination!");
26253 unsigned mainDstReg = MRI.createVirtualRegister(RC);
26254 unsigned restoreDstReg = MRI.createVirtualRegister(RC);
26256 MemOpndSlot = CurOp;
26258 MVT PVT = getPointerTy(MF->getDataLayout());
26259 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26260 "Invalid Pointer Size!");
// For v = setjmp(buf), we generate
//
// thisMBB:
//   buf[LabelOffset] = restoreMBB <-- takes address of restoreMBB
//   SjLjSetup restoreMBB
//
// mainMBB:
//   v_main = 0
//
// sinkMBB:
//   v = phi(main, restore)
//
// restoreMBB:
//   if base pointer being used, load it from frame
//   v_restore = 1
26278 MachineBasicBlock *thisMBB = MBB;
26279 MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
26280 MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
26281 MachineBasicBlock *restoreMBB = MF->CreateMachineBasicBlock(BB);
26282 MF->insert(I, mainMBB);
26283 MF->insert(I, sinkMBB);
26284 MF->push_back(restoreMBB);
26285 restoreMBB->setHasAddressTaken();
26287 MachineInstrBuilder MIB;
26289 // Transfer the remainder of BB and its successor edges to sinkMBB.
26290 sinkMBB->splice(sinkMBB->begin(), MBB,
26291 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
26292 sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
26295 unsigned PtrStoreOpc = 0;
26296 unsigned LabelReg = 0;
26297 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26298 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26299 !isPositionIndependent();
26301 // Prepare IP either in reg or imm.
26302 if (!UseImmLabel) {
26303 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26304 const TargetRegisterClass *PtrRC = getRegClassFor(PVT);
26305 LabelReg = MRI.createVirtualRegister(PtrRC);
26306 if (Subtarget.is64Bit()) {
26307 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA64r), LabelReg)
26311 .addMBB(restoreMBB)
26314 const X86InstrInfo *XII = static_cast<const X86InstrInfo*>(TII);
26315 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::LEA32r), LabelReg)
26316 .addReg(XII->getGlobalBaseReg(MF))
26319 .addMBB(restoreMBB, Subtarget.classifyBlockAddressReference())
26323 PtrStoreOpc = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26325 MIB = BuildMI(*thisMBB, MI, DL, TII->get(PtrStoreOpc));
26326 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26327 if (i == X86::AddrDisp)
26328 MIB.addDisp(MI.getOperand(MemOpndSlot + i), LabelOffset);
26330 MIB.add(MI.getOperand(MemOpndSlot + i));
26333 MIB.addReg(LabelReg);
26335 MIB.addMBB(restoreMBB);
26336 MIB.setMemRefs(MMOBegin, MMOEnd);
26338 MIB = BuildMI(*thisMBB, MI, DL, TII->get(X86::EH_SjLj_Setup))
26339 .addMBB(restoreMBB);
26341 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26342 MIB.addRegMask(RegInfo->getNoPreservedMask());
26343 thisMBB->addSuccessor(mainMBB);
26344 thisMBB->addSuccessor(restoreMBB);
26348 BuildMI(mainMBB, DL, TII->get(X86::MOV32r0), mainDstReg);
26349 mainMBB->addSuccessor(sinkMBB);
26352 BuildMI(*sinkMBB, sinkMBB->begin(), DL,
26353 TII->get(X86::PHI), DstReg)
26354 .addReg(mainDstReg).addMBB(mainMBB)
26355 .addReg(restoreDstReg).addMBB(restoreMBB);
26358 if (RegInfo->hasBasePointer(*MF)) {
26359 const bool Uses64BitFramePtr =
26360 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26361 X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
26362 X86FI->setRestoreBasePointer(MF);
26363 unsigned FramePtr = RegInfo->getFrameRegister(*MF);
26364 unsigned BasePtr = RegInfo->getBaseRegister();
26365 unsigned Opm = Uses64BitFramePtr ? X86::MOV64rm : X86::MOV32rm;
26366 addRegOffset(BuildMI(restoreMBB, DL, TII->get(Opm), BasePtr),
26367 FramePtr, true, X86FI->getRestoreBasePointerOffset())
26368 .setMIFlag(MachineInstr::FrameSetup);
26370 BuildMI(restoreMBB, DL, TII->get(X86::MOV32ri), restoreDstReg).addImm(1);
26371 BuildMI(restoreMBB, DL, TII->get(X86::JMP_1)).addMBB(sinkMBB);
26372 restoreMBB->addSuccessor(sinkMBB);
26374 MI.eraseFromParent();
26378 MachineBasicBlock *
26379 X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
26380 MachineBasicBlock *MBB) const {
26381 DebugLoc DL = MI.getDebugLoc();
26382 MachineFunction *MF = MBB->getParent();
26383 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26384 MachineRegisterInfo &MRI = MF->getRegInfo();
26386 // Memory Reference
26387 MachineInstr::mmo_iterator MMOBegin = MI.memoperands_begin();
26388 MachineInstr::mmo_iterator MMOEnd = MI.memoperands_end();
26390 MVT PVT = getPointerTy(MF->getDataLayout());
26391 assert((PVT == MVT::i64 || PVT == MVT::i32) &&
26392 "Invalid Pointer Size!");
26394 const TargetRegisterClass *RC =
26395 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26396 unsigned Tmp = MRI.createVirtualRegister(RC);
26397 // Since FP is only updated here but NOT referenced, it's treated as GPR.
26398 const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
26399 unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
26400 unsigned SP = RegInfo->getStackRegister();
26402 MachineInstrBuilder MIB;
26404 const int64_t LabelOffset = 1 * PVT.getStoreSize();
26405 const int64_t SPOffset = 2 * PVT.getStoreSize();
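// A sketch of the buffer layout these offsets assume (pointer-sized slots):
//   buf[0]           = saved frame pointer
//   buf[LabelOffset] = address to resume at (set up by the setjmp side)
//   buf[SPOffset]    = saved stack pointer
// The three loads below restore FP, the jump target and SP in that order.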
26407 unsigned PtrLoadOpc = (PVT == MVT::i64) ? X86::MOV64rm : X86::MOV32rm;
26408 unsigned IJmpOpc = (PVT == MVT::i64) ? X86::JMP64r : X86::JMP32r;
26411 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), FP);
26412 for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
26413 MIB.add(MI.getOperand(i));
26414 MIB.setMemRefs(MMOBegin, MMOEnd);
26416 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), Tmp);
26417 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26418 if (i == X86::AddrDisp)
26419 MIB.addDisp(MI.getOperand(i), LabelOffset);
26421 MIB.add(MI.getOperand(i));
26423 MIB.setMemRefs(MMOBegin, MMOEnd);
26425 MIB = BuildMI(*MBB, MI, DL, TII->get(PtrLoadOpc), SP);
26426 for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
26427 if (i == X86::AddrDisp)
26428 MIB.addDisp(MI.getOperand(i), SPOffset);
26430 MIB.add(MI.getOperand(i));
26432 MIB.setMemRefs(MMOBegin, MMOEnd);
26434 BuildMI(*MBB, MI, DL, TII->get(IJmpOpc)).addReg(Tmp);
26436 MI.eraseFromParent();
26440 void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
26441 MachineBasicBlock *MBB,
26442 MachineBasicBlock *DispatchBB,
26444 DebugLoc DL = MI.getDebugLoc();
26445 MachineFunction *MF = MBB->getParent();
26446 MachineRegisterInfo *MRI = &MF->getRegInfo();
26447 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26449 MVT PVT = getPointerTy(MF->getDataLayout());
26450 assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
26455 bool UseImmLabel = (MF->getTarget().getCodeModel() == CodeModel::Small) &&
26456 !isPositionIndependent();
26459 Op = (PVT == MVT::i64) ? X86::MOV64mi32 : X86::MOV32mi;
26461 const TargetRegisterClass *TRC =
26462 (PVT == MVT::i64) ? &X86::GR64RegClass : &X86::GR32RegClass;
26463 VR = MRI->createVirtualRegister(TRC);
26464 Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
26466 if (Subtarget.is64Bit())
26467 BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
26471 .addMBB(DispatchBB)
26474 BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
26475 .addReg(0) /* TII->getGlobalBaseReg(MF) */
26478 .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
26482 MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
26483 addFrameReference(MIB, FI, 36);
26485 MIB.addMBB(DispatchBB);
26490 MachineBasicBlock *
26491 X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
26492 MachineBasicBlock *BB) const {
26493 DebugLoc DL = MI.getDebugLoc();
26494 MachineFunction *MF = BB->getParent();
26495 MachineFrameInfo &MFI = MF->getFrameInfo();
26496 MachineRegisterInfo *MRI = &MF->getRegInfo();
26497 const X86InstrInfo *TII = Subtarget.getInstrInfo();
26498 int FI = MFI.getFunctionContextIndex();
26500 // Get a mapping of the call site numbers to all of the landing pads they're
26501 // associated with.
26502 DenseMap<unsigned, SmallVector<MachineBasicBlock *, 2>> CallSiteNumToLPad;
26503 unsigned MaxCSNum = 0;
26504 for (auto &MBB : *MF) {
26505 if (!MBB.isEHPad())
26508 MCSymbol *Sym = nullptr;
26509 for (const auto &MI : MBB) {
26510 if (MI.isDebugValue())
26513 assert(MI.isEHLabel() && "expected EH_LABEL");
26514 Sym = MI.getOperand(0).getMCSymbol();
26518 if (!MF->hasCallSiteLandingPad(Sym))
26521 for (unsigned CSI : MF->getCallSiteLandingPad(Sym)) {
26522 CallSiteNumToLPad[CSI].push_back(&MBB);
26523 MaxCSNum = std::max(MaxCSNum, CSI);
26527 // Get an ordered list of the machine basic blocks for the jump table.
26528 std::vector<MachineBasicBlock *> LPadList;
26529 SmallPtrSet<MachineBasicBlock *, 32> InvokeBBs;
26530 LPadList.reserve(CallSiteNumToLPad.size());
26532 for (unsigned CSI = 1; CSI <= MaxCSNum; ++CSI) {
26533 for (auto &LP : CallSiteNumToLPad[CSI]) {
26534 LPadList.push_back(LP);
26535 InvokeBBs.insert(LP->pred_begin(), LP->pred_end());
26539 assert(!LPadList.empty() &&
26540 "No landing pad destinations for the dispatch jump table!");
26542 // Create the MBBs for the dispatch code.
26544 // Shove the dispatch's address into the return slot in the function context.
26545 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
26546 DispatchBB->setIsEHPad(true);
26548 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
26549 BuildMI(TrapBB, DL, TII->get(X86::TRAP));
26550 DispatchBB->addSuccessor(TrapBB);
26552 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
26553 DispatchBB->addSuccessor(DispContBB);
26556 MF->push_back(DispatchBB);
26557 MF->push_back(DispContBB);
26558 MF->push_back(TrapBB);
// Insert code into the entry block that creates and registers the function
// context.
26562 SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
26564 // Create the jump table and associated information
26565 MachineJumpTableInfo *JTI =
26566 MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
26567 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
26569 const X86RegisterInfo &RI = TII->getRegisterInfo();
26570 // Add a register mask with no preserved registers. This results in all
26571 // registers being marked as clobbered.
26572 if (RI.hasBasePointer(*MF)) {
26573 const bool FPIs64Bit =
26574 Subtarget.isTarget64BitLP64() || Subtarget.isTargetNaCl64();
26575 X86MachineFunctionInfo *MFI = MF->getInfo<X86MachineFunctionInfo>();
26576 MFI->setRestoreBasePointer(MF);
26578 unsigned FP = RI.getFrameRegister(*MF);
26579 unsigned BP = RI.getBaseRegister();
26580 unsigned Op = FPIs64Bit ? X86::MOV64rm : X86::MOV32rm;
26581 addRegOffset(BuildMI(DispatchBB, DL, TII->get(Op), BP), FP, true,
26582 MFI->getRestoreBasePointerOffset())
26583 .addRegMask(RI.getNoPreservedMask());
26585 BuildMI(DispatchBB, DL, TII->get(X86::NOOP))
26586 .addRegMask(RI.getNoPreservedMask());
26589 unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26590 addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
26592 BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
26594 .addImm(LPadList.size());
26595 BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
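// Schematically, the full dispatch sequence assembled here (range check
// above, indexed jump just below; 32-bit shown, the 64-bit form uses
// 8-byte jump table entries) is:
//   movl <function context slot>, %reg   # reload the call-site index
//   cmpl $<number of landing pads>, %reg
//   ja   TrapBB                          # out-of-range selector -> trap
//   subl $<bias>, %reg                   # make the index zero-based
//   jmp  *<jump table>(,%reg,4)          # indexed jump to the landing pad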
26597 unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
26598 BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
26601 BuildMI(DispContBB, DL,
26602 TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
26604 .addImm(Subtarget.is64Bit() ? 8 : 4)
26606 .addJumpTableIndex(MJTI)
26609 // Add the jump table entries as successors to the MBB.
26610 SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
26611 for (auto &LP : LPadList)
26612 if (SeenMBBs.insert(LP).second)
26613 DispContBB->addSuccessor(LP);
26615 // N.B. the order the invoke BBs are processed in doesn't matter here.
26616 SmallVector<MachineBasicBlock *, 64> MBBLPads;
26617 const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
26618 for (MachineBasicBlock *MBB : InvokeBBs) {
26619 // Remove the landing pad successor from the invoke block and replace it
26620 // with the new dispatch block.
26621 // Keep a copy of Successors since it's modified inside the loop.
26622 SmallVector<MachineBasicBlock *, 8> Successors(MBB->succ_rbegin(),
26624 // FIXME: Avoid quadratic complexity.
26625 for (auto MBBS : Successors) {
26626 if (MBBS->isEHPad()) {
26627 MBB->removeSuccessor(MBBS);
26628 MBBLPads.push_back(MBBS);
26632 MBB->addSuccessor(DispatchBB);
26634 // Find the invoke call and mark all of the callee-saved registers as
26635 // 'implicit defined' so that they're spilled. This prevents code from
// moving instructions to before the EH block, where they will never be
// executed.
26638 for (auto &II : reverse(*MBB)) {
26642 DenseMap<unsigned, bool> DefRegs;
26643 for (auto &MOp : II.operands())
26645 DefRegs[MOp.getReg()] = true;
26647 MachineInstrBuilder MIB(*MF, &II);
26648 for (unsigned RI = 0; SavedRegs[RI]; ++RI) {
26649 unsigned Reg = SavedRegs[RI];
26651 MIB.addReg(Reg, RegState::ImplicitDefine | RegState::Dead);
26658 // Mark all former landing pads as non-landing pads. The dispatch is the only
26659 // landing pad now.
26660 for (auto &LP : MBBLPads)
26661 LP->setIsEHPad(false);
26663 // The instruction is gone now.
26664 MI.eraseFromParent();
26668 MachineBasicBlock *
26669 X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
26670 MachineBasicBlock *BB) const {
26671 MachineFunction *MF = BB->getParent();
26672 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
26673 DebugLoc DL = MI.getDebugLoc();
26675 switch (MI.getOpcode()) {
26676 default: llvm_unreachable("Unexpected instr type to insert");
26677 case X86::TAILJMPd64:
26678 case X86::TAILJMPr64:
26679 case X86::TAILJMPm64:
26680 case X86::TAILJMPr64_REX:
26681 case X86::TAILJMPm64_REX:
26682 llvm_unreachable("TAILJMP64 would not be touched here.");
26683 case X86::TCRETURNdi64:
26684 case X86::TCRETURNri64:
26685 case X86::TCRETURNmi64:
26687 case X86::TLS_addr32:
26688 case X86::TLS_addr64:
26689 case X86::TLS_base_addr32:
26690 case X86::TLS_base_addr64:
26691 return EmitLoweredTLSAddr(MI, BB);
26692 case X86::CATCHRET:
26693 return EmitLoweredCatchRet(MI, BB);
26694 case X86::CATCHPAD:
26695 return EmitLoweredCatchPad(MI, BB);
26696 case X86::SEG_ALLOCA_32:
26697 case X86::SEG_ALLOCA_64:
26698 return EmitLoweredSegAlloca(MI, BB);
26699 case X86::TLSCall_32:
26700 case X86::TLSCall_64:
26701 return EmitLoweredTLSCall(MI, BB);
26702 case X86::CMOV_FR32:
26703 case X86::CMOV_FR64:
26704 case X86::CMOV_FR128:
26705 case X86::CMOV_GR8:
26706 case X86::CMOV_GR16:
26707 case X86::CMOV_GR32:
26708 case X86::CMOV_RFP32:
26709 case X86::CMOV_RFP64:
26710 case X86::CMOV_RFP80:
26711 case X86::CMOV_V2F64:
26712 case X86::CMOV_V2I64:
26713 case X86::CMOV_V4F32:
26714 case X86::CMOV_V4F64:
26715 case X86::CMOV_V4I64:
26716 case X86::CMOV_V16F32:
26717 case X86::CMOV_V8F32:
26718 case X86::CMOV_V8F64:
26719 case X86::CMOV_V8I64:
26720 case X86::CMOV_V8I1:
26721 case X86::CMOV_V16I1:
26722 case X86::CMOV_V32I1:
26723 case X86::CMOV_V64I1:
26724 return EmitLoweredSelect(MI, BB);
26726 case X86::RDFLAGS32:
26727 case X86::RDFLAGS64: {
26729 MI.getOpcode() == X86::RDFLAGS32 ? X86::PUSHF32 : X86::PUSHF64;
26730 unsigned Pop = MI.getOpcode() == X86::RDFLAGS32 ? X86::POP32r : X86::POP64r;
26731 MachineInstr *Push = BuildMI(*BB, MI, DL, TII->get(PushF));
26732 // Permit reads of the FLAGS register without it being defined.
26733 // This intrinsic exists to read external processor state in flags, such as
26734 // the trap flag, interrupt flag, and direction flag, none of which are
26735 // modeled by the backend.
26736 Push->getOperand(2).setIsUndef();
26737 BuildMI(*BB, MI, DL, TII->get(Pop), MI.getOperand(0).getReg());
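// The resulting expansion is just a flags save followed by a pop, roughly:
//   pushfq            (pushfl for RDFLAGS32)
//   popq   <dst reg>  (popl   for RDFLAGS32)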
26739 MI.eraseFromParent(); // The pseudo is gone now.
26743 case X86::WRFLAGS32:
26744 case X86::WRFLAGS64: {
26746 MI.getOpcode() == X86::WRFLAGS32 ? X86::PUSH32r : X86::PUSH64r;
26748 MI.getOpcode() == X86::WRFLAGS32 ? X86::POPF32 : X86::POPF64;
26749 BuildMI(*BB, MI, DL, TII->get(Push)).addReg(MI.getOperand(0).getReg());
26750 BuildMI(*BB, MI, DL, TII->get(PopF));
26752 MI.eraseFromParent(); // The pseudo is gone now.
26756 case X86::RELEASE_FADD32mr:
26757 case X86::RELEASE_FADD64mr:
26758 return EmitLoweredAtomicFP(MI, BB);
26760 case X86::FP32_TO_INT16_IN_MEM:
26761 case X86::FP32_TO_INT32_IN_MEM:
26762 case X86::FP32_TO_INT64_IN_MEM:
26763 case X86::FP64_TO_INT16_IN_MEM:
26764 case X86::FP64_TO_INT32_IN_MEM:
26765 case X86::FP64_TO_INT64_IN_MEM:
26766 case X86::FP80_TO_INT16_IN_MEM:
26767 case X86::FP80_TO_INT32_IN_MEM:
26768 case X86::FP80_TO_INT64_IN_MEM: {
26769 // Change the floating point control register to use "round towards zero"
26770 // mode when truncating to an integer value.
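// In outline, the sequence emitted below is (slot, scratch register and
// control-word value are schematic):
//   fnstcw  <slot>            # save the current control word
//   movw    <slot>, <OldCW>   # remember it in a scratch register
//   movw    $<rz-mode>, <slot>
//   fldcw   <slot>            # switch to round-towards-zero
//   movw    <OldCW>, <slot>   # put the original value back in the slot
//   fistp   <dest>            # the truncating store itself
//   fldcw   <slot>            # restore the original rounding mode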
26771 int CWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
26772 addFrameReference(BuildMI(*BB, MI, DL,
26773 TII->get(X86::FNSTCW16m)), CWFrameIdx);
26775 // Load the old value of the high byte of the control word...
26777 MF->getRegInfo().createVirtualRegister(&X86::GR16RegClass);
26778 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16rm), OldCW),
26781 // Set the high part to be round to zero...
26782 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mi)), CWFrameIdx)
26785 // Reload the modified control word now...
26786 addFrameReference(BuildMI(*BB, MI, DL,
26787 TII->get(X86::FLDCW16m)), CWFrameIdx);
26789 // Restore the memory image of control word to original value
26790 addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)), CWFrameIdx)
26793 // Get the X86 opcode to use.
26795 switch (MI.getOpcode()) {
26796 default: llvm_unreachable("illegal opcode!");
26797 case X86::FP32_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m32; break;
26798 case X86::FP32_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m32; break;
26799 case X86::FP32_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m32; break;
26800 case X86::FP64_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m64; break;
26801 case X86::FP64_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m64; break;
26802 case X86::FP64_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m64; break;
26803 case X86::FP80_TO_INT16_IN_MEM: Opc = X86::IST_Fp16m80; break;
26804 case X86::FP80_TO_INT32_IN_MEM: Opc = X86::IST_Fp32m80; break;
26805 case X86::FP80_TO_INT64_IN_MEM: Opc = X86::IST_Fp64m80; break;
26808 X86AddressMode AM = getAddressFromInstr(&MI, 0);
26809 addFullAddress(BuildMI(*BB, MI, DL, TII->get(Opc)), AM)
26810 .addReg(MI.getOperand(X86::AddrNumOperands).getReg());
26812 // Reload the original control word now.
26813 addFrameReference(BuildMI(*BB, MI, DL,
26814 TII->get(X86::FLDCW16m)), CWFrameIdx);
26816 MI.eraseFromParent(); // The pseudo instruction is gone now.
26819 // String/text processing lowering.
26820 case X86::PCMPISTRM128REG:
26821 case X86::VPCMPISTRM128REG:
26822 case X86::PCMPISTRM128MEM:
26823 case X86::VPCMPISTRM128MEM:
26824 case X86::PCMPESTRM128REG:
26825 case X86::VPCMPESTRM128REG:
26826 case X86::PCMPESTRM128MEM:
26827 case X86::VPCMPESTRM128MEM:
26828 assert(Subtarget.hasSSE42() &&
26829 "Target must have SSE4.2 or AVX features enabled");
26830 return emitPCMPSTRM(MI, BB, Subtarget.getInstrInfo());
26832 // String/text processing lowering.
26833 case X86::PCMPISTRIREG:
26834 case X86::VPCMPISTRIREG:
26835 case X86::PCMPISTRIMEM:
26836 case X86::VPCMPISTRIMEM:
26837 case X86::PCMPESTRIREG:
26838 case X86::VPCMPESTRIREG:
26839 case X86::PCMPESTRIMEM:
26840 case X86::VPCMPESTRIMEM:
26841 assert(Subtarget.hasSSE42() &&
26842 "Target must have SSE4.2 or AVX features enabled");
26843 return emitPCMPSTRI(MI, BB, Subtarget.getInstrInfo());
26845 // Thread synchronization.
26847 return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
26848 case X86::MONITORX:
26849 return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
26853 return emitClzero(&MI, BB, Subtarget);
26857 return emitWRPKRU(MI, BB, Subtarget);
26859 return emitRDPKRU(MI, BB, Subtarget);
26862 return emitXBegin(MI, BB, Subtarget.getInstrInfo());
26864 case X86::VASTART_SAVE_XMM_REGS:
26865 return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
26867 case X86::VAARG_64:
26868 return EmitVAARG64WithCustomInserter(MI, BB);
26870 case X86::EH_SjLj_SetJmp32:
26871 case X86::EH_SjLj_SetJmp64:
26872 return emitEHSjLjSetJmp(MI, BB);
26874 case X86::EH_SjLj_LongJmp32:
26875 case X86::EH_SjLj_LongJmp64:
26876 return emitEHSjLjLongJmp(MI, BB);
26878 case X86::Int_eh_sjlj_setup_dispatch:
26879 return EmitSjLjDispatchBlock(MI, BB);
26881 case TargetOpcode::STATEPOINT:
26882 // As an implementation detail, STATEPOINT shares the STACKMAP format at
26883 // this point in the process. We diverge later.
26884 return emitPatchPoint(MI, BB);
26886 case TargetOpcode::STACKMAP:
26887 case TargetOpcode::PATCHPOINT:
26888 return emitPatchPoint(MI, BB);
26890 case TargetOpcode::PATCHABLE_EVENT_CALL:
26891 // Do nothing here, handle in xray instrumentation pass.
26894 case X86::LCMPXCHG8B: {
26895 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
// In addition to the four E[ABCD] registers implied by the encoding,
// CMPXCHG8B requires a memory operand. If the current architecture is i686
// and the current function needs a base pointer - which is ESI on i686 -
// the register allocator would not be able to allocate registers for an
// address of the form X(%reg, %reg, Y): there would never be enough
// unreserved registers during regalloc (without the base pointer the only
// option would be X(%edi, %esi, Y)). We give the register allocator a hand
// by precomputing the address in a new vreg using LEA.
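// For illustration: an address such as 16(%esi,%edi,4) would be rewritten
// here (schematically) as
//   leal 16(%esi,%edi,4), %vreg
//   lock cmpxchg8b (%vreg)
// so the CMPXCHG8B itself only needs a single register for its address.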
26906 // If it is not i686 or there is no base pointer - nothing to do here.
26907 if (!Subtarget.is32Bit() || !TRI->hasBasePointer(*MF))
// Even though this code does not necessarily need the base pointer to
// be ESI, we check for that. The reason: if this assert fails, something
// has changed in the compiler's base pointer handling, and that change
// most probably has to be addressed here as well.
26914 assert(TRI->getBaseRegister() == X86::ESI &&
26915 "LCMPXCHG8B custom insertion for i686 is written with X86::ESI as a "
26916 "base pointer in mind");
26918 MachineRegisterInfo &MRI = MF->getRegInfo();
26919 MVT SPTy = getPointerTy(MF->getDataLayout());
26920 const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
26921 unsigned computedAddrVReg = MRI.createVirtualRegister(AddrRegClass);
26923 X86AddressMode AM = getAddressFromInstr(&MI, 0);
// Regalloc does not need any help when the memory operand of CMPXCHG8B
// does not use an index register.
26926 if (AM.IndexReg == X86::NoRegister)
26929 // After X86TargetLowering::ReplaceNodeResults CMPXCHG8B is glued to its
26930 // four operand definitions that are E[ABCD] registers. We skip them and
26931 // then insert the LEA.
26932 MachineBasicBlock::iterator MBBI(MI);
26933 while (MBBI->definesRegister(X86::EAX) || MBBI->definesRegister(X86::EBX) ||
26934 MBBI->definesRegister(X86::ECX) || MBBI->definesRegister(X86::EDX))
26937 BuildMI(*BB, *MBBI, DL, TII->get(X86::LEA32r), computedAddrVReg), AM);
26939 setDirectAddressInInstr(&MI, 0, computedAddrVReg);
26943 case X86::LCMPXCHG16B:
26945 case X86::LCMPXCHG8B_SAVE_EBX:
26946 case X86::LCMPXCHG16B_SAVE_RBX: {
26948 MI.getOpcode() == X86::LCMPXCHG8B_SAVE_EBX ? X86::EBX : X86::RBX;
26949 if (!BB->isLiveIn(BasePtr))
26950 BB->addLiveIn(BasePtr);
26956 //===----------------------------------------------------------------------===//
26957 // X86 Optimization Hooks
26958 //===----------------------------------------------------------------------===//
26960 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
26962 const APInt &DemandedElts,
26963 const SelectionDAG &DAG,
26964 unsigned Depth) const {
26965 unsigned BitWidth = Known.getBitWidth();
26966 unsigned Opc = Op.getOpcode();
26967 EVT VT = Op.getValueType();
26968 assert((Opc >= ISD::BUILTIN_OP_END ||
26969 Opc == ISD::INTRINSIC_WO_CHAIN ||
26970 Opc == ISD::INTRINSIC_W_CHAIN ||
26971 Opc == ISD::INTRINSIC_VOID) &&
26972 "Should use MaskedValueIsZero if you don't know whether Op"
26973 " is a target node!");
26989 // These nodes' second result is a boolean.
26990 if (Op.getResNo() == 0)
26993 case X86ISD::SETCC:
26994 Known.Zero.setBitsFrom(1);
26996 case X86ISD::MOVMSK: {
26997 unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
26998 Known.Zero.setBitsFrom(NumLoBits);
27001 case X86ISD::VSHLI:
27002 case X86ISD::VSRLI: {
27003 if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
27004 if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
27005 Known.setAllZero();
27009 DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
27010 unsigned ShAmt = ShiftImm->getZExtValue();
27011 if (Opc == X86ISD::VSHLI) {
27012 Known.Zero <<= ShAmt;
27013 Known.One <<= ShAmt;
27014 // Low bits are known zero.
27015 Known.Zero.setLowBits(ShAmt);
27017 Known.Zero.lshrInPlace(ShAmt);
27018 Known.One.lshrInPlace(ShAmt);
27019 // High bits are known zero.
27020 Known.Zero.setHighBits(ShAmt);
27025 case X86ISD::VZEXT: {
27026 SDValue N0 = Op.getOperand(0);
27027 unsigned NumElts = VT.getVectorNumElements();
27029 EVT SrcVT = N0.getValueType();
27030 unsigned InNumElts = SrcVT.getVectorNumElements();
27031 unsigned InBitWidth = SrcVT.getScalarSizeInBits();
27032 assert(InNumElts >= NumElts && "Illegal VZEXT input");
27034 Known = KnownBits(InBitWidth);
27035 APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
27036 DAG.computeKnownBits(N0, Known, DemandedSrcElts, Depth + 1);
27037 Known = Known.zext(BitWidth);
27038 Known.Zero.setBitsFrom(InBitWidth);
27044 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
27045 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
27046 unsigned Depth) const {
27047 unsigned VTBits = Op.getScalarValueSizeInBits();
27048 unsigned Opcode = Op.getOpcode();
27050 case X86ISD::SETCC_CARRY:
// SETCC_CARRY sets the dest to ~0 for true or 0 for false.
return VTBits;
27054 case X86ISD::VSEXT: {
27055 SDValue Src = Op.getOperand(0);
27056 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27057 Tmp += VTBits - Src.getScalarValueSizeInBits();
27061 case X86ISD::VSHLI: {
27062 SDValue Src = Op.getOperand(0);
27063 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27064 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27065 if (ShiftVal.uge(VTBits))
27066 return VTBits; // Shifted all bits out --> zero.
27067 if (ShiftVal.uge(Tmp))
27068 return 1; // Shifted all sign bits out --> unknown.
27069 return Tmp - ShiftVal.getZExtValue();
27072 case X86ISD::VSRAI: {
27073 SDValue Src = Op.getOperand(0);
27074 unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
27075 APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
27077 return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
27080 case X86ISD::PCMPGT:
27081 case X86ISD::PCMPEQ:
27083 case X86ISD::VPCOM:
27084 case X86ISD::VPCOMU:
27085 // Vector compares return zero/all-bits result values.
27093 /// Returns true (and the GlobalValue and the offset) if the node is a
27094 /// GlobalAddress + offset.
27095 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
27096 const GlobalValue* &GA,
27097 int64_t &Offset) const {
27098 if (N->getOpcode() == X86ISD::Wrapper) {
27099 if (isa<GlobalAddressSDNode>(N->getOperand(0))) {
27100 GA = cast<GlobalAddressSDNode>(N->getOperand(0))->getGlobal();
27101 Offset = cast<GlobalAddressSDNode>(N->getOperand(0))->getOffset();
27105 return TargetLowering::isGAPlusOffset(N, GA, Offset);
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
27110 // TODO: Investigate sharing more of this with shuffle lowering.
27111 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27112 bool AllowFloatDomain, bool AllowIntDomain,
27113 SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
27114 const X86Subtarget &Subtarget,
27115 unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
27116 unsigned NumMaskElts = Mask.size();
27117 unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
27119 // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
27120 // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
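// A worked example (purely illustrative): a v8i16 mask
//   <0, Z, Z, Z, 1, Z, Z, Z>   (Z = zero)
// matches at Scale == 4, i.e. the low two i16 elements zero-extended to
// i64, and is emitted as a v8i16 -> v2i64 ZERO_EXTEND_VECTOR_INREG.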
27121 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
27122 (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
27123 unsigned MaxScale = 64 / MaskEltSize;
27124 for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
27126 unsigned NumDstElts = NumMaskElts / Scale;
27127 for (unsigned i = 0; i != NumDstElts && Match; ++i) {
27128 Match &= isUndefOrEqual(Mask[i * Scale], (int)i);
27129 Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
27132 unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
27133 SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
27134 if (SrcVT != MaskVT)
27135 V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
27136 DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
27137 DstVT = MVT::getVectorVT(DstVT, NumDstElts);
27138 Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
27139 : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
// Match against a VZEXT_MOVL instruction; SSE1 only supports 32 bits (MOVSS).
27146 if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
27147 isUndefOrEqual(Mask[0], 0) &&
27148 isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
27149 Shuffle = X86ISD::VZEXT_MOVL;
27150 SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
// Check if we have SSE3, which will let us use MOVDDUP etc. These
// instructions are no slower than UNPCKLPD but have the option to
// fold the input operand into even an unaligned memory load.
27157 if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
27158 if (isTargetShuffleEquivalent(Mask, {0, 0})) {
27159 Shuffle = X86ISD::MOVDDUP;
27160 SrcVT = DstVT = MVT::v2f64;
27163 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27164 Shuffle = X86ISD::MOVSLDUP;
27165 SrcVT = DstVT = MVT::v4f32;
27168 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3})) {
27169 Shuffle = X86ISD::MOVSHDUP;
27170 SrcVT = DstVT = MVT::v4f32;
27175 if (MaskVT.is256BitVector() && AllowFloatDomain) {
27176 assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
27177 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
27178 Shuffle = X86ISD::MOVDDUP;
27179 SrcVT = DstVT = MVT::v4f64;
27182 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27183 Shuffle = X86ISD::MOVSLDUP;
27184 SrcVT = DstVT = MVT::v8f32;
27187 if (isTargetShuffleEquivalent(Mask, {1, 1, 3, 3, 5, 5, 7, 7})) {
27188 Shuffle = X86ISD::MOVSHDUP;
27189 SrcVT = DstVT = MVT::v8f32;
27194 if (MaskVT.is512BitVector() && AllowFloatDomain) {
27195 assert(Subtarget.hasAVX512() &&
27196 "AVX512 required for 512-bit vector shuffles");
27197 if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
27198 Shuffle = X86ISD::MOVDDUP;
27199 SrcVT = DstVT = MVT::v8f64;
27202 if (isTargetShuffleEquivalent(
27203 Mask, {0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14})) {
27204 Shuffle = X86ISD::MOVSLDUP;
27205 SrcVT = DstVT = MVT::v16f32;
27208 if (isTargetShuffleEquivalent(
27209 Mask, {1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15})) {
27210 Shuffle = X86ISD::MOVSHDUP;
27211 SrcVT = DstVT = MVT::v16f32;
27216 // Attempt to match against broadcast-from-vector.
27217 if (Subtarget.hasAVX2()) {
27218 SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
27219 if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
27220 SrcVT = DstVT = MaskVT;
27221 Shuffle = X86ISD::VBROADCAST;
27229 // Attempt to match a combined shuffle mask against supported unary immediate
27230 // permute instructions.
27231 // TODO: Investigate sharing more of this with shuffle lowering.
27232 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27233 const APInt &Zeroable,
27234 bool AllowFloatDomain,
27235 bool AllowIntDomain,
27236 const X86Subtarget &Subtarget,
27237 unsigned &Shuffle, MVT &ShuffleVT,
27238 unsigned &PermuteImm) {
27239 unsigned NumMaskElts = Mask.size();
27240 unsigned InputSizeInBits = MaskVT.getSizeInBits();
27241 unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
27242 MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
27244 bool ContainsZeros =
27245 llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
// Handle VPERMI/VPERMILPD vXi64/vXf64 patterns.
27248 if (!ContainsZeros && MaskScalarSizeInBits == 64) {
27249 // Check for lane crossing permutes.
27250 if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
27251 // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
27252 if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) {
27253 Shuffle = X86ISD::VPERMI;
27254 ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
27255 PermuteImm = getV4X86ShuffleImm(Mask);
27258 if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) {
27259 SmallVector<int, 4> RepeatedMask;
27260 if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
27261 Shuffle = X86ISD::VPERMI;
27262 ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
27263 PermuteImm = getV4X86ShuffleImm(RepeatedMask);
27267 } else if (AllowFloatDomain && Subtarget.hasAVX()) {
27268 // VPERMILPD can permute with a non-repeating shuffle.
27269 Shuffle = X86ISD::VPERMILPI;
27270 ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
27272 for (int i = 0, e = Mask.size(); i != e; ++i) {
27274 if (M == SM_SentinelUndef)
27276 assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index");
27277 PermuteImm |= (M & 1) << i;
27283 // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns.
// AVX introduced the VPERMILPD/VPERMILPS float permutes; before then we
// had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
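// Illustrative example: a v8f32 mask <2,3,0,1, 6,7,4,5> repeats <2,3,0,1>
// in each 128-bit lane and is matched here as a single VPERMILPS with
// immediate 0x4E (or PSHUFD in the integer domain).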
27286 if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) &&
27287 !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) {
27288 SmallVector<int, 4> RepeatedMask;
27289 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27290 // Narrow the repeated mask to create 32-bit element permutes.
27291 SmallVector<int, 4> WordMask = RepeatedMask;
27292 if (MaskScalarSizeInBits == 64)
27293 scaleShuffleMask(2, RepeatedMask, WordMask);
27295 Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
27296 ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
27297 ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
27298 PermuteImm = getV4X86ShuffleImm(WordMask);
27303 // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns.
27304 if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) {
27305 SmallVector<int, 4> RepeatedMask;
27306 if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) {
27307 ArrayRef<int> LoMask(Mask.data() + 0, 4);
27308 ArrayRef<int> HiMask(Mask.data() + 4, 4);
27310 // PSHUFLW: permute lower 4 elements only.
27311 if (isUndefOrInRange(LoMask, 0, 4) &&
27312 isSequentialOrUndefInRange(HiMask, 0, 4, 4)) {
27313 Shuffle = X86ISD::PSHUFLW;
27314 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27315 PermuteImm = getV4X86ShuffleImm(LoMask);
27319 // PSHUFHW: permute upper 4 elements only.
27320 if (isUndefOrInRange(HiMask, 4, 8) &&
27321 isSequentialOrUndefInRange(LoMask, 0, 4, 0)) {
27322 // Offset the HiMask so that we can create the shuffle immediate.
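// e.g. a HiMask of <5,4,7,6> is offset to <1,0,3,2>, giving a PSHUFHW
// immediate of 0xB1 (an illustrative value, computed the same way as the
// PSHUFLW immediate above).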
27323 int OffsetHiMask[4];
27324 for (int i = 0; i != 4; ++i)
27325 OffsetHiMask[i] = (HiMask[i] < 0 ? HiMask[i] : HiMask[i] - 4);
27327 Shuffle = X86ISD::PSHUFHW;
27328 ShuffleVT = MVT::getVectorVT(MVT::i16, InputSizeInBits / 16);
27329 PermuteImm = getV4X86ShuffleImm(OffsetHiMask);
27335 // Attempt to match against byte/bit shifts.
27336 // FIXME: Add 512-bit support.
27337 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27338 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27339 int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
27340 MaskScalarSizeInBits, Mask,
27341 0, Zeroable, Subtarget);
27342 if (0 < ShiftAmt) {
27343 PermuteImm = (unsigned)ShiftAmt;
27351 // Attempt to match a combined unary shuffle mask against supported binary
27352 // shuffle instructions.
27353 // TODO: Investigate sharing more of this with shuffle lowering.
27354 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27355 bool AllowFloatDomain, bool AllowIntDomain,
27356 SDValue &V1, SDValue &V2, SDLoc &DL,
27358 const X86Subtarget &Subtarget,
27359 unsigned &Shuffle, MVT &ShuffleVT,
27361 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27363 if (MaskVT.is128BitVector()) {
27364 if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
27366 Shuffle = X86ISD::MOVLHPS;
27367 ShuffleVT = MVT::v4f32;
27370 if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
27372 Shuffle = X86ISD::MOVHLPS;
27373 ShuffleVT = MVT::v4f32;
27376 if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
27377 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27379 Shuffle = X86ISD::MOVSD;
27380 ShuffleVT = MaskVT;
27383 if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
27384 (AllowFloatDomain || !Subtarget.hasSSE41())) {
27385 Shuffle = X86ISD::MOVSS;
27386 ShuffleVT = MaskVT;
27391 // Attempt to match against either a unary or binary UNPCKL/UNPCKH shuffle.
27392 if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
27393 (MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27394 (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
27395 (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
27396 (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
27397 if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
27399 ShuffleVT = MaskVT;
27400 if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
27401 ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
27409 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
27410 const APInt &Zeroable,
27411 bool AllowFloatDomain,
27412 bool AllowIntDomain,
27413 SDValue &V1, SDValue &V2, SDLoc &DL,
27415 const X86Subtarget &Subtarget,
27416 unsigned &Shuffle, MVT &ShuffleVT,
27417 unsigned &PermuteImm) {
27418 unsigned NumMaskElts = Mask.size();
27419 unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
27421 // Attempt to match against PALIGNR byte rotate.
27422 if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
27423 (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
27424 int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
27425 if (0 < ByteRotation) {
27426 Shuffle = X86ISD::PALIGNR;
27427 ShuffleVT = MVT::getVectorVT(MVT::i8, MaskVT.getSizeInBits() / 8);
27428 PermuteImm = ByteRotation;
27433 // Attempt to combine to X86ISD::BLENDI.
27434 if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
27435 (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
27436 (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
27437 uint64_t BlendMask = 0;
27438 bool ForceV1Zero = false, ForceV2Zero = false;
27439 SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
27440 if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
27442 if (MaskVT == MVT::v16i16) {
27443 // We can only use v16i16 PBLENDW if the lanes are repeated.
27444 SmallVector<int, 8> RepeatedMask;
27445 if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
27447 assert(RepeatedMask.size() == 8 &&
27448 "Repeated mask size doesn't match!");
27450 for (int i = 0; i < 8; ++i)
27451 if (RepeatedMask[i] >= 8)
27452 PermuteImm |= 1 << i;
27453 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27454 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27455 Shuffle = X86ISD::BLENDI;
27456 ShuffleVT = MaskVT;
27460 // Determine a type compatible with X86ISD::BLENDI.
27461 ShuffleVT = MaskVT;
27462 if (Subtarget.hasAVX2()) {
27463 if (ShuffleVT == MVT::v4i64)
27464 ShuffleVT = MVT::v8i32;
27465 else if (ShuffleVT == MVT::v2i64)
27466 ShuffleVT = MVT::v4i32;
27468 if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
27469 ShuffleVT = MVT::v8i16;
27470 else if (ShuffleVT == MVT::v4i64)
27471 ShuffleVT = MVT::v4f64;
27472 else if (ShuffleVT == MVT::v8i32)
27473 ShuffleVT = MVT::v8f32;
27476 if (!ShuffleVT.isFloatingPoint()) {
27477 int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
27479 scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
27480 ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
27481 ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
27484 V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
27485 V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
27486 PermuteImm = (unsigned)BlendMask;
27487 Shuffle = X86ISD::BLENDI;
27493 // Attempt to combine to INSERTPS.
27494 if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
27495 MaskVT.is128BitVector()) {
27496 if (Zeroable.getBoolValue() &&
27497 matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
27498 Shuffle = X86ISD::INSERTPS;
27499 ShuffleVT = MVT::v4f32;
27504 // Attempt to combine to SHUFPD.
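// (SHUFPD takes the low result element of each 128-bit lane from the first
// source and the high element from the second, with one immediate bit per
// element choosing that source's low or high half; e.g. on v2f64 an immediate
// of 0b01 yields { V1[1], V2[0] }.)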
27505 if (AllowFloatDomain && EltSizeInBits == 64 &&
27506 ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
27507 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27508 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27509 if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
27510 Shuffle = X86ISD::SHUFP;
27511 ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
27516 // Attempt to combine to SHUFPS.
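// (SHUFPS fills the low half of each 128-bit lane with two elements of the
// first source and the high half with two elements of the second, each picked
// by a 2-bit immediate field; the MatchHalf lambda below matches each half of
// the repeated mask against one source accordingly.)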
27517 if (AllowFloatDomain && EltSizeInBits == 32 &&
27518 ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
27519 (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
27520 (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
27521 SmallVector<int, 4> RepeatedMask;
27522 if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
27523 // Match each half of the repeated mask to determine whether it is just
27524 // referencing one of the vectors, is zeroable or is entirely undef.
27525 auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
27526 int M0 = RepeatedMask[Offset];
27527 int M1 = RepeatedMask[Offset + 1];
27529 if (isUndefInRange(RepeatedMask, Offset, 2)) {
27530 return DAG.getUNDEF(MaskVT);
27531 } else if (isUndefOrZeroInRange(RepeatedMask, Offset, 2)) {
27532 S0 = (SM_SentinelUndef == M0 ? -1 : 0);
27533 S1 = (SM_SentinelUndef == M1 ? -1 : 1);
27534 return getZeroVector(MaskVT, Subtarget, DAG, DL);
27535 } else if (isUndefOrInRange(M0, 0, 4) && isUndefOrInRange(M1, 0, 4)) {
27536 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27537 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27539 } else if (isUndefOrInRange(M0, 4, 8) && isUndefOrInRange(M1, 4, 8)) {
27540 S0 = (SM_SentinelUndef == M0 ? -1 : M0 & 3);
27541 S1 = (SM_SentinelUndef == M1 ? -1 : M1 & 3);
27548 int ShufMask[4] = {-1, -1, -1, -1};
27549 SDValue Lo = MatchHalf(0, ShufMask[0], ShufMask[1]);
27550 SDValue Hi = MatchHalf(2, ShufMask[2], ShufMask[3]);
27555 Shuffle = X86ISD::SHUFP;
27556 ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
27557 PermuteImm = getV4X86ShuffleImm(ShufMask);
27566 /// \brief Combine an arbitrary chain of shuffles into a single instruction if possible.
27569 /// This is the leaf of the recursive combine below. When we have found some
27570 /// chain of single-use x86 shuffle instructions and accumulated the combined
27571 /// shuffle mask represented by them, this will try to pattern match that mask
27572 /// into either a single instruction if there is a special purpose instruction
27573 /// for this operation, or into a PSHUFB instruction which is a fully general
27574 /// instruction but should only be used to replace chains over a certain depth.
27575 static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
27576 ArrayRef<int> BaseMask, int Depth,
27577 bool HasVariableMask, SelectionDAG &DAG,
27578 TargetLowering::DAGCombinerInfo &DCI,
27579 const X86Subtarget &Subtarget) {
27580 assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
27581 assert((Inputs.size() == 1 || Inputs.size() == 2) &&
27582 "Unexpected number of shuffle inputs!");
27584 // Find the inputs that enter the chain. Note that multiple uses are OK
27585 // here; we're not going to remove the operands we find.
27586 bool UnaryShuffle = (Inputs.size() == 1);
27587 SDValue V1 = peekThroughBitcasts(Inputs[0]);
27588 SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
27589 : peekThroughBitcasts(Inputs[1]));
27591 MVT VT1 = V1.getSimpleValueType();
27592 MVT VT2 = V2.getSimpleValueType();
27593 MVT RootVT = Root.getSimpleValueType();
27594 assert(VT1.getSizeInBits() == RootVT.getSizeInBits() &&
27595 VT2.getSizeInBits() == RootVT.getSizeInBits() &&
27596 "Vector size mismatch");
27601 unsigned NumBaseMaskElts = BaseMask.size();
27602 if (NumBaseMaskElts == 1) {
27603 assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
27604 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27609 unsigned RootSizeInBits = RootVT.getSizeInBits();
27610 unsigned NumRootElts = RootVT.getVectorNumElements();
27611 unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
27612 bool FloatDomain = VT1.isFloatingPoint() || VT2.isFloatingPoint() ||
27613 (RootVT.is256BitVector() && !Subtarget.hasAVX2());
27615 // Don't combine if we are an AVX512/EVEX target and the mask element size
27616 // is different from the root element size - this would prevent writemasks
27617 // from being reused.
27618 // TODO - this currently prevents all lane shuffles from occurring.
27619 // TODO - check for writemask usage instead of always preventing combining.
27620 // TODO - attempt to narrow Mask back to writemask size.
27621 bool IsEVEXShuffle =
27622 RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
27623 if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
27626 // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
27628 // Handle 128-bit lane shuffles of 256-bit vectors.
27629 // TODO - this should support binary shuffles.
27630 if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
27631 !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
27632 if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
27633 return false; // Nothing to do!
27634 MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
27635 unsigned PermMask = 0;
27636 PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
27637 PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
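// (In the VPERM2X128 immediate, bits [1:0] select the 128-bit lane written to
// the low half and bits [5:4] the lane written to the high half, while 0x08
// and 0x80 zero the respective half. For example, a BaseMask of {1, 0} swaps
// the two halves of V1 and encodes as 0x01.)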
27639 Res = DAG.getBitcast(ShuffleVT, V1);
27640 DCI.AddToWorklist(Res.getNode());
27641 Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
27642 DAG.getUNDEF(ShuffleVT),
27643 DAG.getConstant(PermMask, DL, MVT::i8));
27644 DCI.AddToWorklist(Res.getNode());
27645 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27650 // For masks that have been widened to 128-bit elements or more,
27651 // narrow back down to 64-bit elements.
27652 SmallVector<int, 64> Mask;
27653 if (BaseMaskEltSizeInBits > 64) {
27654 assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
27655 int MaskScale = BaseMaskEltSizeInBits / 64;
27656 scaleShuffleMask(MaskScale, BaseMask, Mask);
27658 Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
27661 unsigned NumMaskElts = Mask.size();
27662 unsigned MaskEltSizeInBits = RootSizeInBits / NumMaskElts;
27664 // Determine the effective mask value type.
27665 FloatDomain &= (32 <= MaskEltSizeInBits);
27666 MVT MaskVT = FloatDomain ? MVT::getFloatingPointVT(MaskEltSizeInBits)
27667 : MVT::getIntegerVT(MaskEltSizeInBits);
27668 MaskVT = MVT::getVectorVT(MaskVT, NumMaskElts);
27670 // Only allow legal mask types.
27671 if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
27674 // Attempt to match the mask against known shuffle patterns.
27675 MVT ShuffleSrcVT, ShuffleVT;
27676 unsigned Shuffle, PermuteImm;
27678 // Which shuffle domains are permitted?
27679 // Permit domain crossing at higher combine depths.
27680 bool AllowFloatDomain = FloatDomain || (Depth > 3);
27681 bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
27682 (!MaskVT.is256BitVector() || Subtarget.hasAVX2());
27684 // Determine zeroable mask elements.
27685 APInt Zeroable(NumMaskElts, 0);
27686 for (unsigned i = 0; i != NumMaskElts; ++i)
27687 if (isUndefOrZero(Mask[i]))
27688 Zeroable.setBit(i);
27690 if (UnaryShuffle) {
27691 // If we are shuffling an X86ISD::VZEXT_LOAD then we can use the load
27692 // directly if we don't shuffle the lower element and the upper (zero)
27693 // elements are only shuffled among themselves.
27694 if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
27695 (V1.getScalarValueSizeInBits() % MaskEltSizeInBits) == 0) {
27696 unsigned Scale = V1.getScalarValueSizeInBits() / MaskEltSizeInBits;
27697 ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
27698 if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
27699 isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
27700 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
27706 if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27707 V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
27709 if (Depth == 1 && Root.getOpcode() == Shuffle)
27710 return false; // Nothing to do!
27711 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27712 return false; // AVX512 Writemask clash.
27713 Res = DAG.getBitcast(ShuffleSrcVT, V1);
27714 DCI.AddToWorklist(Res.getNode());
27715 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
27716 DCI.AddToWorklist(Res.getNode());
27717 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27722 if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27723 AllowIntDomain, Subtarget, Shuffle,
27724 ShuffleVT, PermuteImm)) {
27725 if (Depth == 1 && Root.getOpcode() == Shuffle)
27726 return false; // Nothing to do!
27727 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27728 return false; // AVX512 Writemask clash.
27729 Res = DAG.getBitcast(ShuffleVT, V1);
27730 DCI.AddToWorklist(Res.getNode());
27731 Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
27732 DAG.getConstant(PermuteImm, DL, MVT::i8));
27733 DCI.AddToWorklist(Res.getNode());
27734 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27740 if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
27741 V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
27743 if (Depth == 1 && Root.getOpcode() == Shuffle)
27744 return false; // Nothing to do!
27745 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27746 return false; // AVX512 Writemask clash.
27747 V1 = DAG.getBitcast(ShuffleVT, V1);
27748 DCI.AddToWorklist(V1.getNode());
27749 V2 = DAG.getBitcast(ShuffleVT, V2);
27750 DCI.AddToWorklist(V2.getNode());
27751 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
27752 DCI.AddToWorklist(Res.getNode());
27753 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27758 if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
27759 AllowIntDomain, V1, V2, DL, DAG,
27760 Subtarget, Shuffle, ShuffleVT,
27762 if (Depth == 1 && Root.getOpcode() == Shuffle)
27763 return false; // Nothing to do!
27764 if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
27765 return false; // AVX512 Writemask clash.
27766 V1 = DAG.getBitcast(ShuffleVT, V1);
27767 DCI.AddToWorklist(V1.getNode());
27768 V2 = DAG.getBitcast(ShuffleVT, V2);
27769 DCI.AddToWorklist(V2.getNode());
27770 Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
27771 DAG.getConstant(PermuteImm, DL, MVT::i8));
27772 DCI.AddToWorklist(Res.getNode());
27773 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27778 // Typically from here on, we need an integer version of MaskVT.
27779 MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits);
27780 IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts);
27782 // Annoyingly, SSE4A instructions don't map into the above match helpers.
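// (EXTRQI extracts BitLen bits starting at bit BitIdx from the low 64 bits of
// its source and zero-extends them within the low 64 bits of the result, e.g.
// BitLen=16/BitIdx=16 moves bytes 2-3 into bytes 0-1; INSERTQI inserts the low
// BitLen bits of the second source into the first at bit BitIdx.)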
27783 if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) {
27784 uint64_t BitLen, BitIdx;
27785 if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
27787 if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
27788 return false; // Nothing to do!
27789 V1 = DAG.getBitcast(IntMaskVT, V1);
27790 DCI.AddToWorklist(V1.getNode());
27791 Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
27792 DAG.getConstant(BitLen, DL, MVT::i8),
27793 DAG.getConstant(BitIdx, DL, MVT::i8));
27794 DCI.AddToWorklist(Res.getNode());
27795 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27800 if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
27801 if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
27802 return false; // Nothing to do!
27803 V1 = DAG.getBitcast(IntMaskVT, V1);
27804 DCI.AddToWorklist(V1.getNode());
27805 V2 = DAG.getBitcast(IntMaskVT, V2);
27806 DCI.AddToWorklist(V2.getNode());
27807 Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2,
27808 DAG.getConstant(BitLen, DL, MVT::i8),
27809 DAG.getConstant(BitIdx, DL, MVT::i8));
27810 DCI.AddToWorklist(Res.getNode());
27811 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27817 // Don't try to re-form single instruction chains under any circumstances now
27818 // that we've done encoding canonicalization for them.
27822 bool MaskContainsZeros =
27823 any_of(Mask, [](int M) { return M == SM_SentinelZero; });
27825 if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
27826 // If we have a single input lane-crossing shuffle then lower to VPERMV.
27827 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27828 ((Subtarget.hasAVX2() &&
27829 (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27830 (Subtarget.hasAVX512() &&
27831 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27832 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27833 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27834 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27835 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27836 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27837 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27838 DCI.AddToWorklist(VPermMask.getNode());
27839 Res = DAG.getBitcast(MaskVT, V1);
27840 DCI.AddToWorklist(Res.getNode());
27841 Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
27842 DCI.AddToWorklist(Res.getNode());
27843 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27848 // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
27849 // vector as the second source.
27850 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
27851 ((Subtarget.hasAVX512() &&
27852 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27853 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27854 (Subtarget.hasVLX() &&
27855 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27856 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27857 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27858 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27859 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27860 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27861 // Adjust shuffle mask - replace SM_SentinelZero with second source index.
27862 for (unsigned i = 0; i != NumMaskElts; ++i)
27863 if (Mask[i] == SM_SentinelZero)
27864 Mask[i] = NumMaskElts + i;
27866 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27867 DCI.AddToWorklist(VPermMask.getNode());
27868 Res = DAG.getBitcast(MaskVT, V1);
27869 DCI.AddToWorklist(Res.getNode());
27870 SDValue Zero = getZeroVector(MaskVT, Subtarget, DAG, DL);
27871 DCI.AddToWorklist(Zero.getNode());
27872 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
27873 DCI.AddToWorklist(Res.getNode());
27874 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27879 // If we have a dual input lane-crossing shuffle then lower to VPERMV3.
27880 if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
27881 ((Subtarget.hasAVX512() &&
27882 (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
27883 MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
27884 (Subtarget.hasVLX() &&
27885 (MaskVT == MVT::v4f64 || MaskVT == MVT::v4i64 ||
27886 MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
27887 (Subtarget.hasBWI() && MaskVT == MVT::v32i16) ||
27888 (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) ||
27889 (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) ||
27890 (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) {
27891 SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true);
27892 DCI.AddToWorklist(VPermMask.getNode());
27893 V1 = DAG.getBitcast(MaskVT, V1);
27894 DCI.AddToWorklist(V1.getNode());
27895 V2 = DAG.getBitcast(MaskVT, V2);
27896 DCI.AddToWorklist(V2.getNode());
27897 Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
27898 DCI.AddToWorklist(Res.getNode());
27899 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27906 // See if we can combine a single input shuffle with zeros to a bit-mask,
27907 // which is much simpler than any shuffle.
27908 if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
27909 isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
27910 DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
27911 APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
27912 APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
27913 APInt UndefElts(NumMaskElts, 0);
27914 SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
27915 for (unsigned i = 0; i != NumMaskElts; ++i) {
27917 if (M == SM_SentinelUndef) {
27918 UndefElts.setBit(i);
27921 if (M == SM_SentinelZero)
27923 EltBits[i] = AllOnes;
27925 SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
27926 DCI.AddToWorklist(BitMask.getNode());
27927 Res = DAG.getBitcast(MaskVT, V1);
27928 DCI.AddToWorklist(Res.getNode());
27929 unsigned AndOpcode =
27930 FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
27931 Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
27932 DCI.AddToWorklist(Res.getNode());
27933 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27938 // If we have a single input shuffle with different shuffle patterns in the
27939 // 128-bit lanes, use a variable-mask VPERMILPS (X86ISD::VPERMILPV).
27940 // TODO: Combine other mask types at higher depths.
27941 if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
27942 ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
27943 (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
27944 SmallVector<SDValue, 16> VPermIdx;
27945 for (int M : Mask) {
27947 M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32);
27948 VPermIdx.push_back(Idx);
27950 SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx);
27951 DCI.AddToWorklist(VPermMask.getNode());
27952 Res = DAG.getBitcast(MaskVT, V1);
27953 DCI.AddToWorklist(Res.getNode());
27954 Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
27955 DCI.AddToWorklist(Res.getNode());
27956 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
27961 // With XOP, binary shuffles of 128/256-bit floating point vectors can combine
27962 // to VPERMIL2PD/VPERMIL2PS.
27963 if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
27964 (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
27965 MaskVT == MVT::v8f32)) {
27966 // VPERMIL2 Operation.
27967 // Bits[3] - Match Bit.
27968 // Bits[2:1] - (Per Lane) PD Shuffle Mask.
27969 // Bits[2:0] - (Per Lane) PS Shuffle Mask.
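// The loop below builds one selector per element: M % NumEltsPerLane picks the
// element within its lane, crossing into the second source adds NumEltsPerLane
// (setting the source-select bit; the value is doubled for PD since its field
// lives in bits [2:1]), and a zeroable element uses 8 to engage the match bit.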
27970 unsigned NumLanes = MaskVT.getSizeInBits() / 128;
27971 unsigned NumEltsPerLane = NumMaskElts / NumLanes;
27972 SmallVector<int, 8> VPerm2Idx;
27973 unsigned M2ZImm = 0;
27974 for (int M : Mask) {
27975 if (M == SM_SentinelUndef) {
27976 VPerm2Idx.push_back(-1);
27979 if (M == SM_SentinelZero) {
27981 VPerm2Idx.push_back(8);
27984 int Index = (M % NumEltsPerLane) + ((M / NumMaskElts) * NumEltsPerLane);
27985 Index = (MaskVT.getScalarSizeInBits() == 64 ? Index << 1 : Index);
27986 VPerm2Idx.push_back(Index);
27988 V1 = DAG.getBitcast(MaskVT, V1);
27989 DCI.AddToWorklist(V1.getNode());
27990 V2 = DAG.getBitcast(MaskVT, V2);
27991 DCI.AddToWorklist(V2.getNode());
27992 SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true);
27993 DCI.AddToWorklist(VPerm2MaskOp.getNode());
27994 Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
27995 DAG.getConstant(M2ZImm, DL, MVT::i8));
27996 DCI.AddToWorklist(Res.getNode());
27997 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28002 // If we have 3 or more shuffle instructions or a chain involving a variable
28003 // mask, we can replace them with a single PSHUFB instruction profitably.
28004 // Intel's manuals suggest only using PSHUFB if doing so replaces 5 or more
28005 // instructions, but in practice PSHUFB tends to be *very* fast so we're
28006 // more aggressive.
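// (PSHUFB selects each destination byte via a byte index within the same
// 128-bit lane and zeroes the byte when the control byte's high bit is set -
// hence the 255 constants below. The loop scales the wider mask down to bytes,
// e.g. a v4i32 mask element of 1 expands to byte indices {4,5,6,7}.)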
28007 if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
28008 ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
28009 (RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
28010 (RootVT.is512BitVector() && Subtarget.hasBWI()))) {
28011 SmallVector<SDValue, 16> PSHUFBMask;
28012 int NumBytes = RootVT.getSizeInBits() / 8;
28013 int Ratio = NumBytes / NumMaskElts;
28014 for (int i = 0; i < NumBytes; ++i) {
28015 int M = Mask[i / Ratio];
28016 if (M == SM_SentinelUndef) {
28017 PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8));
28020 if (M == SM_SentinelZero) {
28021 PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
28024 M = Ratio * M + i % Ratio;
28025 assert((M / 16) == (i / 16) && "Lane crossing detected");
28026 PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28028 MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
28029 Res = DAG.getBitcast(ByteVT, V1);
28030 DCI.AddToWorklist(Res.getNode());
28031 SDValue PSHUFBMaskOp = DAG.getBuildVector(ByteVT, DL, PSHUFBMask);
28032 DCI.AddToWorklist(PSHUFBMaskOp.getNode());
28033 Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
28034 DCI.AddToWorklist(Res.getNode());
28035 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28040 // With XOP, if we have a 128-bit binary input shuffle we can always combine
28041 // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
28042 // slower than PSHUFB on targets that support both.
28043 if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
28044 Subtarget.hasXOP()) {
28045 // VPPERM Mask Operation
28046 // Bits[4:0] - Byte Index (0 - 31)
28047 // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
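// (Byte indices 0-15 select from the first source and 16-31 from the second,
// so the scaled mask value below is used directly; the constant 128 sets the
// operation field to 4, which zeroes the destination byte.)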
28048 SmallVector<SDValue, 16> VPPERMMask;
28050 int Ratio = NumBytes / NumMaskElts;
28051 for (int i = 0; i < NumBytes; ++i) {
28052 int M = Mask[i / Ratio];
28053 if (M == SM_SentinelUndef) {
28054 VPPERMMask.push_back(DAG.getUNDEF(MVT::i8));
28057 if (M == SM_SentinelZero) {
28058 VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
28061 M = Ratio * M + i % Ratio;
28062 VPPERMMask.push_back(DAG.getConstant(M, DL, MVT::i8));
28064 MVT ByteVT = MVT::v16i8;
28065 V1 = DAG.getBitcast(ByteVT, V1);
28066 DCI.AddToWorklist(V1.getNode());
28067 V2 = DAG.getBitcast(ByteVT, V2);
28068 DCI.AddToWorklist(V2.getNode());
28069 SDValue VPPERMMaskOp = DAG.getBuildVector(ByteVT, DL, VPPERMMask);
28070 DCI.AddToWorklist(VPPERMMaskOp.getNode());
28071 Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
28072 DCI.AddToWorklist(Res.getNode());
28073 DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
28078 // Failed to find any combines.
28082 // Attempt to constant fold all of the constant source ops.
28083 // Returns true if the entire shuffle is folded to a constant.
28084 // TODO: Extend this to merge multiple constant Ops and update the mask.
28085 static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
28086 ArrayRef<int> Mask, SDValue Root,
28087 bool HasVariableMask, SelectionDAG &DAG,
28088 TargetLowering::DAGCombinerInfo &DCI,
28089 const X86Subtarget &Subtarget) {
28090 MVT VT = Root.getSimpleValueType();
28092 unsigned SizeInBits = VT.getSizeInBits();
28093 unsigned NumMaskElts = Mask.size();
28094 unsigned MaskSizeInBits = SizeInBits / NumMaskElts;
28095 unsigned NumOps = Ops.size();
28097 // Extract constant bits from each source op.
28098 bool OneUseConstantOp = false;
28099 SmallVector<APInt, 16> UndefEltsOps(NumOps);
28100 SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
28101 for (unsigned i = 0; i != NumOps; ++i) {
28102 SDValue SrcOp = Ops[i];
28103 OneUseConstantOp |= SrcOp.hasOneUse();
28104 if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
28109 // Only fold if at least one of the constants is only used once or
28110 // the combined shuffle has included a variable mask shuffle; this
28111 // is to avoid constant pool bloat.
28112 if (!OneUseConstantOp && !HasVariableMask)
28115 // Shuffle the constant bits according to the mask.
28116 APInt UndefElts(NumMaskElts, 0);
28117 APInt ZeroElts(NumMaskElts, 0);
28118 APInt ConstantElts(NumMaskElts, 0);
28119 SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
28120 APInt::getNullValue(MaskSizeInBits));
28121 for (unsigned i = 0; i != NumMaskElts; ++i) {
28123 if (M == SM_SentinelUndef) {
28124 UndefElts.setBit(i);
28126 } else if (M == SM_SentinelZero) {
28127 ZeroElts.setBit(i);
28130 assert(0 <= M && M < (int)(NumMaskElts * NumOps));
28132 unsigned SrcOpIdx = (unsigned)M / NumMaskElts;
28133 unsigned SrcMaskIdx = (unsigned)M % NumMaskElts;
28135 auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
28136 if (SrcUndefElts[SrcMaskIdx]) {
28137 UndefElts.setBit(i);
28141 auto &SrcEltBits = RawBitsOps[SrcOpIdx];
28142 APInt &Bits = SrcEltBits[SrcMaskIdx];
28144 ZeroElts.setBit(i);
28148 ConstantElts.setBit(i);
28149 ConstantBitData[i] = Bits;
28151 assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
28153 // Create the constant data.
28155 if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
28156 MaskSVT = MVT::getFloatingPointVT(MaskSizeInBits);
28158 MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
28160 MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
28163 SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
28164 DCI.AddToWorklist(CstOp.getNode());
28165 DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
28169 /// \brief Fully generic combining of x86 shuffle instructions.
28171 /// This should be the last combine run over the x86 shuffle instructions. Once
28172 /// they have been fully optimized, this will recursively consider all chains
28173 /// of single-use shuffle instructions, build a generic model of the cumulative
28174 /// shuffle operation, and check for simpler instructions which implement this
28175 /// operation. We use this primarily for two purposes:
28177 /// 1) Collapse generic shuffles to specialized single instructions when
28178 /// equivalent. In most cases, this is just an encoding size win, but
28179 /// sometimes we will collapse multiple generic shuffles into a single
28180 /// special-purpose shuffle.
28181 /// 2) Look for sequences of shuffle instructions with 3 or more total
28182 /// instructions, and replace them with the slightly more expensive SSSE3
28183 /// PSHUFB instruction if available. We do this as the last combining step
28184 /// to ensure we avoid using PSHUFB if we can implement the shuffle with
28185 /// a suitable short sequence of other instructions. The PSHUFB will either
28186 /// use a register or have to read from memory and so is slightly (but only
28187 /// slightly) more expensive than the other shuffle instructions.
28189 /// Because this is inherently a quadratic operation (for each shuffle in
28190 /// a chain, we recurse up the chain), the depth is limited to 8 instructions.
28191 /// This should never be an issue in practice as the shuffle lowering doesn't
28192 /// produce sequences of more than 8 instructions.
28194 /// FIXME: We will currently miss some cases where the redundant shuffling
28195 /// would simplify under the threshold for PSHUFB formation because of
28196 /// combine-ordering. To fix this, we should do the redundant instruction
28197 /// combining in this recursive walk.
28198 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
28199 int SrcOpIndex, SDValue Root,
28200 ArrayRef<int> RootMask,
28201 ArrayRef<const SDNode*> SrcNodes,
28202 int Depth, bool HasVariableMask,
28204 TargetLowering::DAGCombinerInfo &DCI,
28205 const X86Subtarget &Subtarget) {
28206 // Bound the depth of our recursive combine because this is ultimately
28207 // quadratic in nature.
28211 // Directly rip through bitcasts to find the underlying operand.
28212 SDValue Op = SrcOps[SrcOpIndex];
28213 Op = peekThroughOneUseBitcasts(Op);
28215 MVT VT = Op.getSimpleValueType();
28216 if (!VT.isVector())
28217 return false; // Bail if we hit a non-vector.
28219 assert(Root.getSimpleValueType().isVector() &&
28220 "Shuffles operate on vector types!");
28221 assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
28222 "Can only combine shuffles of the same vector register size.");
28224 // Extract target shuffle mask and resolve sentinels and inputs.
28225 SmallVector<int, 64> OpMask;
28226 SmallVector<SDValue, 2> OpInputs;
28227 if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
28230 assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
28231 SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
28232 SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
28234 // Add the inputs to the Ops list, avoiding duplicates.
28235 SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
28237 int InputIdx0 = -1, InputIdx1 = -1;
28238 for (int i = 0, e = Ops.size(); i < e; ++i) {
28239 SDValue BC = peekThroughBitcasts(Ops[i]);
28240 if (Input0 && BC == peekThroughBitcasts(Input0))
28242 if (Input1 && BC == peekThroughBitcasts(Input1))
28246 if (Input0 && InputIdx0 < 0) {
28247 InputIdx0 = SrcOpIndex;
28248 Ops[SrcOpIndex] = Input0;
28250 if (Input1 && InputIdx1 < 0) {
28251 InputIdx1 = Ops.size();
28252 Ops.push_back(Input1);
28255 assert(((RootMask.size() > OpMask.size() &&
28256 RootMask.size() % OpMask.size() == 0) ||
28257 (OpMask.size() > RootMask.size() &&
28258 OpMask.size() % RootMask.size() == 0) ||
28259 OpMask.size() == RootMask.size()) &&
28260 "The smaller number of elements must divide the larger.");
28262 // This function can be performance-critical, so we rely on the power-of-2
28263 // knowledge that we have about the mask sizes to replace div/rem ops with
28264 // bit-masks and shifts.
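// (E.g. with a 4-element mask the log2 value is 2, so an index divided by 4
// becomes ">> 2" and a modulo by 4 becomes "& 3" in the merge loop below.)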
28265 assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
28266 assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
28267 unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
28268 unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
28270 unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
28271 unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
28272 unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
28273 assert((RootRatio == 1 || OpRatio == 1) &&
28274 "Must not have a ratio for both incoming and op masks!");
28276 assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
28277 assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
28278 assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
28279 unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
28280 unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
28282 SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
28284 // Merge this shuffle operation's mask into our accumulated mask. Note that
28285 // this shuffle's mask will be the first applied to the input, followed by the
28286 // root mask to get us all the way to the root value arrangement. The reason
28287 // for this order is that we are recursing up the operation chain.
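// For example (ignoring the ratio scaling and the multi-input index adjustment
// handled below), a RootMask of {2,3,0,1} applied on top of an OpMask of
// {1,0,3,2} composes to OpMask[RootMask[i]] = {3,2,1,0}.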
28288 for (unsigned i = 0; i < MaskWidth; ++i) {
28289 unsigned RootIdx = i >> RootRatioLog2;
28290 if (RootMask[RootIdx] < 0) {
28291 // This is a zero or undef lane, we're done.
28292 Mask[i] = RootMask[RootIdx];
28296 unsigned RootMaskedIdx =
28298 ? RootMask[RootIdx]
28299 : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
28301 // Just insert the scaled root mask value if it references an input other
28302 // than the SrcOp we're currently inserting.
28303 if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
28304 (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
28305 Mask[i] = RootMaskedIdx;
28309 RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
28310 unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
28311 if (OpMask[OpIdx] < 0) {
28312 // The incoming lanes are zero or undef; it doesn't matter which ones we use.
28314 Mask[i] = OpMask[OpIdx];
28318 // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
28319 unsigned OpMaskedIdx =
28322 : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
28324 OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
28325 if (OpMask[OpIdx] < (int)OpMask.size()) {
28326 assert(0 <= InputIdx0 && "Unknown target shuffle input");
28327 OpMaskedIdx += InputIdx0 * MaskWidth;
28329 assert(0 <= InputIdx1 && "Unknown target shuffle input");
28330 OpMaskedIdx += InputIdx1 * MaskWidth;
28333 Mask[i] = OpMaskedIdx;
28336 // Handle the all undef/zero cases early.
28337 if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
28338 DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
28341 if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
28342 // TODO - should we handle the mixed zero/undef case as well? Just returning
28343 // a zero mask will lose information on undef elements, possibly reducing
28344 // future combine possibilities.
28345 DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
28346 Subtarget, DAG, SDLoc(Root)));
28350 // Remove unused shuffle source ops.
28351 resolveTargetShuffleInputsAndMask(Ops, Mask);
28352 assert(!Ops.empty() && "Shuffle with no inputs detected");
28354 HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
28356 // Update the list of shuffle nodes that have been combined so far.
28357 SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
28359 CombinedNodes.push_back(Op.getNode());
28361 // See if we can recurse into each shuffle source op (if it's a target
28362 // shuffle). The source op should only be combined if it either has a
28363 // single use (i.e. current Op) or all its users have already been combined.
28364 for (int i = 0, e = Ops.size(); i < e; ++i)
28365 if (Ops[i].getNode()->hasOneUse() ||
28366 SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
28367 if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
28368 Depth + 1, HasVariableMask, DAG, DCI,
28372 // Attempt to constant fold all of the constant source ops.
28373 if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
28377 // We can only combine unary and binary shuffle mask cases.
28378 if (Ops.size() > 2)
28381 // Minor canonicalization of the accumulated shuffle mask to make it easier
28382 // to match below. All this does is detect masks with sequential pairs of
28383 // elements, and shrink them to the half-width mask. It does this in a loop
28384 // so it will reduce the size of the mask to the minimal width mask which
28385 // performs an equivalent shuffle.
28386 SmallVector<int, 64> WidenedMask;
28387 while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
28388 Mask = std::move(WidenedMask);
28391 // Canonicalization of binary shuffle masks to improve pattern matching by
28392 // commuting the inputs.
28393 if (Ops.size() == 2 && canonicalizeShuffleMaskWithCommute(Mask)) {
28394 ShuffleVectorSDNode::commuteMask(Mask);
28395 std::swap(Ops[0], Ops[1]);
28398 return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
28402 /// \brief Get the PSHUF-style mask from a PSHUF node.
28404 /// This is a very minor wrapper around getTargetShuffleMask to ease forming v4
28405 /// PSHUF-style masks that can be reused with such instructions.
28406 static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
28407 MVT VT = N.getSimpleValueType();
28408 SmallVector<int, 4> Mask;
28409 SmallVector<SDValue, 2> Ops;
28412 getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary);
28416 // If we have more than 128 bits, only the low 128 bits of the shuffle mask
28417 // matter. Check that the upper masks are repeats and remove them.
28418 if (VT.getSizeInBits() > 128) {
28419 int LaneElts = 128 / VT.getScalarSizeInBits();
28421 for (int i = 1, NumLanes = VT.getSizeInBits() / 128; i < NumLanes; ++i)
28422 for (int j = 0; j < LaneElts; ++j)
28423 assert(Mask[j] == Mask[i * LaneElts + j] - (LaneElts * i) &&
28424 "Mask doesn't repeat in high 128-bit lanes!");
28426 Mask.resize(LaneElts);
28429 switch (N.getOpcode()) {
28430 case X86ISD::PSHUFD:
28432 case X86ISD::PSHUFLW:
28435 case X86ISD::PSHUFHW:
28436 Mask.erase(Mask.begin(), Mask.begin() + 4);
28437 for (int &M : Mask)
28441 llvm_unreachable("No valid shuffle instruction found!");
28445 /// \brief Search for a combinable shuffle across a chain ending in pshufd.
28447 /// We walk up the chain and look for a combinable shuffle, skipping over
28448 /// shuffles that we could hoist this shuffle's transformation past without
28449 /// altering anything.
28451 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
28452 SelectionDAG &DAG) {
28453 assert(N.getOpcode() == X86ISD::PSHUFD &&
28454 "Called with something other than an x86 128-bit half shuffle!");
28457 // Walk up a single-use chain looking for a combinable shuffle. Keep a stack
28458 // of the shuffles in the chain so that we can form a fresh chain to replace it.
28460 SmallVector<SDValue, 8> Chain;
28461 SDValue V = N.getOperand(0);
28462 for (; V.hasOneUse(); V = V.getOperand(0)) {
28463 switch (V.getOpcode()) {
28465 return SDValue(); // Nothing combined!
28468 // Skip bitcasts as we always know the type for the target specific shuffles.
28472 case X86ISD::PSHUFD:
28473 // Found another dword shuffle.
28476 case X86ISD::PSHUFLW:
28477 // Check that the low words (being shuffled) are the identity in the
28478 // dword shuffle, and the high words are self-contained.
28479 if (Mask[0] != 0 || Mask[1] != 1 ||
28480 !(Mask[2] >= 2 && Mask[2] < 4 && Mask[3] >= 2 && Mask[3] < 4))
28483 Chain.push_back(V);
28486 case X86ISD::PSHUFHW:
28487 // Check that the high words (being shuffled) are the identity in the
28488 // dword shuffle, and the low words are self-contained.
28489 if (Mask[2] != 2 || Mask[3] != 3 ||
28490 !(Mask[0] >= 0 && Mask[0] < 2 && Mask[1] >= 0 && Mask[1] < 2))
28493 Chain.push_back(V);
28496 case X86ISD::UNPCKL:
28497 case X86ISD::UNPCKH:
28498 // For either i8 -> i16 or i16 -> i32 unpacks, we can combine a dword
28499 // shuffle into a preceding word shuffle.
28500 if (V.getSimpleValueType().getVectorElementType() != MVT::i8 &&
28501 V.getSimpleValueType().getVectorElementType() != MVT::i16)
28504 // Search for a half-shuffle which we can combine with.
28505 unsigned CombineOp =
28506 V.getOpcode() == X86ISD::UNPCKL ? X86ISD::PSHUFLW : X86ISD::PSHUFHW;
28507 if (V.getOperand(0) != V.getOperand(1) ||
28508 !V->isOnlyUserOf(V.getOperand(0).getNode()))
28510 Chain.push_back(V);
28511 V = V.getOperand(0);
28513 switch (V.getOpcode()) {
28515 return SDValue(); // Nothing to combine.
28517 case X86ISD::PSHUFLW:
28518 case X86ISD::PSHUFHW:
28519 if (V.getOpcode() == CombineOp)
28522 Chain.push_back(V);
28526 V = V.getOperand(0);
28530 } while (V.hasOneUse());
28533 // Break out of the loop if we break out of the switch.
28537 if (!V.hasOneUse())
28538 // We fell out of the loop without finding a viable combining instruction.
28541 // Merge this node's mask and our incoming mask.
28542 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28543 for (int &M : Mask)
28545 V = DAG.getNode(V.getOpcode(), DL, V.getValueType(), V.getOperand(0),
28546 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28548 // Rebuild the chain around this new shuffle.
28549 while (!Chain.empty()) {
28550 SDValue W = Chain.pop_back_val();
28552 if (V.getValueType() != W.getOperand(0).getValueType())
28553 V = DAG.getBitcast(W.getOperand(0).getValueType(), V);
28555 switch (W.getOpcode()) {
28557 llvm_unreachable("Only PSHUF and UNPCK instructions get here!");
28559 case X86ISD::UNPCKL:
28560 case X86ISD::UNPCKH:
28561 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, V);
28564 case X86ISD::PSHUFD:
28565 case X86ISD::PSHUFLW:
28566 case X86ISD::PSHUFHW:
28567 V = DAG.getNode(W.getOpcode(), DL, W.getValueType(), V, W.getOperand(1));
28571 if (V.getValueType() != N.getValueType())
28572 V = DAG.getBitcast(N.getValueType(), V);
28574 // Return the new chain to replace N.
28578 /// \brief Search for a combinable shuffle across a chain ending in pshuflw or pshufhw.
28581 /// We walk up the chain, skipping shuffles of the other half and looking
28582 /// through shuffles which switch halves, trying to find a shuffle of the same
28583 /// pair of dwords.
28584 static bool combineRedundantHalfShuffle(SDValue N, MutableArrayRef<int> Mask,
28586 TargetLowering::DAGCombinerInfo &DCI) {
28588 (N.getOpcode() == X86ISD::PSHUFLW || N.getOpcode() == X86ISD::PSHUFHW) &&
28589 "Called with something other than an x86 128-bit half shuffle!");
28591 unsigned CombineOpcode = N.getOpcode();
28593 // Walk up a single-use chain looking for a combinable shuffle.
28594 SDValue V = N.getOperand(0);
28595 for (; V.hasOneUse(); V = V.getOperand(0)) {
28596 switch (V.getOpcode()) {
28598 return false; // Nothing combined!
28601 // Skip bitcasts as we always know the type for the target specific shuffles.
28605 case X86ISD::PSHUFLW:
28606 case X86ISD::PSHUFHW:
28607 if (V.getOpcode() == CombineOpcode)
28610 // Other-half shuffles are no-ops.
28613 // Break out of the loop if we break out of the switch.
28617 if (!V.hasOneUse())
28618 // We fell out of the loop without finding a viable combining instruction.
28621 // Combine away the bottom node as its shuffle will be accumulated into
28622 // a preceding shuffle.
28623 DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28625 // Record the old value.
28628 // Merge this node's mask and our incoming mask (adjusted to account for all
28629 // the pshufd instructions encountered).
28630 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28631 for (int &M : Mask)
28633 V = DAG.getNode(V.getOpcode(), DL, MVT::v8i16, V.getOperand(0),
28634 getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
28636 // Check that the shuffles didn't cancel each other out. If not, we need to
28637 // combine to the new one.
28639 // Replace the combinable shuffle with the combined one, updating all users
28640 // so that we re-evaluate the chain here.
28641 DCI.CombineTo(Old.getNode(), V, /*AddTo*/ true);
28646 /// \brief Try to combine x86 target specific shuffles.
28647 static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
28648 TargetLowering::DAGCombinerInfo &DCI,
28649 const X86Subtarget &Subtarget) {
28651 MVT VT = N.getSimpleValueType();
28652 SmallVector<int, 4> Mask;
28654 unsigned Opcode = N.getOpcode();
28656 case X86ISD::PSHUFD:
28657 case X86ISD::PSHUFLW:
28658 case X86ISD::PSHUFHW:
28659 Mask = getPSHUFShuffleMask(N);
28660 assert(Mask.size() == 4);
28662 case X86ISD::UNPCKL: {
28663 auto Op0 = N.getOperand(0);
28664 auto Op1 = N.getOperand(1);
28665 unsigned Opcode0 = Op0.getOpcode();
28666 unsigned Opcode1 = Op1.getOpcode();
28668 // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
28669 // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
28670 // TODO: Add other horizontal operations as required.
28671 if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
28672 return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
28674 // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
28675 // which X86ISD::UNPCKL has an ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
28676 // moves upper half elements into the lower half part. For example:
28678 // t2: v16i8 = vector_shuffle<8,9,10,11,12,13,14,15,u,u,u,u,u,u,u,u> t1,
28680 // t3: v16i8 = X86ISD::UNPCKL undef:v16i8, t2
28682 // will be combined to:
28684 // t3: v16i8 = X86ISD::UNPCKH undef:v16i8, t1
28686 // This is only for 128-bit vectors. From SSE4.1 onward this combine may not
28687 // occur because more advanced instructions are used instead.
28688 if (!VT.is128BitVector())
28691 if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
28692 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
28694 unsigned NumElts = VT.getVectorNumElements();
28695 SmallVector<int, 8> ExpectedMask(NumElts, -1);
28696 std::iota(ExpectedMask.begin(), ExpectedMask.begin() + NumElts / 2,
28699 auto ShufOp = Op1.getOperand(0);
28700 if (isShuffleEquivalent(Op1, ShufOp, Mask, ExpectedMask))
28701 return DAG.getNode(X86ISD::UNPCKH, DL, VT, N.getOperand(0), ShufOp);
28705 case X86ISD::BLENDI: {
28706 SDValue V0 = N->getOperand(0);
28707 SDValue V1 = N->getOperand(1);
28708 assert(VT == V0.getSimpleValueType() && VT == V1.getSimpleValueType() &&
28709 "Unexpected input vector types");
28711 // Canonicalize a v2f64 blend with a mask of 2 by swapping the vector
28712 // operands and changing the mask to 1. This saves us a bunch of
28713 // pattern-matching possibilities related to scalar math ops in SSE/AVX.
28714 // x86InstrInfo knows how to commute this back after instruction selection
28715 // if it would help register allocation.
28717 // TODO: If optimizing for size or a processor that doesn't suffer from
28718 // partial register update stalls, this should be transformed into a MOVSD
28719 // instruction because a MOVSD is 1-2 bytes smaller than a BLENDPD.
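// For example, a v2f64 blend with a mask of 2 produces { V0[0], V1[1] };
// swapping the operands and using a mask of 1 selects the same elements, since
// a set immediate bit takes that element from the second operand.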
28721 if (VT == MVT::v2f64)
28722 if (auto *Mask = dyn_cast<ConstantSDNode>(N->getOperand(2)))
28723 if (Mask->getZExtValue() == 2 && !isShuffleFoldableLoad(V0)) {
28724 SDValue NewMask = DAG.getConstant(1, DL, MVT::i8);
28725 return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
28730 case X86ISD::MOVSD:
28731 case X86ISD::MOVSS: {
28732 SDValue V0 = peekThroughBitcasts(N->getOperand(0));
28733 SDValue V1 = peekThroughBitcasts(N->getOperand(1));
28734 bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
28735 bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
28736 if (isZero0 && isZero1)
28739 // We often lower to MOVSD/MOVSS from integer as well as native float
28740 // types; remove unnecessary domain-crossing bitcasts if we can to make it
28741 // easier to combine shuffles later on. We've already accounted for the
28742 // domain switching cost when we decided to lower with it.
28743 bool isFloat = VT.isFloatingPoint();
28744 bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
28745 bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
28746 if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
28747 MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
28748 : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
28749 V0 = DAG.getBitcast(NewVT, V0);
28750 V1 = DAG.getBitcast(NewVT, V1);
28751 return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, NewVT, V0, V1));
28756 case X86ISD::INSERTPS: {
28757 assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
28758 SDValue Op0 = N.getOperand(0);
28759 SDValue Op1 = N.getOperand(1);
28760 SDValue Op2 = N.getOperand(2);
28761 unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
28762 unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
28763 unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
28764 unsigned ZeroMask = InsertPSMask & 0xF;
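// (The INSERTPS immediate packs the source element index in bits [7:6], the
// destination element index in bits [5:4] and a zero mask in bits [3:0]; e.g.
// 0x61 copies Op1[1] into element 2 of Op0 and zeroes element 0.)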
28766 // If we zero out all elements from Op0 then we don't need to reference it.
28767 if (((ZeroMask | (1u << DstIdx)) == 0xF) && !Op0.isUndef())
28768 return DAG.getNode(X86ISD::INSERTPS, DL, VT, DAG.getUNDEF(VT), Op1,
28769 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28771 // If we zero out the element from Op1 then we don't need to reference it.
28772 if ((ZeroMask & (1u << DstIdx)) && !Op1.isUndef())
28773 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28774 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28776 // Attempt to merge insertps Op1 with an inner target shuffle node.
28777 SmallVector<int, 8> TargetMask1;
28778 SmallVector<SDValue, 2> Ops1;
28779 if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) {
28780 int M = TargetMask1[SrcIdx];
28781 if (isUndefOrZero(M)) {
28782 // Zero/UNDEF insertion - zero out element and remove dependency.
28783 InsertPSMask |= (1u << DstIdx);
28784 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
28785 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28787 // Update insertps mask srcidx and reference the source input directly.
28788 assert(0 <= M && M < 8 && "Shuffle index out of range");
28789 InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6);
28790 Op1 = Ops1[M < 4 ? 0 : 1];
28791 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28792 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28795 // Attempt to merge insertps Op0 with an inner target shuffle node.
28796 SmallVector<int, 8> TargetMask0;
28797 SmallVector<SDValue, 2> Ops0;
28798 if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0))
28801 bool Updated = false;
28802 bool UseInput00 = false;
28803 bool UseInput01 = false;
28804 for (int i = 0; i != 4; ++i) {
28805 int M = TargetMask0[i];
28806 if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
28807 // No change if element is already zero or the inserted element.
28809 } else if (isUndefOrZero(M)) {
28810 // If the target mask is undef/zero then we must zero the element.
28811 InsertPSMask |= (1u << i);
28816 // The input vector element must be inline.
28817 if (M != i && M != (i + 4))
28820 // Determine which inputs of the target shuffle we're using.
28821 UseInput00 |= (0 <= M && M < 4);
28822 UseInput01 |= (4 <= M);
28825 // If we're not using both inputs of the target shuffle then use the
28826 // referenced input directly.
28827 if (UseInput00 && !UseInput01) {
28830 } else if (!UseInput00 && UseInput01) {
28836 return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1,
28837 DAG.getConstant(InsertPSMask, DL, MVT::i8));
28845 // Nuke no-op shuffles that show up after combining.
28846 if (isNoopShuffleMask(Mask))
28847 return DCI.CombineTo(N.getNode(), N.getOperand(0), /*AddTo*/ true);
28849 // Look for simplifications involving one or two shuffle instructions.
28850 SDValue V = N.getOperand(0);
28851 switch (N.getOpcode()) {
28854 case X86ISD::PSHUFLW:
28855 case X86ISD::PSHUFHW:
28856 assert(VT.getVectorElementType() == MVT::i16 && "Bad word shuffle type!");
28858 if (combineRedundantHalfShuffle(N, Mask, DAG, DCI))
28859 return SDValue(); // We combined away this shuffle, so we're done.
28861 // See if this reduces to a PSHUFD which is no more expensive and can
28862 // combine with more operations. Note that it has to at least flip the
28863 // dwords as otherwise it would have been removed as a no-op.
28864 if (makeArrayRef(Mask).equals({2, 3, 0, 1})) {
28865 int DMask[] = {0, 1, 2, 3};
28866 int DOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 2;
28867 DMask[DOffset + 0] = DOffset + 1;
28868 DMask[DOffset + 1] = DOffset + 0;
28869 MVT DVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
28870 V = DAG.getBitcast(DVT, V);
28871 DCI.AddToWorklist(V.getNode());
28872 V = DAG.getNode(X86ISD::PSHUFD, DL, DVT, V,
28873 getV4X86ShuffleImm8ForMask(DMask, DL, DAG));
28874 DCI.AddToWorklist(V.getNode());
28875 return DAG.getBitcast(VT, V);
28878 // Look for shuffle patterns which can be implemented as a single unpack.
28879 // FIXME: This doesn't handle the location of the PSHUFD generically, and
28880 // only works when we have a PSHUFD followed by two half-shuffles.
28881 if (Mask[0] == Mask[1] && Mask[2] == Mask[3] &&
28882 (V.getOpcode() == X86ISD::PSHUFLW ||
28883 V.getOpcode() == X86ISD::PSHUFHW) &&
28884 V.getOpcode() != N.getOpcode() &&
28886 SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
28887 if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
28888 SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
28889 SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
28890 int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28891 int VOffset = V.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
28893 for (int i = 0; i < 4; ++i) {
28894 WordMask[i + NOffset] = Mask[i] + NOffset;
28895 WordMask[i + VOffset] = VMask[i] + VOffset;
28897 // Map the word mask through the DWord mask.
28899 for (int i = 0; i < 8; ++i)
28900 MappedMask[i] = 2 * DMask[WordMask[i] / 2] + WordMask[i] % 2;
28901 if (makeArrayRef(MappedMask).equals({0, 0, 1, 1, 2, 2, 3, 3}) ||
28902 makeArrayRef(MappedMask).equals({4, 4, 5, 5, 6, 6, 7, 7})) {
28903 // We can replace all three shuffles with an unpack.
28904 V = DAG.getBitcast(VT, D.getOperand(0));
28905 DCI.AddToWorklist(V.getNode());
28906 return DAG.getNode(MappedMask[0] == 0 ? X86ISD::UNPCKL
28915 case X86ISD::PSHUFD:
28916 if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
28925 /// Returns true iff the shuffle node \p N can be replaced with an ADDSUB
28926 /// operation. If true is returned then the operands of the ADDSUB operation
28927 /// are written to the parameters \p Opnd0 and \p Opnd1.
28929 /// We combine shuffles to ADDSUB directly on the abstract vector shuffle nodes
28930 /// so they are easier to match generically. We also insert dummy vector shuffle
28931 /// nodes for the operands which explicitly discard the lanes which are unused
28932 /// by this operation, to try to flow the fact that they're unused through the
28933 /// rest of the combiner.
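///
/// For example, for v4f32 a shuffle <0,5,2,7> of (fsub A, B) and (fadd A, B)
/// keeps the subtracted even elements and the added odd elements, which is
/// exactly the ADDSUB(A, B) result.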
28934 static bool isAddSub(SDNode *N, const X86Subtarget &Subtarget,
28935 SDValue &Opnd0, SDValue &Opnd1) {
28937 EVT VT = N->getValueType(0);
28938 if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
28939 (!Subtarget.hasAVX() || (VT != MVT::v8f32 && VT != MVT::v4f64)) &&
28940 (!Subtarget.hasAVX512() || (VT != MVT::v16f32 && VT != MVT::v8f64)))
28943 // We only handle target-independent shuffles.
28944 // FIXME: It would be easy and harmless to use the target shuffle mask
28945 // extraction tool to support more.
28946 if (N->getOpcode() != ISD::VECTOR_SHUFFLE)
28949 ArrayRef<int> OrigMask = cast<ShuffleVectorSDNode>(N)->getMask();
28950 SmallVector<int, 16> Mask(OrigMask.begin(), OrigMask.end());
28952 SDValue V1 = N->getOperand(0);
28953 SDValue V2 = N->getOperand(1);
28955 // We require the first shuffle operand to be the FSUB node, and the second to
28956 // be the FADD node.
28957 if (V1.getOpcode() == ISD::FADD && V2.getOpcode() == ISD::FSUB) {
28958 ShuffleVectorSDNode::commuteMask(Mask);
28960 } else if (V1.getOpcode() != ISD::FSUB || V2.getOpcode() != ISD::FADD)
28963 // If there are other uses of these operations we can't fold them.
28964 if (!V1->hasOneUse() || !V2->hasOneUse())
28967 // Ensure that both operations have the same operands. Note that we can
28968 // commute the FADD operands.
28969 SDValue LHS = V1->getOperand(0), RHS = V1->getOperand(1);
28970 if ((V2->getOperand(0) != LHS || V2->getOperand(1) != RHS) &&
28971 (V2->getOperand(0) != RHS || V2->getOperand(1) != LHS))
28974 // We're looking for blends between FADD and FSUB nodes. We insist on these
28975 // nodes being lined up in a specific expected pattern.
28976 if (!(isShuffleEquivalent(V1, V2, Mask, {0, 3}) ||
28977 isShuffleEquivalent(V1, V2, Mask, {0, 5, 2, 7}) ||
28978 isShuffleEquivalent(V1, V2, Mask, {0, 9, 2, 11, 4, 13, 6, 15}) ||
28979 isShuffleEquivalent(V1, V2, Mask, {0, 17, 2, 19, 4, 21, 6, 23,
28980 8, 25, 10, 27, 12, 29, 14, 31})))
28988 /// \brief Try to combine a shuffle into a target-specific add-sub or
28989 /// mul-add-sub node.
28990 static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
28991 const X86Subtarget &Subtarget,
28992 SelectionDAG &DAG) {
28993 SDValue Opnd0, Opnd1;
28994 if (!isAddSub(N, Subtarget, Opnd0, Opnd1))
28997 EVT VT = N->getValueType(0);
29000 // Try to generate X86ISD::FMADDSUB node here.
29002 if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
29003 return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
29005 // Do not generate X86ISD::ADDSUB node for 512-bit types even though
29006 // the ADDSUB idiom has been successfully recognized. There are no known
29007 // X86 targets with 512-bit ADDSUB instructions!
29008 if (VT.is512BitVector())
29011 return DAG.getNode(X86ISD::ADDSUB, DL, VT, Opnd0, Opnd1);
29014 // We are looking for a shuffle where both sources are concatenated with undef
29015 // and have a width that is half of the output's width. AVX2 has VPERMD/Q, so
29016 // if we can express this as a single-source shuffle, that's preferable.
29017 static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
29018 const X86Subtarget &Subtarget) {
29019 if (!Subtarget.hasAVX2() || !isa<ShuffleVectorSDNode>(N))
29022 EVT VT = N->getValueType(0);
29024 // We only care about shuffles of 128/256-bit vectors of 32/64-bit values.
29025 if (!VT.is128BitVector() && !VT.is256BitVector())
29028 if (VT.getVectorElementType() != MVT::i32 &&
29029 VT.getVectorElementType() != MVT::i64 &&
29030 VT.getVectorElementType() != MVT::f32 &&
29031 VT.getVectorElementType() != MVT::f64)
29034 SDValue N0 = N->getOperand(0);
29035 SDValue N1 = N->getOperand(1);
29037 // Check that both sources are concats with undef.
29038 if (N0.getOpcode() != ISD::CONCAT_VECTORS ||
29039 N1.getOpcode() != ISD::CONCAT_VECTORS || N0.getNumOperands() != 2 ||
29040 N1.getNumOperands() != 2 || !N0.getOperand(1).isUndef() ||
29041 !N1.getOperand(1).isUndef())
29044 // Construct the new shuffle mask. Elements from the first source retain their
29045 // index, but elements from the second source no longer need to skip an undef.
29046 SmallVector<int, 8> Mask;
29047 int NumElts = VT.getVectorNumElements();
29049 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29050 for (int Elt : SVOp->getMask())
29051 Mask.push_back(Elt < NumElts ? Elt : (Elt - NumElts / 2));
29054 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, N0.getOperand(0),
29056 return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
29059 static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
29060 TargetLowering::DAGCombinerInfo &DCI,
29061 const X86Subtarget &Subtarget) {
29063 EVT VT = N->getValueType(0);
29064 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29065 // If we have legalized the vector types, look for blends of FADD and FSUB
29066 // nodes that we can fuse into an ADDSUB node.
29067 if (TLI.isTypeLegal(VT))
29068 if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
29071 // During Type Legalization, when promoting illegal vector types,
29072 // the backend might introduce new shuffle dag nodes and bitcasts.
29074 // This code performs the following transformation:
29075 // fold: (shuffle (bitcast (BINOP A, B)), Undef, <Mask>) ->
29076 // (shuffle (BINOP (bitcast A), (bitcast B)), Undef, <Mask>)
29078 // We do this only if both the bitcast and the BINOP dag nodes have
29079 // one use. Also, perform this transformation only if the new binary
29080 // operation is legal. This is to avoid introducing dag nodes that
29081 // potentially need to be further expanded (or custom lowered) into a
29082 // less optimal sequence of dag nodes.
29083 if (!DCI.isBeforeLegalize() && DCI.isBeforeLegalizeOps() &&
29084 N->getOpcode() == ISD::VECTOR_SHUFFLE &&
29085 N->getOperand(0).getOpcode() == ISD::BITCAST &&
29086 N->getOperand(1).isUndef() && N->getOperand(0).hasOneUse()) {
29087 SDValue N0 = N->getOperand(0);
29088 SDValue N1 = N->getOperand(1);
29090 SDValue BC0 = N0.getOperand(0);
29091 EVT SVT = BC0.getValueType();
29092 unsigned Opcode = BC0.getOpcode();
29093 unsigned NumElts = VT.getVectorNumElements();
29095 if (BC0.hasOneUse() && SVT.isVector() &&
29096 SVT.getVectorNumElements() * 2 == NumElts &&
29097 TLI.isOperationLegal(Opcode, VT)) {
29098 bool CanFold = false;
29104 // isOperationLegal lies for integer ops on floating point types.
29105 CanFold = VT.isInteger();
29110 // isOperationLegal lies for floating point ops on integer types.
29111 CanFold = VT.isFloatingPoint();
29115 unsigned SVTNumElts = SVT.getVectorNumElements();
29116 ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(N);
29117 for (unsigned i = 0, e = SVTNumElts; i != e && CanFold; ++i)
29118 CanFold = SVOp->getMaskElt(i) == (int)(i * 2);
29119 for (unsigned i = SVTNumElts, e = NumElts; i != e && CanFold; ++i)
29120 CanFold = SVOp->getMaskElt(i) < 0;
29123 SDValue BC00 = DAG.getBitcast(VT, BC0.getOperand(0));
29124 SDValue BC01 = DAG.getBitcast(VT, BC0.getOperand(1));
29125 SDValue NewBinOp = DAG.getNode(BC0.getOpcode(), dl, VT, BC00, BC01);
29126 return DAG.getVectorShuffle(VT, dl, NewBinOp, N1, SVOp->getMask());
29131 // Combine a vector_shuffle that is equal to build_vector load1, load2, load3,
29132 // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
29133 // consecutive, non-overlapping, and in the right order.
29134 SmallVector<SDValue, 16> Elts;
29135 for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
29136 if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
29137 Elts.push_back(Elt);
29144 if (Elts.size() == VT.getVectorNumElements())
29146 EltsFromConsecutiveLoads(VT, Elts, dl, DAG, Subtarget, true))
29149 // For AVX2, we sometimes want to combine
29150 // (vector_shuffle <mask> (concat_vectors t1, undef)
29151 // (concat_vectors t2, undef))
29153 // (vector_shuffle <mask> (concat_vectors t1, t2), undef)
29154 // Since the latter can be efficiently lowered with VPERMD/VPERMQ
29155 if (SDValue ShufConcat = combineShuffleOfConcatUndef(N, DAG, Subtarget))
29158 if (isTargetShuffle(N->getOpcode())) {
29160 if (SDValue Shuffle = combineTargetShuffle(Op, DAG, DCI, Subtarget))
29163 // Try recursively combining arbitrary sequences of x86 shuffle
29164 // instructions into higher-order shuffles. We do this after combining
29165 // specific PSHUF instruction sequences into their minimal form so that we
29166 // can evaluate how many specialized shuffle instructions are involved in
29167 // a particular chain.
29168 SmallVector<int, 1> NonceMask; // Just a placeholder.
29169 NonceMask.push_back(0);
29170 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
29171 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
29173 return SDValue(); // This routine will use CombineTo to replace N.
29179 /// Check if a vector extract from a target-specific shuffle of a load can be
29180 /// folded into a single element load.
29181 /// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
29182 /// shuffles have been custom lowered so we need to handle those here.
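/// For example, an extract of element 0 from a target shuffle whose selected
/// source is a one-use vector load can be turned back into a plain
/// vector_shuffle of that load, letting DAGCombiner narrow it to a scalar
/// load of just the addressed element.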
29183 static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
29184 TargetLowering::DAGCombinerInfo &DCI) {
29185 if (DCI.isBeforeLegalizeOps())
29188 SDValue InVec = N->getOperand(0);
29189 SDValue EltNo = N->getOperand(1);
29190 EVT EltVT = N->getValueType(0);
29192 if (!isa<ConstantSDNode>(EltNo))
29195 EVT OriginalVT = InVec.getValueType();
29197 // Peek through bitcasts, but don't duplicate a load with other uses.
29198 InVec = peekThroughOneUseBitcasts(InVec);
29200 EVT CurrentVT = InVec.getValueType();
29201 if (!CurrentVT.isVector() ||
29202 CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
29205 if (!isTargetShuffle(InVec.getOpcode()))
29208 // Don't duplicate a load with other uses.
29209 if (!InVec.hasOneUse())
29212 SmallVector<int, 16> ShuffleMask;
29213 SmallVector<SDValue, 2> ShuffleOps;
29215 if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
29216 ShuffleOps, ShuffleMask, UnaryShuffle))
29219 // Select the input vector, guarding against an out-of-range extract index.
29220 unsigned NumElems = CurrentVT.getVectorNumElements();
29221 int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
29222 int Idx = (Elt > (int)NumElems) ? SM_SentinelUndef : ShuffleMask[Elt];
29224 if (Idx == SM_SentinelZero)
29225 return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
29226 : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
29227 if (Idx == SM_SentinelUndef)
29228 return DAG.getUNDEF(EltVT);
29230 assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range");
29231 SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0]
29234 // If inputs to shuffle are the same for both ops, then allow 2 uses
29235 unsigned AllowedUses =
29236 (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
29238 if (LdNode.getOpcode() == ISD::BITCAST) {
29239 // Don't duplicate a load with other uses.
29240 if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
29243 AllowedUses = 1; // only allow 1 load use if we have a bitcast
29244 LdNode = LdNode.getOperand(0);
29247 if (!ISD::isNormalLoad(LdNode.getNode()))
29250 LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
29252 if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || LN0->isVolatile())
29255 // If there's a bitcast before the shuffle, check if the load type and
29256 // alignment are valid.
29257 unsigned Align = LN0->getAlignment();
29258 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29259 unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
29260 EltVT.getTypeForEVT(*DAG.getContext()));
29262 if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
29265 // All checks match so transform back to vector_shuffle so that DAG combiner
29266 // can finish the job
29269 // Create the shuffle node, taking into account the case that it's a unary shuffle
29270 SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1];
29271 Shuffle = DAG.getVectorShuffle(CurrentVT, dl, ShuffleOps[0], Shuffle,
29273 Shuffle = DAG.getBitcast(OriginalVT, Shuffle);
29274 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
29278 // Try to match patterns such as
29279 // (i16 bitcast (v16i1 x))
29281 // -> (i16 movmsk (v16i8 sext (v16i1 x)))
29282 // before the illegal vector is scalarized on subtargets that don't have legal vxi1 types.
29284 static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
29285 const X86Subtarget &Subtarget) {
29286 EVT VT = BitCast.getValueType();
29287 SDValue N0 = BitCast.getOperand(0);
29288 EVT VecVT = N0->getValueType(0);
29290 if (!VT.isScalarInteger() || !VecVT.isSimple())
29293 // With AVX512 vxi1 types are legal and we prefer using k-regs.
29294 // MOVMSK is supported in SSE2 or later.
29295 if (Subtarget.hasAVX512() || !Subtarget.hasSSE2())
29298 // There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
29299 // v8f64. So all legal 128-bit and 256-bit vectors are covered except for
29300 // v8i16 and v16i16.
29301 // For these two cases, we can shuffle the upper element bytes to a
29302 // consecutive sequence at the start of the vector and treat the results as
29303 // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
29304 // for v16i16 this is not the case, because the shuffle is expensive, so we
29305 // avoid sign-extending to this type entirely.
29306 // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
29307 // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef)
29309 MVT FPCastVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
29310 switch (VecVT.getSimpleVT().SimpleTy) {
29314 SExtVT = MVT::v2i64;
29315 FPCastVT = MVT::v2f64;
29318 SExtVT = MVT::v4i32;
29319 FPCastVT = MVT::v4f32;
29320 // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
29321 // sign-extend to a 256-bit operation to avoid truncation.
29322 if (N0->getOpcode() == ISD::SETCC &&
29323 N0->getOperand(0)->getValueType(0).is256BitVector() &&
29324 Subtarget.hasInt256()) {
29325 SExtVT = MVT::v4i64;
29326 FPCastVT = MVT::v4f64;
29330 SExtVT = MVT::v8i16;
29331 // For cases such as (i8 bitcast (v8i1 setcc v8i32 v1, v2)),
29332 // sign-extend to a 256-bit operation to match the compare.
29333 // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
29334 // 256-bit because the shuffle is cheaper than sign extending the result of the compare.
29336 if (N0->getOpcode() == ISD::SETCC &&
29337 N0->getOperand(0)->getValueType(0).is256BitVector() &&
29338 Subtarget.hasInt256()) {
29339 SExtVT = MVT::v8i32;
29340 FPCastVT = MVT::v8f32;
29344 SExtVT = MVT::v16i8;
29345 // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)),
29346 // it is not profitable to sign-extend to 256-bit because this will
29347 // require an extra cross-lane shuffle which is more expensive than
29348 // truncating the result of the compare to 128-bits.
29351 // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
29352 if (!Subtarget.hasInt256())
29354 SExtVT = MVT::v32i8;
29359 SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
29360 if (SExtVT == MVT::v8i16) {
29361 V = DAG.getBitcast(MVT::v16i8, V);
29362 V = DAG.getVectorShuffle(
29363 MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
29364 {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
29366 assert(SExtVT.getScalarType() != MVT::i16 &&
29367 "Vectors of i16 must be shuffled");
29368 if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
29369 V = DAG.getBitcast(FPCastVT, V);
29370 V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
29371 return DAG.getZExtOrTrunc(V, DL, VT);
29374 static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
29375 TargetLowering::DAGCombinerInfo &DCI,
29376 const X86Subtarget &Subtarget) {
29377 SDValue N0 = N->getOperand(0);
29378 EVT VT = N->getValueType(0);
29379 EVT SrcVT = N0.getValueType();
29381 // Try to match patterns such as
29382 // (i16 bitcast (v16i1 x))
29384 // -> (i16 movmsk (v16i8 sext (v16i1 x)))
29385 // before the setcc result is scalarized on subtargets that don't have legal vxi1 types.
29387 if (DCI.isBeforeLegalize())
29388 if (SDValue V = combineBitcastvxi1(DAG, SDValue(N, 0), Subtarget))
29390 // Since MMX types are special and don't usually play with other vector types,
29391 // it's better to handle them early to be sure we emit efficient code by
29392 // avoiding store-load conversions.
29394 // Detect bitcasts from i32 to the x86mmx low word.
29395 if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
29396 SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
29397 SDValue N00 = N0->getOperand(0);
29398 if (N00.getValueType() == MVT::i32)
29399 return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
29402 // Detect bitcasts from element or subvector extractions to x86mmx.
29403 if (VT == MVT::x86mmx &&
29404 (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
29405 N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
29406 isNullConstant(N0.getOperand(1))) {
29407 SDValue N00 = N0->getOperand(0);
29408 if (N00.getValueType().is128BitVector())
29409 return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
29410 DAG.getBitcast(MVT::v2i64, N00));
29413 // Detect bitcasts from FP_TO_SINT to x86mmx.
29414 if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
29415 N0.getOpcode() == ISD::FP_TO_SINT) {
29417 SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
29418 DAG.getUNDEF(MVT::v2i32));
29419 return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
29420 DAG.getBitcast(MVT::v2i64, Res));
29423 // Convert a bitcasted integer logic operation that has one bitcasted
29424 // floating-point operand into a floating-point logic operation. This may
29425 // create a load of a constant, but that is cheaper than materializing the
29426 // constant in an integer register and transferring it to an SSE register or
29427 // transferring the SSE operand to integer register and back.
29429 switch (N0.getOpcode()) {
29430 case ISD::AND: FPOpcode = X86ISD::FAND; break;
29431 case ISD::OR: FPOpcode = X86ISD::FOR; break;
29432 case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
29433 default: return SDValue();
29436 if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
29437 (Subtarget.hasSSE2() && VT == MVT::f64)))
29440 SDValue LogicOp0 = N0.getOperand(0);
29441 SDValue LogicOp1 = N0.getOperand(1);
29444 // bitcast(logic(bitcast(X), Y)) --> logic'(X, bitcast(Y))
29445 if (N0.hasOneUse() && LogicOp0.getOpcode() == ISD::BITCAST &&
29446 LogicOp0.hasOneUse() && LogicOp0.getOperand(0).getValueType() == VT &&
29447 !isa<ConstantSDNode>(LogicOp0.getOperand(0))) {
29448 SDValue CastedOp1 = DAG.getBitcast(VT, LogicOp1);
29449 return DAG.getNode(FPOpcode, DL0, VT, LogicOp0.getOperand(0), CastedOp1);
29451 // bitcast(logic(X, bitcast(Y))) --> logic'(bitcast(X), Y)
29452 if (N0.hasOneUse() && LogicOp1.getOpcode() == ISD::BITCAST &&
29453 LogicOp1.hasOneUse() && LogicOp1.getOperand(0).getValueType() == VT &&
29454 !isa<ConstantSDNode>(LogicOp1.getOperand(0))) {
29455 SDValue CastedOp0 = DAG.getBitcast(VT, LogicOp0);
29456 return DAG.getNode(FPOpcode, DL0, VT, LogicOp1.getOperand(0), CastedOp0);
29462 // Match a binop + shuffle pyramid that represents a horizontal reduction over
29463 // the elements of a vector.
29464 // Returns the vector that is being reduced on, or SDValue() if a reduction
29465 // was not matched.
29466 static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
29467 // The pattern must end in an extract from index 0.
29468 if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
29469 !isNullConstant(Extract->getOperand(1)))
29473 Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
29475 SDValue Op = Extract->getOperand(0);
29476 // At each stage, we're looking for something that looks like:
29477 // %s = shufflevector <8 x i32> %op, <8 x i32> undef,
29478 // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
29479 // i32 undef, i32 undef, i32 undef, i32 undef>
29480 // %a = binop <8 x i32> %op, %s
29481 // Where the mask changes according to the stage. E.g. for a 3-stage pyramid,
29482 // we expect something like:
29483 // <4,5,6,7,u,u,u,u>
29484 // <2,3,u,u,u,u,u,u>
29485 // <1,u,u,u,u,u,u,u>
29486 for (unsigned i = 0; i < Stages; ++i) {
29487 if (Op.getOpcode() != BinOp)
29490 ShuffleVectorSDNode *Shuffle =
29491 dyn_cast<ShuffleVectorSDNode>(Op.getOperand(0).getNode());
29493 Op = Op.getOperand(1);
29495 Shuffle = dyn_cast<ShuffleVectorSDNode>(Op.getOperand(1).getNode());
29496 Op = Op.getOperand(0);
29499 // The first operand of the shuffle should be the same as the other operand of the binop.
29501 if (!Shuffle || (Shuffle->getOperand(0) != Op))
29504 // Verify the shuffle has the expected (at this stage of the pyramid) mask.
29505 for (int Index = 0, MaskEnd = 1 << i; Index < MaskEnd; ++Index)
29506 if (Shuffle->getMaskElt(Index) != MaskEnd + Index)
29513 // Given a select, detect the following pattern:
29514 // 1: %2 = zext <N x i8> %0 to <N x i32>
29515 // 2: %3 = zext <N x i8> %1 to <N x i32>
29516 // 3: %4 = sub nsw <N x i32> %2, %3
29517 // 4: %5 = icmp sgt <N x i32> %4, [0 x N] or [-1 x N]
29518 // 5: %6 = sub nsw <N x i32> zeroinitializer, %4
29519 // 6: %7 = select <N x i1> %5, <N x i32> %4, <N x i32> %6
29520 // This is useful as it is the input into a SAD pattern.
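// In other words, the select computes abs(zext(%0) - zext(%1)): the absolute
// difference of two i8 vectors evaluated in i32.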
29521 static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
29523 // Check the condition of the select instruction is greater-than.
29524 SDValue SetCC = Select->getOperand(0);
29525 if (SetCC.getOpcode() != ISD::SETCC)
29527 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
29528 if (CC != ISD::SETGT && CC != ISD::SETLT)
29531 SDValue SelectOp1 = Select->getOperand(1);
29532 SDValue SelectOp2 = Select->getOperand(2);
29534 // The following instructions assume SelectOp1 is the subtraction operand
29535 // and SelectOp2 is the negation operand.
29536 // In the case of SETLT this is the other way around.
29537 if (CC == ISD::SETLT)
29538 std::swap(SelectOp1, SelectOp2);
29540 // The second operand of the select should be the negation of the first
29541 // operand, which is implemented as 0 - SelectOp1.
29542 if (!(SelectOp2.getOpcode() == ISD::SUB &&
29543 ISD::isBuildVectorAllZeros(SelectOp2.getOperand(0).getNode()) &&
29544 SelectOp2.getOperand(1) == SelectOp1))
29547 // The first operand of SetCC is the first operand of the select, which is the
29548 // difference between the two input vectors.
29549 if (SetCC.getOperand(0) != SelectOp1)
29552 // In the SETLT case, the second operand of the comparison can be either 1 or 0.
29554 if ((CC == ISD::SETLT) &&
29555 !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal,
29556 /*AllowShrink*/false) &&
29557 SplatVal.isOneValue()) ||
29558 (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
29562 // In the SETGT case, the second operand of the comparison can be either -1 or 0.
29562 if ((CC == ISD::SETGT) &&
29563 !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
29564 ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
29567 // The first operand of the select is the difference between the two input vectors.
29569 if (SelectOp1.getOpcode() != ISD::SUB)
29572 Op0 = SelectOp1.getOperand(0);
29573 Op1 = SelectOp1.getOperand(1);
29575 // Check if the operands of the sub are zero-extended from vectors of i8.
29576 if (Op0.getOpcode() != ISD::ZERO_EXTEND ||
29577 Op0.getOperand(0).getValueType().getVectorElementType() != MVT::i8 ||
29578 Op1.getOpcode() != ISD::ZERO_EXTEND ||
29579 Op1.getOperand(0).getValueType().getVectorElementType() != MVT::i8)
29585 // Given two zexts of <k x i8> to <k x i32>, create a PSADBW of the inputs to these zexts.
29587 static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
29588 const SDValue &Zext1, const SDLoc &DL) {
29590 // Find the appropriate width for the PSADBW.
29591 EVT InVT = Zext0.getOperand(0).getValueType();
29592 unsigned RegSize = std::max(128u, InVT.getSizeInBits());
29594 // "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
29595 // fill in the missing vector elements with 0.
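// For example, a v8i8 input (64 bits) is concatenated with one v8i8 zero
// vector to form the v16i8 operand of a 128-bit PSADBW.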
29596 unsigned NumConcat = RegSize / InVT.getSizeInBits();
29597 SmallVector<SDValue, 16> Ops(NumConcat, DAG.getConstant(0, DL, InVT));
29598 Ops[0] = Zext0.getOperand(0);
29599 MVT ExtendedVT = MVT::getVectorVT(MVT::i8, RegSize / 8);
29600 SDValue SadOp0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29601 Ops[0] = Zext1.getOperand(0);
29602 SDValue SadOp1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, ExtendedVT, Ops);
29604 // Actually build the SAD
29605 MVT SadVT = MVT::getVectorVT(MVT::i64, RegSize / 64);
29606 return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
29609 // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
29610 static SDValue combineHorizontalPredicateResult(SDNode *Extract,
29612 const X86Subtarget &Subtarget) {
29613 // Bail without SSE2 or with AVX512VL (which uses predicate registers).
29614 if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
29617 EVT ExtractVT = Extract->getValueType(0);
29618 unsigned BitWidth = ExtractVT.getSizeInBits();
29619 if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
29620 ExtractVT != MVT::i8)
29623 // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
29624 for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
29625 SDValue Match = matchBinOpReduction(Extract, Op);
29629 // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
29630 // which we can't support here for now.
29631 if (Match.getScalarValueSizeInBits() != BitWidth)
29634 // We require AVX2 for PMOVMSKB for v16i16/v32i8;
29635 unsigned MatchSizeInBits = Match.getValueSizeInBits();
29636 if (!(MatchSizeInBits == 128 ||
29637 (MatchSizeInBits == 256 &&
29638 ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
29641 // Don't bother performing this for 2-element vectors.
29642 if (Match.getValueType().getVectorNumElements() <= 2)
29645 // Check that we are extracting a reduction of all sign bits.
29646 if (DAG.ComputeNumSignBits(Match) != BitWidth)
29649 // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
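// For example, a v4i32 reduction is bitcast to v4f32 and tested with
// (X86ISD::MOVMSK v4f32), while a v16i8 reduction uses (X86ISD::MOVMSK v16i8).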
29651 if (64 == BitWidth || 32 == BitWidth)
29652 MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
29653 MatchSizeInBits / BitWidth);
29655 MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
29658 ISD::CondCode CondCode;
29659 if (Op == ISD::OR) {
29660 // any_of -> MOVMSK != 0
29661 CompareBits = APInt::getNullValue(32);
29662 CondCode = ISD::CondCode::SETNE;
29664 // all_of -> MOVMSK == ((1 << NumElts) - 1)
29665 CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
29666 CondCode = ISD::CondCode::SETEQ;
29669 // Perform the select as i32/i64 and then truncate to avoid partial register stalls.
29671 unsigned ResWidth = std::max(BitWidth, 32u);
29672 EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
29674 SDValue Zero = DAG.getConstant(0, DL, ResVT);
29675 SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
29676 SDValue Res = DAG.getBitcast(MaskVT, Match);
29677 Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
29678 Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
29679 Ones, Zero, CondCode);
29680 return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
29686 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
29687 const X86Subtarget &Subtarget) {
29688 // PSADBW is only supported on SSE2 and up.
29689 if (!Subtarget.hasSSE2())
29692 // Verify the type we're extracting from is an integer type wider than i16.
29693 EVT VT = Extract->getOperand(0).getValueType();
29694 if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
29697 unsigned RegSize = 128;
29698 if (Subtarget.hasBWI())
29700 else if (Subtarget.hasAVX2())
29703 // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
29704 // TODO: We should be able to handle larger vectors by splitting them before
29705 // feeding them into several SADs, and then reducing over those.
29706 if (RegSize / VT.getVectorNumElements() < 8)
29709 // Match shuffle + add pyramid.
29710 SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
29712 // The operand is expected to be zero extended from i8
29713 // (verified in detectZextAbsDiff).
29714 // In order to convert to i64 and above, an additional any/zero/sign
29715 // extend is expected.
29716 // The zero extend from 32 bits has no mathematical effect on the result.
29717 // Also, the sign extend is effectively a zero extend
29718 // (it extends the sign bit, which is zero).
29719 // So it is correct to skip the sign/zero extend instruction.
29720 if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
29721 Root.getOpcode() == ISD::ZERO_EXTEND ||
29722 Root.getOpcode() == ISD::ANY_EXTEND))
29723 Root = Root.getOperand(0);
29725 // If there was a match, we want Root to be a select that is the root of an
29726 // abs-diff pattern.
29727 if (!Root || (Root.getOpcode() != ISD::VSELECT))
29730 // Check whether we have an abs-diff pattern feeding into the select.
29731 SDValue Zext0, Zext1;
29732 if (!detectZextAbsDiff(Root, Zext0, Zext1))
29735 // Create the SAD instruction.
29737 SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
29739 // If the original vector was wider than 8 elements, sum over the results
29740 // in the SAD vector.
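// For example, a v2i64 SAD result is reduced in a single step:
//   SAD = add SAD, (shuffle<1,u> SAD)
// leaving the final sum in element 0.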
29741 unsigned Stages = Log2_32(VT.getVectorNumElements());
29742 MVT SadVT = SAD.getSimpleValueType();
29744 unsigned SadElems = SadVT.getVectorNumElements();
29746 for(unsigned i = Stages - 3; i > 0; --i) {
29747 SmallVector<int, 16> Mask(SadElems, -1);
29748 for(unsigned j = 0, MaskEnd = 1 << (i - 1); j < MaskEnd; ++j)
29749 Mask[j] = MaskEnd + j;
29752 DAG.getVectorShuffle(SadVT, DL, SAD, DAG.getUNDEF(SadVT), Mask);
29753 SAD = DAG.getNode(ISD::ADD, DL, SadVT, SAD, Shuffle);
29757 MVT Type = Extract->getSimpleValueType(0);
29758 unsigned TypeSizeInBits = Type.getSizeInBits();
29759 // Return the lowest TypeSizeInBits bits.
29760 MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
29761 SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
29762 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
29763 Extract->getOperand(1));
29766 // Attempt to peek through a target shuffle and extract the scalar from the source.
29768 static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
29769 TargetLowering::DAGCombinerInfo &DCI,
29770 const X86Subtarget &Subtarget) {
29771 if (DCI.isBeforeLegalizeOps())
29774 SDValue Src = N->getOperand(0);
29775 SDValue Idx = N->getOperand(1);
29777 EVT VT = N->getValueType(0);
29778 EVT SrcVT = Src.getValueType();
29779 EVT SrcSVT = SrcVT.getVectorElementType();
29780 unsigned NumSrcElts = SrcVT.getVectorNumElements();
29782 // Don't attempt this for boolean mask vectors or unknown extraction indices.
29783 if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
29786 // Resolve the target shuffle inputs and mask.
29787 SmallVector<int, 16> Mask;
29788 SmallVector<SDValue, 2> Ops;
29789 if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask, DAG))
29792 // Attempt to narrow/widen the shuffle mask to the correct size.
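// For example, if the source is v4i32 but the resolved mask has 16 byte-sized
// elements, try to widen the mask back to 4 elements; conversely, a mask that
// is too narrow is scaled up to match the source element count.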
29793 if (Mask.size() != NumSrcElts) {
29794 if ((NumSrcElts % Mask.size()) == 0) {
29795 SmallVector<int, 16> ScaledMask;
29796 int Scale = NumSrcElts / Mask.size();
29797 scaleShuffleMask(Scale, Mask, ScaledMask);
29798 Mask = std::move(ScaledMask);
29799 } else if ((Mask.size() % NumSrcElts) == 0) {
29800 SmallVector<int, 16> WidenedMask;
29801 while (Mask.size() > NumSrcElts &&
29802 canWidenShuffleElements(Mask, WidenedMask))
29803 Mask = std::move(WidenedMask);
29804 // TODO - investigate support for wider shuffle masks with known upper
29805 // undef/zero elements for implicit zero-extension.
29809 // Check if narrowing/widening failed.
29810 if (Mask.size() != NumSrcElts)
29813 int SrcIdx = Mask[N->getConstantOperandVal(1)];
29816 // If the shuffle source element is undef/zero then we can just accept it.
29817 if (SrcIdx == SM_SentinelUndef)
29818 return DAG.getUNDEF(VT);
29820 if (SrcIdx == SM_SentinelZero)
29821 return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
29822 : DAG.getConstant(0, dl, VT);
29824 SDValue SrcOp = Ops[SrcIdx / Mask.size()];
29825 SrcOp = DAG.getBitcast(SrcVT, SrcOp);
29826 SrcIdx = SrcIdx % Mask.size();
29828 // We can only extract other elements from 128-bit vectors and in certain
29829 // circumstances, depending on SSE-level.
29830 // TODO: Investigate using extract_subvector for larger vectors.
29831 // TODO: Investigate float/double extraction if it will be just stored.
29832 if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
29833 ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
29834 assert(SrcSVT == VT && "Unexpected extraction type");
29835 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
29836 DAG.getIntPtrConstant(SrcIdx, dl));
29839 if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
29840 (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
29841 assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
29842 "Unexpected extraction type");
29843 unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
29844 SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
29845 DAG.getIntPtrConstant(SrcIdx, dl));
29846 SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
29847 DAG.getValueType(SrcSVT));
29848 return DAG.getZExtOrTrunc(Assert, dl, VT);
29854 /// Detect vector gather/scatter index generation and convert it from being a
29855 /// bunch of shuffles and extracts into a somewhat faster sequence.
29856 /// For i686, the best sequence is apparently storing the value and loading
29857 /// scalars back, while for x64 we should use 64-bit extracts and shifts.
29858 static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
29859 TargetLowering::DAGCombinerInfo &DCI,
29860 const X86Subtarget &Subtarget) {
29861 if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
29864 if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
29867 SDValue InputVector = N->getOperand(0);
29868 SDValue EltIdx = N->getOperand(1);
29870 EVT SrcVT = InputVector.getValueType();
29871 EVT VT = N->getValueType(0);
29872 SDLoc dl(InputVector);
29874 // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
29875 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29876 VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
29877 SDValue MMXSrc = InputVector.getOperand(0);
29879 // The bitcast source is a direct mmx result.
29880 if (MMXSrc.getValueType() == MVT::x86mmx)
29881 return DAG.getBitcast(VT, InputVector);
29884 // Detect mmx to i32 conversion through a v2i32 elt extract.
29885 if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
29886 VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
29887 SDValue MMXSrc = InputVector.getOperand(0);
29889 // The bitcast source is a direct mmx result.
29890 if (MMXSrc.getValueType() == MVT::x86mmx)
29891 return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
29894 if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
29895 isa<ConstantSDNode>(EltIdx) &&
29896 isa<ConstantSDNode>(InputVector.getOperand(0))) {
29897 uint64_t ExtractedElt = N->getConstantOperandVal(1);
29898 uint64_t InputValue = InputVector.getConstantOperandVal(0);
29899 uint64_t Res = (InputValue >> ExtractedElt) & 1;
29900 return DAG.getConstant(Res, dl, MVT::i1);
29903 // Check whether this extract is the root of a sum of absolute differences
29904 // pattern. This has to be done here because we really want it to happen
29905 // pre-legalization.
29906 if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
29909 // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
29910 if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
29913 // Only operate on vectors of 4 elements, where the alternative shuffling
29914 // gets to be more expensive.
29915 if (SrcVT != MVT::v4i32)
29918 // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
29919 // single use which is a sign-extend or zero-extend, and all elements are used.
29921 SmallVector<SDNode *, 4> Uses;
29922 unsigned ExtractedElements = 0;
29923 for (SDNode::use_iterator UI = InputVector.getNode()->use_begin(),
29924 UE = InputVector.getNode()->use_end(); UI != UE; ++UI) {
29925 if (UI.getUse().getResNo() != InputVector.getResNo())
29928 SDNode *Extract = *UI;
29929 if (Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
29932 if (Extract->getValueType(0) != MVT::i32)
29934 if (!Extract->hasOneUse())
29936 if (Extract->use_begin()->getOpcode() != ISD::SIGN_EXTEND &&
29937 Extract->use_begin()->getOpcode() != ISD::ZERO_EXTEND)
29939 if (!isa<ConstantSDNode>(Extract->getOperand(1)))
29942 // Record which element was extracted.
29943 ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
29944 Uses.push_back(Extract);
29947 // If not all the elements were used, this may not be worthwhile.
29948 if (ExtractedElements != 15)
29951 // Ok, we've now decided to do the transformation.
29952 // If 64-bit shifts are legal, use the extract-shift sequence,
29953 // otherwise bounce the vector off the cache.
29954 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
29957 if (TLI.isOperationLegal(ISD::SRA, MVT::i64)) {
29958 SDValue Cst = DAG.getBitcast(MVT::v2i64, InputVector);
29959 auto &DL = DAG.getDataLayout();
29960 EVT VecIdxTy = DAG.getTargetLoweringInfo().getVectorIdxTy(DL);
29961 SDValue BottomHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29962 DAG.getConstant(0, dl, VecIdxTy));
29963 SDValue TopHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Cst,
29964 DAG.getConstant(1, dl, VecIdxTy));
29966 SDValue ShAmt = DAG.getConstant(
29967 32, dl, DAG.getTargetLoweringInfo().getShiftAmountTy(MVT::i64, DL));
29968 Vals[0] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BottomHalf);
29969 Vals[1] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29970 DAG.getNode(ISD::SRA, dl, MVT::i64, BottomHalf, ShAmt));
29971 Vals[2] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, TopHalf);
29972 Vals[3] = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32,
29973 DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
29975 // Store the value to a temporary stack slot.
29976 SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
29977 SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
29978 MachinePointerInfo());
29980 EVT ElementType = SrcVT.getVectorElementType();
29981 unsigned EltSize = ElementType.getSizeInBits() / 8;
29983 // Replace each use (extract) with a load of the appropriate element.
29984 for (unsigned i = 0; i < 4; ++i) {
29985 uint64_t Offset = EltSize * i;
29986 auto PtrVT = TLI.getPointerTy(DAG.getDataLayout());
29987 SDValue OffsetVal = DAG.getConstant(Offset, dl, PtrVT);
29989 SDValue ScalarAddr =
29990 DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, OffsetVal);
29992 // Load the scalar.
29994 DAG.getLoad(ElementType, dl, Ch, ScalarAddr, MachinePointerInfo());
29998 // Replace the extracts
29999 for (SmallVectorImpl<SDNode *>::iterator UI = Uses.begin(),
30000 UE = Uses.end(); UI != UE; ++UI) {
30001 SDNode *Extract = *UI;
30003 uint64_t IdxVal = Extract->getConstantOperandVal(1);
30004 DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
30007 // The replacement was made in place; don't return anything.
30011 // TODO - merge with combineExtractVectorElt once it can handle the implicit
30012 // zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
30013 // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
30014 // combineBasicSADPattern.
30015 static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
30016 TargetLowering::DAGCombinerInfo &DCI,
30017 const X86Subtarget &Subtarget) {
30018 return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
30021 /// If a vector select has an operand that is -1 or 0, try to simplify the
30022 /// select to a bitwise logic operation.
30024 combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
30025 TargetLowering::DAGCombinerInfo &DCI,
30026 const X86Subtarget &Subtarget) {
30027 SDValue Cond = N->getOperand(0);
30028 SDValue LHS = N->getOperand(1);
30029 SDValue RHS = N->getOperand(2);
30030 EVT VT = LHS.getValueType();
30031 EVT CondVT = Cond.getValueType();
30033 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30035 if (N->getOpcode() != ISD::VSELECT)
30038 assert(CondVT.isVector() && "Vector select expects a vector selector!");
30040 bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30041 // Check if the first operand is all zeros and Cond type is vXi1.
30042 // This situation only applies to avx512.
30043 if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
30044 CondVT.getVectorElementType() == MVT::i1) {
30045 // Invert the cond to not(cond) : xor(op,allones)=not(op)
30046 SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
30047 DAG.getAllOnesConstant(DL, CondVT));
30048 // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
30049 return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
30052 // To use the condition operand as a bitwise mask, it must have elements that
30053 // are the same size as the select elements. Ie, the condition operand must
30054 // have already been promoted from the IR select condition type <N x i1>.
30055 // Don't check if the types themselves are equal because that excludes
30056 // vector floating-point selects.
30057 if (CondVT.getScalarSizeInBits() != VT.getScalarSizeInBits())
30060 bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
30061 FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
30063 // Try to invert the condition if the true value is not all 1s and the false value is not all 0s.
30065 if (!TValIsAllOnes && !FValIsAllZeros &&
30066 // Check if the selector will be produced by CMPP*/PCMP*.
30067 Cond.getOpcode() == ISD::SETCC &&
30068 // Check if SETCC has already been promoted.
30069 TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
30071 bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
30072 bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
30074 if (TValIsAllZeros || FValIsAllOnes) {
30075 SDValue CC = Cond.getOperand(2);
30076 ISD::CondCode NewCC =
30077 ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
30078 Cond.getOperand(0).getValueType().isInteger());
30079 Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
30081 std::swap(LHS, RHS);
30082 TValIsAllOnes = FValIsAllOnes;
30083 FValIsAllZeros = TValIsAllZeros;
30087 // vselect Cond, 111..., 000... -> Cond
30088 if (TValIsAllOnes && FValIsAllZeros)
30089 return DAG.getBitcast(VT, Cond);
30091 if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(CondVT))
30094 // vselect Cond, 111..., X -> or Cond, X
30095 if (TValIsAllOnes) {
30096 SDValue CastRHS = DAG.getBitcast(CondVT, RHS);
30097 SDValue Or = DAG.getNode(ISD::OR, DL, CondVT, Cond, CastRHS);
30098 return DAG.getBitcast(VT, Or);
30101 // vselect Cond, X, 000... -> and Cond, X
30102 if (FValIsAllZeros) {
30103 SDValue CastLHS = DAG.getBitcast(CondVT, LHS);
30104 SDValue And = DAG.getNode(ISD::AND, DL, CondVT, Cond, CastLHS);
30105 return DAG.getBitcast(VT, And);
30111 static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
30112 SDValue Cond = N->getOperand(0);
30113 SDValue LHS = N->getOperand(1);
30114 SDValue RHS = N->getOperand(2);
30117 auto *TrueC = dyn_cast<ConstantSDNode>(LHS);
30118 auto *FalseC = dyn_cast<ConstantSDNode>(RHS);
30119 if (!TrueC || !FalseC)
30122 // Don't do this for illegal integer types.
30123 if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
30126 // If this is efficiently invertible, canonicalize the LHSC/RHSC values
30127 // so that TrueC (the true value) is larger than FalseC.
30128 bool NeedsCondInvert = false;
30129 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
30130 // Efficiently invertible.
30131 (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
30132 (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
30133 isa<ConstantSDNode>(Cond.getOperand(1))))) {
30134 NeedsCondInvert = true;
30135 std::swap(TrueC, FalseC);
30138 // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
30139 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
30140 if (NeedsCondInvert) // Invert the condition if needed.
30141 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30142 DAG.getConstant(1, DL, Cond.getValueType()));
30144 // Zero extend the condition if needed.
30145 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
30147 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
30148 return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
30149 DAG.getConstant(ShAmt, DL, MVT::i8));
30152 // Optimize cases that will turn into an LEA instruction. This requires
30153 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
30154 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
30155 uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
30156 if (N->getValueType(0) == MVT::i32)
30157 Diff = (unsigned)Diff;
30159 bool IsFastMultiplier = false;
30161 switch ((unsigned char)Diff) {
30164 case 1: // result = add base, cond
30165 case 2: // result = lea base( , cond*2)
30166 case 3: // result = lea base(cond, cond*2)
30167 case 4: // result = lea base( , cond*4)
30168 case 5: // result = lea base(cond, cond*4)
30169 case 8: // result = lea base( , cond*8)
30170 case 9: // result = lea base(cond, cond*8)
30171 IsFastMultiplier = true;
30176 if (IsFastMultiplier) {
30177 APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
30178 if (NeedsCondInvert) // Invert the condition if needed.
30179 Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
30180 DAG.getConstant(1, DL, Cond.getValueType()));
30182 // Zero extend the condition if needed.
30183 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
30184 // Scale the condition by the difference.
30186 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
30187 DAG.getConstant(Diff, DL, Cond.getValueType()));
30189 // Add the base if non-zero.
30190 if (FalseC->getAPIntValue() != 0)
30191 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
30192 SDValue(FalseC, 0));
30200 // If this is a bitcasted op that can be represented as another type, push
30201 // the bitcast to the inputs. This allows more opportunities for pattern
30202 // matching masked instructions. This is called when we know that the operation
30203 // is used as one of the inputs of a vselect.
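// For example, a v8i64 X86ISD::VALIGN with immediate 2 that is bitcast to
// v16i32 can be rewritten as a v16i32 X86ISD::VALIGN with immediate 4 on
// bitcast operands, which can then fold into a v16i32 masked operation.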
30204 static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
30205 TargetLowering::DAGCombinerInfo &DCI) {
30206 // Make sure we have a bitcast.
30207 if (OrigOp.getOpcode() != ISD::BITCAST)
30210 SDValue Op = OrigOp.getOperand(0);
30212 // If the operation is used by anything other than the bitcast, we shouldn't
30213 // do this combine as that would replicate the operation.
30214 if (!Op.hasOneUse())
30217 MVT VT = OrigOp.getSimpleValueType();
30218 MVT EltVT = VT.getVectorElementType();
30219 SDLoc DL(Op.getNode());
30221 auto BitcastAndCombineShuffle = [&](unsigned Opcode, SDValue Op0, SDValue Op1,
30223 Op0 = DAG.getBitcast(VT, Op0);
30224 DCI.AddToWorklist(Op0.getNode());
30225 Op1 = DAG.getBitcast(VT, Op1);
30226 DCI.AddToWorklist(Op1.getNode());
30227 DCI.CombineTo(OrigOp.getNode(),
30228 DAG.getNode(Opcode, DL, VT, Op0, Op1, Op2));
30232 unsigned Opcode = Op.getOpcode();
30234 case X86ISD::PALIGNR:
30235 // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
30236 if (!VT.is128BitVector())
30238 Opcode = X86ISD::VALIGN;
30240 case X86ISD::VALIGN: {
30241 if (EltVT != MVT::i32 && EltVT != MVT::i64)
30243 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30244 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30245 unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
30246 unsigned EltSize = EltVT.getSizeInBits();
30247 // Make sure we can represent the same shift with the new VT.
30248 if ((ShiftAmt % EltSize) != 0)
30250 Imm = ShiftAmt / EltSize;
30251 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30252 DAG.getConstant(Imm, DL, MVT::i8));
30254 case X86ISD::SHUF128: {
30255 if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
30257 // Only change element size, not type.
30258 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30260 return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
30263 case ISD::INSERT_SUBVECTOR: {
30264 unsigned EltSize = EltVT.getSizeInBits();
30265 if (EltSize != 32 && EltSize != 64)
30267 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30268 // Only change element size, not type.
30269 if (EltVT.isInteger() != OpEltVT.isInteger())
30271 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
30272 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30273 SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
30274 DCI.AddToWorklist(Op0.getNode());
30275 // Op1 needs to be bitcasted to a smaller vector with the same element type.
30276 SDValue Op1 = Op.getOperand(1);
30277 MVT Op1VT = MVT::getVectorVT(EltVT,
30278 Op1.getSimpleValueType().getSizeInBits() / EltSize);
30279 Op1 = DAG.getBitcast(Op1VT, Op1);
30280 DCI.AddToWorklist(Op1.getNode());
30281 DCI.CombineTo(OrigOp.getNode(),
30282 DAG.getNode(Opcode, DL, VT, Op0, Op1,
30283 DAG.getIntPtrConstant(Imm, DL)));
30286 case ISD::EXTRACT_SUBVECTOR: {
30287 unsigned EltSize = EltVT.getSizeInBits();
30288 if (EltSize != 32 && EltSize != 64)
30290 MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
30291 // Only change element size, not type.
30292 if (EltVT.isInteger() != OpEltVT.isInteger())
30294 uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
30295 Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
30296 // Op0 needs to be bitcasted to a larger vector with the same element type.
30297 SDValue Op0 = Op.getOperand(0);
30298 MVT Op0VT = MVT::getVectorVT(EltVT,
30299 Op0.getSimpleValueType().getSizeInBits() / EltSize);
30300 Op0 = DAG.getBitcast(Op0VT, Op0);
30301 DCI.AddToWorklist(Op0.getNode());
30302 DCI.CombineTo(OrigOp.getNode(),
30303 DAG.getNode(Opcode, DL, VT, Op0,
30304 DAG.getIntPtrConstant(Imm, DL)));
30307 case X86ISD::SUBV_BROADCAST: {
30308 unsigned EltSize = EltVT.getSizeInBits();
30309 if (EltSize != 32 && EltSize != 64)
30311 // Only change element size, not type.
30312 if (VT.isInteger() != Op.getSimpleValueType().isInteger())
30314 SDValue Op0 = Op.getOperand(0);
30315 MVT Op0VT = MVT::getVectorVT(EltVT,
30316 Op0.getSimpleValueType().getSizeInBits() / EltSize);
30317 Op0 = DAG.getBitcast(Op0VT, Op.getOperand(0));
30318 DCI.AddToWorklist(Op0.getNode());
30319 DCI.CombineTo(OrigOp.getNode(),
30320 DAG.getNode(Opcode, DL, VT, Op0));
30328 /// Do target-specific dag combines on SELECT and VSELECT nodes.
30329 static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
30330 TargetLowering::DAGCombinerInfo &DCI,
30331 const X86Subtarget &Subtarget) {
30333 SDValue Cond = N->getOperand(0);
30334 // Get the LHS/RHS of the select.
30335 SDValue LHS = N->getOperand(1);
30336 SDValue RHS = N->getOperand(2);
30337 EVT VT = LHS.getValueType();
30338 EVT CondVT = Cond.getValueType();
30339 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
30341 // If we have SSE[12] support, try to form min/max nodes. SSE min/max
30342 // instructions match the semantics of the common C idiom x<y?x:y but not
30343 // x<=y?x:y, because of how they handle negative zero (which can be
30344 // ignored in unsafe-math mode).
30345 // We also try to create v2f32 min/max nodes, which we later widen to v4f32.
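// For example, (select (setcc x, y, setolt), x, y) can be turned into
// (X86ISD::FMIN x, y), subject to the NaN / signed-zero caveats handled in
// the switch below.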
30346 if (Cond.getOpcode() == ISD::SETCC && VT.isFloatingPoint() &&
30347 VT != MVT::f80 && VT != MVT::f128 &&
30348 (TLI.isTypeLegal(VT) || VT == MVT::v2f32) &&
30349 (Subtarget.hasSSE2() ||
30350 (Subtarget.hasSSE1() && VT.getScalarType() == MVT::f32))) {
30351 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30353 unsigned Opcode = 0;
30354 // Check for x CC y ? x : y.
30355 if (DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30356 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30360 // Converting this to a min would handle NaNs incorrectly, and swapping
30361 // the operands would cause it to handle comparisons between positive
30362 // and negative zero incorrectly.
30363 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30364 if (!DAG.getTarget().Options.UnsafeFPMath &&
30365 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30367 std::swap(LHS, RHS);
30369 Opcode = X86ISD::FMIN;
30372 // Converting this to a min would handle comparisons between positive
30373 // and negative zero incorrectly.
30374 if (!DAG.getTarget().Options.UnsafeFPMath &&
30375 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30377 Opcode = X86ISD::FMIN;
30380 // Converting this to a min would handle both negative zeros and NaNs
30381 // incorrectly, but we can swap the operands to fix both.
30382 std::swap(LHS, RHS);
30387 Opcode = X86ISD::FMIN;
30391 // Converting this to a max would handle comparisons between positive
30392 // and negative zero incorrectly.
30393 if (!DAG.getTarget().Options.UnsafeFPMath &&
30394 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS))
30396 Opcode = X86ISD::FMAX;
30399 // Converting this to a max would handle NaNs incorrectly, and swapping
30400 // the operands would cause it to handle comparisons between positive
30401 // and negative zero incorrectly.
30402 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
30403 if (!DAG.getTarget().Options.UnsafeFPMath &&
30404 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS)))
30406 std::swap(LHS, RHS);
30408 Opcode = X86ISD::FMAX;
30411 // Converting this to a max would handle both negative zeros and NaNs
30412 // incorrectly, but we can swap the operands to fix both.
30413 std::swap(LHS, RHS);
30418 Opcode = X86ISD::FMAX;
30421 // Check for x CC y ? y : x -- a min/max with reversed arms.
30422 } else if (DAG.isEqualTo(LHS, Cond.getOperand(1)) &&
30423 DAG.isEqualTo(RHS, Cond.getOperand(0))) {
30427 // Converting this to a min would handle comparisons between positive
30428 // and negative zero incorrectly, and swapping the operands would
30429 // cause it to handle NaNs incorrectly.
30430 if (!DAG.getTarget().Options.UnsafeFPMath &&
30431 !(DAG.isKnownNeverZero(LHS) || DAG.isKnownNeverZero(RHS))) {
30432 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30434 std::swap(LHS, RHS);
30436 Opcode = X86ISD::FMIN;
30439 // Converting this to a min would handle NaNs incorrectly.
30440 if (!DAG.getTarget().Options.UnsafeFPMath &&
30441 (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
30443 Opcode = X86ISD::FMIN;
30446 // Converting this to a min would handle both negative zeros and NaNs
30447 // incorrectly, but we can swap the operands to fix both.
30448 std::swap(LHS, RHS);
30453 Opcode = X86ISD::FMIN;
30457 // Converting this to a max would handle NaNs incorrectly.
30458 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30460 Opcode = X86ISD::FMAX;
30463 // Converting this to a max would handle comparisons between positive
30464 // and negative zero incorrectly, and swapping the operands would
30465 // cause it to handle NaNs incorrectly.
30466 if (!DAG.getTarget().Options.UnsafeFPMath &&
30467 !DAG.isKnownNeverZero(LHS) && !DAG.isKnownNeverZero(RHS)) {
30468 if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
30470 std::swap(LHS, RHS);
30472 Opcode = X86ISD::FMAX;
30475 // Converting this to a max would handle both negative zeros and NaNs
30476 // incorrectly, but we can swap the operands to fix both.
30477 std::swap(LHS, RHS);
30482 Opcode = X86ISD::FMAX;
30488 return DAG.getNode(Opcode, DL, N->getValueType(0), LHS, RHS);
30491 // v16i8 (select v16i1, v16i8, v16i8) does not have a proper
30492 // lowering on KNL. In this case we convert it to
30493 // v16i8 (select v16i8, v16i8, v16i8) and use an AVX instruction.
30494 // The same applies to all 128- and 256-bit vectors of i8 and i16.
30495 // On SKX and later these selects have a proper lowering.
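// For example, on KNL a (v32i8 vselect v32i1, v32i8, v32i8) becomes
// (v32i8 vselect (v32i8 sign_extend v32i1), v32i8, v32i8), which can then be
// lowered as an ordinary AVX2 variable byte blend.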
30496 if (Subtarget.hasAVX512() && CondVT.isVector() &&
30497 CondVT.getVectorElementType() == MVT::i1 &&
30498 (VT.is128BitVector() || VT.is256BitVector()) &&
30499 (VT.getVectorElementType() == MVT::i8 ||
30500 VT.getVectorElementType() == MVT::i16) &&
30501 !(Subtarget.hasBWI() && Subtarget.hasVLX())) {
30502 Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
30503 DCI.AddToWorklist(Cond.getNode());
30504 return DAG.getNode(N->getOpcode(), DL, VT, Cond, LHS, RHS);
30507 if (SDValue V = combineSelectOfTwoConstants(N, DAG))
30510 // Canonicalize max and min:
30511 // (x > y) ? x : y -> (x >= y) ? x : y
30512 // (x < y) ? x : y -> (x <= y) ? x : y
30513 // This allows use of COND_S / COND_NS (see TranslateX86CC) which eliminates
30514 // the need for an extra compare
// against zero. e.g.:
// (x - y) > 0 ? (x - y) : 0 -> (x - y) >= 0 ? (x - y) : 0
//
// subl   %esi, %edi
// testl  %edi, %edi
// movl   $0, %eax
// cmovgl %edi, %eax
// =>
// xorl   %eax, %eax
// subl   %esi, %edi
// cmovsl %eax, %edi
30525 if (N->getOpcode() == ISD::SELECT && Cond.getOpcode() == ISD::SETCC &&
30526 DAG.isEqualTo(LHS, Cond.getOperand(0)) &&
30527 DAG.isEqualTo(RHS, Cond.getOperand(1))) {
30528 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30533 ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
30534 Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
30535 Cond.getOperand(0), Cond.getOperand(1), NewCC);
30536 return DAG.getSelect(DL, VT, Cond, LHS, RHS);
30541 // Early exit check
30542 if (!TLI.isTypeLegal(VT))
30545 // Match VSELECTs into subs with unsigned saturation.
30546 if (N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
30547 // psubus is available in SSE2 and AVX2 for i8 and i16 vectors.
30548 ((Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) ||
30549 (Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) {
30550 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
30552 // Check if one of the arms of the VSELECT is a zero vector. If it's on the
30553 // left side invert the predicate to simplify logic below.
30555 if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
30557 CC = ISD::getSetCCInverse(CC, true);
30558 } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
30562 if (Other.getNode() && Other->getNumOperands() == 2 &&
30563 DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) {
30564 SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1);
30565 SDValue CondRHS = Cond->getOperand(1);
30567 // Look for a general sub with unsigned saturation first.
30568 // x >= y ? x-y : 0 --> subus x, y
30569 // x > y ? x-y : 0 --> subus x, y
30570 if ((CC == ISD::SETUGE || CC == ISD::SETUGT) &&
30571 Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS))
30572 return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS);
30574 if (auto *OpRHSBV = dyn_cast<BuildVectorSDNode>(OpRHS))
30575 if (auto *OpRHSConst = OpRHSBV->getConstantSplatNode()) {
30576 if (auto *CondRHSBV = dyn_cast<BuildVectorSDNode>(CondRHS))
30577 if (auto *CondRHSConst = CondRHSBV->getConstantSplatNode())
30578 // If the RHS is a constant we have to reverse the const
30579 // canonicalization.
30580 // x > C-1 ? x+-C : 0 --> subus x, C
30581 if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD &&
30582 CondRHSConst->getAPIntValue() ==
30583 (-OpRHSConst->getAPIntValue() - 1))
30584 return DAG.getNode(
30585 X86ISD::SUBUS, DL, VT, OpLHS,
30586 DAG.getConstant(-OpRHSConst->getAPIntValue(), DL, VT));
30588 // Another special case: If C was a sign bit, the sub has been
30589 // canonicalized into a xor.
30590 // FIXME: Would it be better to use computeKnownBits to determine
30591 // whether it's safe to decanonicalize the xor?
30592 // x s< 0 ? x^C : 0 --> subus x, C
30593 if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR &&
30594 ISD::isBuildVectorAllZeros(CondRHS.getNode()) &&
30595 OpRHSConst->getAPIntValue().isSignMask())
30596 // Note that we have to rebuild the RHS constant here to ensure we
30597 // don't rely on particular values of undef lanes.
30598 return DAG.getNode(
30599 X86ISD::SUBUS, DL, VT, OpLHS,
30600 DAG.getConstant(OpRHSConst->getAPIntValue(), DL, VT));
30605 if (SDValue V = combineVSelectWithAllOnesOrZeros(N, DAG, DCI, Subtarget))
30608 // If this is a *dynamic* select (non-constant condition) and we can match
30609 // this node with one of the variable blend instructions, restructure the
30610 // condition so that blends can use the high (sign) bit of each element and
30611 // use SimplifyDemandedBits to simplify the condition operand.
30612 if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
30613 !DCI.isBeforeLegalize() &&
30614 !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
30615 unsigned BitWidth = Cond.getScalarValueSizeInBits();
// Don't optimize vector selects that map to mask-registers.
if (BitWidth == 1)
  return SDValue();
30621 // We can only handle the cases where VSELECT is directly legal on the
30622 // subtarget. We custom lower VSELECT nodes with constant conditions and
30623 // this makes it hard to see whether a dynamic VSELECT will correctly
30624 // lower, so we both check the operation's status and explicitly handle the
30625 // cases where a *dynamic* blend will fail even though a constant-condition
30626 // blend could be custom lowered.
30627 // FIXME: We should find a better way to handle this class of problems.
// Potentially, we should combine constant-condition vselect nodes
// pre-legalization into shuffles and not mark as many types as custom
// lowerable.
if (!TLI.isOperationLegalOrCustom(ISD::VSELECT, VT))
  return SDValue();
30633 // FIXME: We don't support i16-element blends currently. We could and
30634 // should support them by making *all* the bits in the condition be set
30635 // rather than just the high bit and using an i8-element blend.
if (VT.getVectorElementType() == MVT::i16)
  return SDValue();
// Dynamic blending was only available from SSE4.1 onward.
if (VT.is128BitVector() && !Subtarget.hasSSE41())
  return SDValue();
// Byte blends are only available in AVX2.
if (VT == MVT::v32i8 && !Subtarget.hasAVX2())
  return SDValue();
// There are no 512-bit blend instructions that use sign bits.
if (VT.is512BitVector())
  return SDValue();
30648 assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
30649 APInt DemandedMask(APInt::getSignMask(BitWidth));
30651 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
30652 !DCI.isBeforeLegalizeOps());
30653 if (TLI.ShrinkDemandedConstant(Cond, DemandedMask, TLO) ||
30654 TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO)) {
30655 // If we changed the computation somewhere in the DAG, this change will
30656 // affect all users of Cond. Make sure it is fine and update all the nodes
30657 // so that we do not use the generic VSELECT anymore. Otherwise, we may
30658 // perform wrong optimizations as we messed with the actual expectation
30659 // for the vector boolean values.
30660 if (Cond != TLO.Old) {
30661 // Check all uses of the condition operand to check whether it will be
30662 // consumed by non-BLEND instructions. Those may require that all bits
30663 // are set properly.
30664 for (SDNode *U : Cond->uses()) {
30665 // TODO: Add other opcodes eventually lowered into BLEND.
30666 if (U->getOpcode() != ISD::VSELECT)
30670 // Update all users of the condition before committing the change, so
30671 // that the VSELECT optimizations that expect the correct vector boolean
30672 // value will not be triggered.
30673 for (SDNode *U : Cond->uses()) {
30674 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
30675 U->getValueType(0), Cond, U->getOperand(1),
30677 DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
30679 DCI.CommitTargetLoweringOpt(TLO);
30682 // Only Cond (rather than other nodes in the computation chain) was
30683 // changed. Change the condition just for N to keep the opportunity to
30684 // optimize all other users their own way.
30685 SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
30686 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
30691 // Look for vselects with LHS/RHS being bitcasted from an operation that
30692 // can be executed on another type. Push the bitcast to the inputs of
30693 // the operation. This exposes opportunities for using masking instructions.
30694 if (N->getOpcode() == ISD::VSELECT && DCI.isAfterLegalizeVectorOps() &&
30695 CondVT.getVectorElementType() == MVT::i1) {
30696 if (combineBitcastForMaskedOp(LHS, DAG, DCI))
30697 return SDValue(N, 0);
30698 if (combineBitcastForMaskedOp(RHS, DAG, DCI))
30699 return SDValue(N, 0);
30702 // Custom action for SELECT MMX
30703 if (VT == MVT::x86mmx) {
30704 LHS = DAG.getBitcast(MVT::i64, LHS);
30705 RHS = DAG.getBitcast(MVT::i64, RHS);
30706 SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
30707 return DAG.getBitcast(VT, newSelect);
/// Combine:
///   (brcond/cmov/setcc .., (cmp (atomic_load_add x, 1), 0), COND_S)
/// to:
///   (brcond/cmov/setcc .., (LADD x, 1), COND_LE)
30717 /// i.e., reusing the EFLAGS produced by the LOCKed instruction.
30718 /// Note that this is only legal for some op/cc combinations.
30719 static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
30720 SelectionDAG &DAG) {
30721 // This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
  return SDValue();
30726 // Can't replace the cmp if it has more uses than the one we're looking at.
30727 // FIXME: We would like to be able to handle this, but would need to make sure
30728 // all uses were updated.
if (!Cmp.hasOneUse())
  return SDValue();
30732 // This only applies to variations of the common case:
30733 // (icmp slt x, 0) -> (icmp sle (add x, 1), 0)
30734 // (icmp sge x, 0) -> (icmp sgt (add x, 1), 0)
30735 // (icmp sle x, 0) -> (icmp slt (sub x, 1), 0)
30736 // (icmp sgt x, 0) -> (icmp sge (sub x, 1), 0)
30737 // Using the proper condcodes (see below), overflow is checked for.
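// Worked instance: for (icmp slt x, 0) with an atomic_load_add of +1, the
// CMP tests the *old* value, while LADD leaves EFLAGS describing old + 1.
// Since old < 0  <=>  old <= -1  <=>  old + 1 <= 0, COND_S on the old
// value becomes COND_LE on the LADD result and the separate load + cmp of
// the old value can be dropped.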
30739 // FIXME: We can generalize both constraints:
30740 // - XOR/OR/AND (if they were made to survive AtomicExpand)
30742 // if the result is compared.
30744 SDValue CmpLHS = Cmp.getOperand(0);
30745 SDValue CmpRHS = Cmp.getOperand(1);
if (!CmpLHS.hasOneUse())
  return SDValue();

auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
  return SDValue();
30754 const unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
  return SDValue();

SDValue OpRHS = CmpLHS.getOperand(2);
auto *OpRHSC = dyn_cast<ConstantSDNode>(OpRHS);
if (!OpRHSC)
  return SDValue();

APInt Addend = OpRHSC->getAPIntValue();
if (Opc == ISD::ATOMIC_LOAD_SUB)
  Addend = -Addend;
if (CC == X86::COND_S && Addend == 1)
  CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
  CC = X86::COND_G;
else if (CC == X86::COND_G && Addend == -1)
  CC = X86::COND_NS;
else if (CC == X86::COND_LE && Addend == -1)
  CC = X86::COND_S;
else
  return SDValue();
30779 SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
30780 DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
30781 DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
return LockOp;
}
30786 // Check whether a boolean test is testing a boolean value generated by
// X86ISD::SETCC. If so, return the operand of that SETCC and the proper
// condition flag.
30790 // Simplify the following patterns:
30791 // (Op (CMP (SETCC Cond EFLAGS) 1) EQ) or
30792 // (Op (CMP (SETCC Cond EFLAGS) 0) NEQ)
30793 // to (Op EFLAGS Cond)
30795 // (Op (CMP (SETCC Cond EFLAGS) 0) EQ) or
30796 // (Op (CMP (SETCC Cond EFLAGS) 1) NEQ)
30797 // to (Op EFLAGS !Cond)
30799 // where Op could be BRCOND or CMOV.
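// For example, (brcond (cmp (setcc COND_E, EFLAGS), 0), COND_NE) branches
// exactly when COND_E held, so it can be rewritten as
// (brcond EFLAGS, COND_E), avoiding materializing the 0/1 setcc value.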
30801 static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
30802 // This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
      (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
  return SDValue();
30807 // Quit if not used as a boolean value.
if (CC != X86::COND_E && CC != X86::COND_NE)
  return SDValue();
30811 // Check CMP operands. One of them should be 0 or 1 and the other should be
30812 // an SetCC or extended from it.
30813 SDValue Op1 = Cmp.getOperand(0);
30814 SDValue Op2 = Cmp.getOperand(1);
SDValue SetCC;
const ConstantSDNode* C = nullptr;
bool needOppositeCond = (CC == X86::COND_E);
bool checkAgainstTrue = false; // Is it a comparison against 1?

if ((C = dyn_cast<ConstantSDNode>(Op1)))
  SetCC = Op2;
else if ((C = dyn_cast<ConstantSDNode>(Op2)))
  SetCC = Op1;
else // Quit if all operands are not constants.
  return SDValue();
30828 if (C->getZExtValue() == 1) {
30829 needOppositeCond = !needOppositeCond;
30830 checkAgainstTrue = true;
} else if (C->getZExtValue() != 0)
  // Quit if the constant is neither 0 nor 1.
  return SDValue();
30835 bool truncatedToBoolWithAnd = false;
30836 // Skip (zext $x), (trunc $x), or (and $x, 1) node.
30837 while (SetCC.getOpcode() == ISD::ZERO_EXTEND ||
30838 SetCC.getOpcode() == ISD::TRUNCATE ||
30839 SetCC.getOpcode() == ISD::AND) {
if (SetCC.getOpcode() == ISD::AND) {
  int OpIdx = -1;
  if (isOneConstant(SetCC.getOperand(0)))
    OpIdx = 1;
  if (isOneConstant(SetCC.getOperand(1)))
    OpIdx = 0;
  if (OpIdx < 0)
    break;
  SetCC = SetCC.getOperand(OpIdx);
  truncatedToBoolWithAnd = true;
} else
30851 SetCC = SetCC.getOperand(0);
30854 switch (SetCC.getOpcode()) {
30855 case X86ISD::SETCC_CARRY:
30856 // Since SETCC_CARRY gives output based on R = CF ? ~0 : 0, it's unsafe to
30857 // simplify it if the result of SETCC_CARRY is not canonicalized to 0 or 1,
30858 // i.e. it's a comparison against true but the result of SETCC_CARRY is not
30859 // truncated to i1 using 'and'.
if (checkAgainstTrue && !truncatedToBoolWithAnd)
  break;
assert(X86::CondCode(SetCC.getConstantOperandVal(0)) == X86::COND_B &&
       "Invalid use of SETCC_CARRY!");
LLVM_FALLTHROUGH;
30865 case X86ISD::SETCC:
30866 // Set the condition code or opposite one if necessary.
30867 CC = X86::CondCode(SetCC.getConstantOperandVal(0));
30868 if (needOppositeCond)
30869 CC = X86::GetOppositeBranchCondition(CC);
30870 return SetCC.getOperand(1);
30871 case X86ISD::CMOV: {
30872 // Check whether false/true value has canonical one, i.e. 0 or 1.
30873 ConstantSDNode *FVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(0));
30874 ConstantSDNode *TVal = dyn_cast<ConstantSDNode>(SetCC.getOperand(1));
// Quit if true value is not a constant.
if (!TVal)
  return SDValue();
// Quit if false value is not a constant.
if (!FVal) {
  SDValue Op = SetCC.getOperand(0);
30881 // Skip 'zext' or 'trunc' node.
30882 if (Op.getOpcode() == ISD::ZERO_EXTEND ||
30883 Op.getOpcode() == ISD::TRUNCATE)
30884 Op = Op.getOperand(0);
  // A special case for rdrand/rdseed, where 0 is set if false cond is
  // found.
  if ((Op.getOpcode() != X86ISD::RDRAND &&
       Op.getOpcode() != X86ISD::RDSEED) || Op.getResNo() != 0)
    return SDValue();
}

// Quit if false value is not the constant 0 or 1.
30892 bool FValIsFalse = true;
30893 if (FVal && FVal->getZExtValue() != 0) {
if (FVal->getZExtValue() != 1)
  return SDValue();
// If FVal is 1, opposite cond is needed.
needOppositeCond = !needOppositeCond;
FValIsFalse = false;
}
30900 // Quit if TVal is not the constant opposite of FVal.
if (FValIsFalse && TVal->getZExtValue() != 1)
  return SDValue();
if (!FValIsFalse && TVal->getZExtValue() != 0)
  return SDValue();
30905 CC = X86::CondCode(SetCC.getConstantOperandVal(2));
30906 if (needOppositeCond)
30907 CC = X86::GetOppositeBranchCondition(CC);
30908 return SetCC.getOperand(3);
30915 /// Check whether Cond is an AND/OR of SETCCs off of the same EFLAGS.
30917 /// (X86or (X86setcc) (X86setcc))
30918 /// (X86cmp (and (X86setcc) (X86setcc)), 0)
30919 static bool checkBoolTestAndOrSetCCCombine(SDValue Cond, X86::CondCode &CC0,
30920 X86::CondCode &CC1, SDValue &Flags,
30922 if (Cond->getOpcode() == X86ISD::CMP) {
if (!isNullConstant(Cond->getOperand(1)))
  return false;

Cond = Cond->getOperand(0);
}
30931 SDValue SetCC0, SetCC1;
30932 switch (Cond->getOpcode()) {
default: return false;
case ISD::AND:
case X86ISD::AND:
  isAnd = true;
  LLVM_FALLTHROUGH;
case ISD::OR:
case X86ISD::OR:
  SetCC0 = Cond->getOperand(0);
  SetCC1 = Cond->getOperand(1);
  break;
}
30945 // Make sure we have SETCC nodes, using the same flags value.
30946 if (SetCC0.getOpcode() != X86ISD::SETCC ||
30947 SetCC1.getOpcode() != X86ISD::SETCC ||
    SetCC0->getOperand(1) != SetCC1->getOperand(1))
  return false;

CC0 = (X86::CondCode)SetCC0->getConstantOperandVal(0);
CC1 = (X86::CondCode)SetCC1->getConstantOperandVal(0);
Flags = SetCC0->getOperand(1);
return true;
}
// When legalizing carry, we create carries via add X, -1.
// If that comes from an actual carry, via setcc, we use the
// carry directly.
30960 static SDValue combineCarryThroughADD(SDValue EFLAGS) {
30961 if (EFLAGS.getOpcode() == X86ISD::ADD) {
30962 if (isAllOnesConstant(EFLAGS.getOperand(1))) {
30963 SDValue Carry = EFLAGS.getOperand(0);
30964 while (Carry.getOpcode() == ISD::TRUNCATE ||
30965 Carry.getOpcode() == ISD::ZERO_EXTEND ||
30966 Carry.getOpcode() == ISD::SIGN_EXTEND ||
30967 Carry.getOpcode() == ISD::ANY_EXTEND ||
30968 (Carry.getOpcode() == ISD::AND &&
30969 isOneConstant(Carry.getOperand(1))))
30970 Carry = Carry.getOperand(0);
30971 if (Carry.getOpcode() == X86ISD::SETCC ||
30972 Carry.getOpcode() == X86ISD::SETCC_CARRY) {
30973 if (Carry.getConstantOperandVal(0) == X86::COND_B)
30974 return Carry.getOperand(1);
30982 /// Optimize an EFLAGS definition used according to the condition code \p CC
30983 /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
30984 /// uses of chain values.
30985 static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
30986 SelectionDAG &DAG) {
30987 if (CC == X86::COND_B)
30988 if (SDValue Flags = combineCarryThroughADD(EFLAGS))
30991 if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
30993 return combineSetCCAtomicArith(EFLAGS, CC, DAG);
30996 /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
30997 static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
30998 TargetLowering::DAGCombinerInfo &DCI,
30999 const X86Subtarget &Subtarget) {
31002 // If the flag operand isn't dead, don't touch this CMOV.
if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
  return SDValue();
31006 SDValue FalseOp = N->getOperand(0);
31007 SDValue TrueOp = N->getOperand(1);
31008 X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
31009 SDValue Cond = N->getOperand(3);
31011 if (CC == X86::COND_E || CC == X86::COND_NE) {
switch (Cond.getOpcode()) {
default: break;
case X86ISD::BSR:
case X86ISD::BSF:
  // If operand of BSR / BSF are proven never zero, then ZF cannot be set.
  if (DAG.isKnownNeverZero(Cond.getOperand(0)))
    return (CC == X86::COND_E) ? FalseOp : TrueOp;
  break;
}
31022 // Try to simplify the EFLAGS and condition code operands.
31023 // We can't always do this as FCMOV only supports a subset of X86 cond.
31024 if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
31025 if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
                 Flags};
31028 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31032 // If this is a select between two integer constants, try to do some
31033 // optimizations. Note that the operands are ordered the opposite of SELECT
31035 if (ConstantSDNode *TrueC = dyn_cast<ConstantSDNode>(TrueOp)) {
31036 if (ConstantSDNode *FalseC = dyn_cast<ConstantSDNode>(FalseOp)) {
31037 // Canonicalize the TrueC/FalseC values so that TrueC (the true value) is
31038 // larger than FalseC (the false value).
31039 if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue())) {
31040 CC = X86::GetOppositeBranchCondition(CC);
31041 std::swap(TrueC, FalseC);
31042 std::swap(TrueOp, FalseOp);
31045 // Optimize C ? 8 : 0 -> zext(setcc(C)) << 3. Likewise for any pow2/0.
31046 // This is efficient for any integer data type (including i8/i16) and
31048 if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
31049 Cond = getSETCC(CC, Cond, DL, DAG);
31051 // Zero extend the condition if needed.
31052 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, TrueC->getValueType(0), Cond);
31054 unsigned ShAmt = TrueC->getAPIntValue().logBase2();
31055 Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
31056 DAG.getConstant(ShAmt, DL, MVT::i8));
31057 if (N->getNumValues() == 2) // Dead flag value?
31058 return DCI.CombineTo(N, Cond, SDValue());
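// For example, (x s< y) ? 8 : 0 becomes a SETL materialized into a GPR,
// zero-extended, and shifted left by 3 -- no conditional move or branch
// is needed for a power-of-two/zero pair.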
31062 // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst. This is efficient
31063 // for any integer data type, including i8/i16.
31064 if (FalseC->getAPIntValue()+1 == TrueC->getAPIntValue()) {
31065 Cond = getSETCC(CC, Cond, DL, DAG);
31067 // Zero extend the condition if needed.
31068 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL,
31069 FalseC->getValueType(0), Cond);
31070 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31071 SDValue(FalseC, 0));
31073 if (N->getNumValues() == 2) // Dead flag value?
31074 return DCI.CombineTo(N, Cond, SDValue());
31078 // Optimize cases that will turn into an LEA instruction. This requires
31079 // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
31080 if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
31081 uint64_t Diff = TrueC->getZExtValue()-FalseC->getZExtValue();
31082 if (N->getValueType(0) == MVT::i32) Diff = (unsigned)Diff;
31084 bool isFastMultiplier = false;
31086 switch ((unsigned char)Diff) {
31088 case 1: // result = add base, cond
31089 case 2: // result = lea base( , cond*2)
31090 case 3: // result = lea base(cond, cond*2)
31091 case 4: // result = lea base( , cond*4)
31092 case 5: // result = lea base(cond, cond*4)
31093 case 8: // result = lea base( , cond*8)
31094 case 9: // result = lea base(cond, cond*8)
31095 isFastMultiplier = true;
31100 if (isFastMultiplier) {
31101 APInt Diff = TrueC->getAPIntValue()-FalseC->getAPIntValue();
31102 Cond = getSETCC(CC, Cond, DL ,DAG);
31103 // Zero extend the condition if needed.
31104 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0),
31106 // Scale the condition by the difference.
31108 Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
31109 DAG.getConstant(Diff, DL, Cond.getValueType()));
31111 // Add the base if non-zero.
31112 if (FalseC->getAPIntValue() != 0)
31113 Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
31114 SDValue(FalseC, 0));
31115 if (N->getNumValues() == 2) // Dead flag value?
31116 return DCI.CombineTo(N, Cond, SDValue());
31123 // Handle these cases:
31124 // (select (x != c), e, c) -> select (x != c), e, x),
31125 // (select (x == c), c, e) -> select (x == c), x, e)
31126 // where the c is an integer constant, and the "select" is the combination
31127 // of CMOV and CMP.
31129 // The rationale for this change is that the conditional-move from a constant
31130 // needs two instructions, however, conditional-move from a register needs
31131 // only one instruction.
31133 // CAVEAT: By replacing a constant with a symbolic value, it may obscure
31134 // some instruction-combining opportunities. This opt needs to be
31135 // postponed as late as possible.
31137 if (!DCI.isBeforeLegalize() && !DCI.isBeforeLegalizeOps()) {
31138 // the DCI.xxxx conditions are provided to postpone the optimization as
31139 // late as possible.
31141 ConstantSDNode *CmpAgainst = nullptr;
31142 if ((Cond.getOpcode() == X86ISD::CMP || Cond.getOpcode() == X86ISD::SUB) &&
31143 (CmpAgainst = dyn_cast<ConstantSDNode>(Cond.getOperand(1))) &&
31144 !isa<ConstantSDNode>(Cond.getOperand(0))) {
31146 if (CC == X86::COND_NE &&
31147 CmpAgainst == dyn_cast<ConstantSDNode>(FalseOp)) {
31148 CC = X86::GetOppositeBranchCondition(CC);
31149 std::swap(TrueOp, FalseOp);
31152 if (CC == X86::COND_E &&
31153 CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
31154 SDValue Ops[] = { FalseOp, Cond.getOperand(0),
31155 DAG.getConstant(CC, DL, MVT::i8), Cond };
31156 return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
31161 // Fold and/or of setcc's to double CMOV:
31162 // (CMOV F, T, ((cc1 | cc2) != 0)) -> (CMOV (CMOV F, T, cc1), T, cc2)
31163 // (CMOV F, T, ((cc1 & cc2) != 0)) -> (CMOV (CMOV T, F, !cc1), F, !cc2)
31165 // This combine lets us generate:
//   cmovcc1 (jcc1 if we don't have CMOV)
//   cmovcc2 (same)
// instead of:
//   setcc1 / setcc2, an and/or, a test, and a single
//   cmovne (jne if we don't have CMOV)
// When we can't use the CMOV instruction, it might increase branch
// mispredicts.
31175 // When we can use CMOV, or when there is no mispredict, this improves
31176 // throughput and reduces register pressure.
31178 if (CC == X86::COND_NE) {
31180 X86::CondCode CC0, CC1;
31182 if (checkBoolTestAndOrSetCCCombine(Cond, CC0, CC1, Flags, isAndSetCC)) {
31184 std::swap(FalseOp, TrueOp);
31185 CC0 = X86::GetOppositeBranchCondition(CC0);
31186 CC1 = X86::GetOppositeBranchCondition(CC1);
31189 SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
31191 SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
31192 SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
31193 SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
31194 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
31202 /// Different mul shrinking modes.
31203 enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
31205 static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
31206 EVT VT = N->getOperand(0).getValueType();
if (VT.getScalarSizeInBits() != 32)
  return false;
31210 assert(N->getNumOperands() == 2 && "NumOperands of Mul are 2");
31211 unsigned SignBits[2] = {1, 1};
31212 bool IsPositive[2] = {false, false};
31213 for (unsigned i = 0; i < 2; i++) {
31214 SDValue Opd = N->getOperand(i);
31216 // DAG.ComputeNumSignBits return 1 for ISD::ANY_EXTEND, so we need to
31217 // compute signbits for it separately.
31218 if (Opd.getOpcode() == ISD::ANY_EXTEND) {
// For anyextend, it is safe to assume an appropriate number of leading
// sign/zero bits.
if (Opd.getOperand(0).getValueType().getVectorElementType() == MVT::i8)
  SignBits[i] = 25;
else if (Opd.getOperand(0).getValueType().getVectorElementType() ==
         MVT::i16)
  SignBits[i] = 17;
else
  return false;
IsPositive[i] = true;
31229 } else if (Opd.getOpcode() == ISD::BUILD_VECTOR) {
31230 // All the operands of BUILD_VECTOR need to be int constant.
31231 // Find the smallest value range which all the operands belong to.
31233 IsPositive[i] = true;
31234 for (const SDValue &SubOp : Opd.getNode()->op_values()) {
if (SubOp.isUndef())
  continue;
auto *CN = dyn_cast<ConstantSDNode>(SubOp);
if (!CN)
  return false;
31240 APInt IntVal = CN->getAPIntValue();
31241 if (IntVal.isNegative())
31242 IsPositive[i] = false;
31243 SignBits[i] = std::min(SignBits[i], IntVal.getNumSignBits());
31246 SignBits[i] = DAG.ComputeNumSignBits(Opd);
31247 if (Opd.getOpcode() == ISD::ZERO_EXTEND)
31248 IsPositive[i] = true;
31252 bool AllPositive = IsPositive[0] && IsPositive[1];
31253 unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
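// The thresholds below follow from the 32-bit operand width: a value with
// at least 25 known sign bits has at most 7 value bits plus a sign, so it
// lies in [-128, 127] and survives truncation to i8; a known non-negative
// value with at least 24 sign (i.e. zero) bits is below 256. The 17/16
// thresholds bound the ranges to i16 in the same way.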
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
  Mode = MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
  Mode = MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
  Mode = MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
  Mode = MULU16;
else
  return false;
return true;
}
31271 /// When the operands of vector mul are extended from smaller size values,
/// like i8 and i16, the type of mul may be shrunk to generate more
31273 /// efficient code. Two typical patterns are handled:
31275 /// %2 = sext/zext <N x i8> %1 to <N x i32>
31276 /// %4 = sext/zext <N x i8> %3 to <N x i32>
/// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31278 /// %5 = mul <N x i32> %2, %4
31281 /// %2 = zext/sext <N x i16> %1 to <N x i32>
31282 /// %4 = zext/sext <N x i16> %3 to <N x i32>
31283 /// or %4 = build_vector <N x i32> %C1, ..., %CN (%C1..%CN are constants)
31284 /// %5 = mul <N x i32> %2, %4
31286 /// There are four mul shrinking modes:
31287 /// If %2 == sext32(trunc8(%2)), i.e., the scalar value range of %2 is
31288 /// -128 to 128, and the scalar value range of %4 is also -128 to 128,
31289 /// generate pmullw+sext32 for it (MULS8 mode).
31290 /// If %2 == zext32(trunc8(%2)), i.e., the scalar value range of %2 is
31291 /// 0 to 255, and the scalar value range of %4 is also 0 to 255,
31292 /// generate pmullw+zext32 for it (MULU8 mode).
31293 /// If %2 == sext32(trunc16(%2)), i.e., the scalar value range of %2 is
31294 /// -32768 to 32767, and the scalar value range of %4 is also -32768 to 32767,
31295 /// generate pmullw+pmulhw for it (MULS16 mode).
31296 /// If %2 == zext32(trunc16(%2)), i.e., the scalar value range of %2 is
31297 /// 0 to 65535, and the scalar value range of %4 is also 0 to 65535,
31298 /// generate pmullw+pmulhuw for it (MULU16 mode).
31299 static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
31300 const X86Subtarget &Subtarget) {
31301 // Check for legality
31302 // pmullw/pmulhw are not supported by SSE.
if (!Subtarget.hasSSE2())
  return SDValue();
31306 // Check for profitability
31307 // pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// pmullw+pmulhw.
bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
  return SDValue();

ShrinkMode Mode;
if (!canReduceVMulWidth(N, DAG, Mode))
  return SDValue();

SDLoc DL(N);
31319 SDValue N0 = N->getOperand(0);
31320 SDValue N1 = N->getOperand(1);
31321 EVT VT = N->getOperand(0).getValueType();
31322 unsigned RegSize = 128;
31323 MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
EVT ReducedVT =
    EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
31326 // Shrink the operands of mul.
31327 SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
31328 SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
31330 if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
31331 // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
31332 // lower part is needed.
31333 SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
31334 if (Mode == MULU8 || Mode == MULS8) {
31335 return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
31338 MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
31339 // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
31340 // the higher part is also needed.
31341 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31342 ReducedVT, NewN0, NewN1);
31344 // Repack the lower part and higher part result of mul into a wider
31346 // Generate shuffle functioning as punpcklwd.
31347 SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
31348 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31349 ShuffleMask[2 * i] = i;
31350 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
31353 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31354 ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
31355 // Generate shuffle functioning as punpckhwd.
31356 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
31357 ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
31358 ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
31361 DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
31362 ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
31363 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
31366 // When VT.getVectorNumElements() < OpsVT.getVectorNumElements(), we want
31367 // to legalize the mul explicitly because implicit legalization for type
31368 // <4 x i16> to <4 x i32> sometimes involves unnecessary unpack
31369 // instructions which will not exist when we explicitly legalize it by
31370 // extending <4 x i16> to <8 x i16> (concatenating the <4 x i16> val with
31371 // <4 x i16> undef).
31373 // Legalize the operands of mul.
31374 // FIXME: We may be able to handle non-concatenated vectors by insertion.
31375 unsigned ReducedSizeInBits = ReducedVT.getSizeInBits();
31376 if ((RegSize % ReducedSizeInBits) != 0)
31379 SmallVector<SDValue, 16> Ops(RegSize / ReducedSizeInBits,
31380 DAG.getUNDEF(ReducedVT));
31382 NewN0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31384 NewN1 = DAG.getNode(ISD::CONCAT_VECTORS, DL, OpsVT, Ops);
31386 if (Mode == MULU8 || Mode == MULS8) {
31387 // Generate lower part of mul: pmullw. For MULU8/MULS8, only the lower
31389 SDValue Mul = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31391 // convert the type of mul result to VT.
31392 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31393 SDValue Res = DAG.getNode(Mode == MULU8 ? ISD::ZERO_EXTEND_VECTOR_INREG
31394 : ISD::SIGN_EXTEND_VECTOR_INREG,
31396 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31397 DAG.getIntPtrConstant(0, DL));
31399 // Generate the lower and higher part of mul: pmulhw/pmulhuw. For
31400 // MULU16/MULS16, both parts are needed.
31401 SDValue MulLo = DAG.getNode(ISD::MUL, DL, OpsVT, NewN0, NewN1);
31402 SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
31403 OpsVT, NewN0, NewN1);
31405 // Repack the lower part and higher part result of mul into a wider
31406 // result. Make sure the type of mul result is VT.
31407 MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
31408 SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
31409 Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
31410 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
31411 DAG.getIntPtrConstant(0, DL));
31416 static SDValue combineMulSpecial(uint64_t MulAmt, SDNode *N, SelectionDAG &DAG,
31417 EVT VT, SDLoc DL) {
31419 auto combineMulShlAddOrSub = [&](int Mult, int Shift, bool isAdd) {
31420 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31421 DAG.getConstant(Mult, DL, VT));
31422 Result = DAG.getNode(ISD::SHL, DL, VT, Result,
31423 DAG.getConstant(Shift, DL, MVT::i8));
31424 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31429 auto combineMulMulAddOrSub = [&](bool isAdd) {
31430 SDValue Result = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31431 DAG.getConstant(9, DL, VT));
31432 Result = DAG.getNode(ISD::MUL, DL, VT, Result, DAG.getConstant(3, DL, VT));
31433 Result = DAG.getNode(isAdd ? ISD::ADD : ISD::SUB, DL, VT, Result,
31442 // mul x, 11 => add ((shl (mul x, 5), 1), x)
31443 return combineMulShlAddOrSub(5, 1, /*isAdd*/ true);
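// As an illustration (register choice arbitrary), the mul-by-11 expansion
// above maps to two LEAs instead of an imul:
//   leaq (%rdi,%rdi,4), %rax   ; x*5
//   leaq (%rdi,%rax,2), %rax   ; (x*5)*2 + x = x*11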
31445 // mul x, 21 => add ((shl (mul x, 5), 2), x)
31446 return combineMulShlAddOrSub(5, 2, /*isAdd*/ true);
31448 // mul x, 22 => add (add ((shl (mul x, 5), 2), x), x)
31449 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31450 combineMulShlAddOrSub(5, 2, /*isAdd*/ true));
31452 // mul x, 19 => sub ((shl (mul x, 5), 2), x)
31453 return combineMulShlAddOrSub(5, 2, /*isAdd*/ false);
31455 // mul x, 13 => add ((shl (mul x, 3), 2), x)
31456 return combineMulShlAddOrSub(3, 2, /*isAdd*/ true);
// mul x, 23 => sub ((shl (mul x, 3), 3), x)
return combineMulShlAddOrSub(3, 3, /*isAdd*/ false);
31461 // mul x, 14 => add (add ((shl (mul x, 3), 2), x), x)
31462 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31463 combineMulShlAddOrSub(3, 2, /*isAdd*/ true));
31465 // mul x, 26 => sub ((mul (mul x, 9), 3), x)
31466 return combineMulMulAddOrSub(/*isAdd*/ false);
31468 // mul x, 28 => add ((mul (mul x, 9), 3), x)
31469 return combineMulMulAddOrSub(/*isAdd*/ true);
31471 // mul x, 29 => add (add ((mul (mul x, 9), 3), x), x)
31472 return DAG.getNode(ISD::ADD, DL, VT, N->getOperand(0),
31473 combineMulMulAddOrSub(/*isAdd*/ true));
31475 // mul x, 30 => sub (sub ((shl x, 5), x), x)
31476 return DAG.getNode(
31478 DAG.getNode(ISD::SUB, DL, VT,
31479 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31480 DAG.getConstant(5, DL, MVT::i8)),
31487 /// Optimize a single multiply with constant into two operations in order to
31488 /// implement it with two cheaper instructions, e.g. LEA + SHL, LEA + LEA.
31489 static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
31490 TargetLowering::DAGCombinerInfo &DCI,
31491 const X86Subtarget &Subtarget) {
31492 EVT VT = N->getValueType(0);
31493 if (DCI.isBeforeLegalize() && VT.isVector())
31494 return reduceVMULWidth(N, DAG, Subtarget);
if (!MulConstantOptimization)
  return SDValue();

// An imul is usually smaller than the alternative sequence.
if (DAG.getMachineFunction().getFunction()->optForMinSize())
  return SDValue();

if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
  return SDValue();

if (VT != MVT::i64 && VT != MVT::i32)
  return SDValue();

ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
if (!C)
  return SDValue();

uint64_t MulAmt = C->getZExtValue();
if (isPowerOf2_64(MulAmt) || MulAmt == 3 || MulAmt == 5 || MulAmt == 9)
  return SDValue();
31515 uint64_t MulAmt1 = 0;
31516 uint64_t MulAmt2 = 0;
if ((MulAmt % 9) == 0) {
  MulAmt1 = 9;
  MulAmt2 = MulAmt / 9;
} else if ((MulAmt % 5) == 0) {
  MulAmt1 = 5;
  MulAmt2 = MulAmt / 5;
} else if ((MulAmt % 3) == 0) {
  MulAmt1 = 3;
  MulAmt2 = MulAmt / 3;
}

SDValue NewMul;
if (MulAmt2 &&
    (isPowerOf2_64(MulAmt2) || MulAmt2 == 3 || MulAmt2 == 5 || MulAmt2 == 9)){
31533 if (isPowerOf2_64(MulAmt2) &&
31534 !(N->hasOneUse() && N->use_begin()->getOpcode() == ISD::ADD))
// If second multiplier is pow2, issue it first. We want the multiply by
// 3, 5, or 9 to be folded into the addressing mode unless the lone use
// is an add.
std::swap(MulAmt1, MulAmt2);
31540 if (isPowerOf2_64(MulAmt1))
31541 NewMul = DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31542 DAG.getConstant(Log2_64(MulAmt1), DL, MVT::i8));
31544 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, N->getOperand(0),
31545 DAG.getConstant(MulAmt1, DL, VT));
31547 if (isPowerOf2_64(MulAmt2))
31548 NewMul = DAG.getNode(ISD::SHL, DL, VT, NewMul,
31549 DAG.getConstant(Log2_64(MulAmt2), DL, MVT::i8));
31551 NewMul = DAG.getNode(X86ISD::MUL_IMM, DL, VT, NewMul,
31552 DAG.getConstant(MulAmt2, DL, VT));
31553 } else if (!Subtarget.slowLEA())
31554 NewMul = combineMulSpecial(MulAmt, N, DAG, VT, DL);
31557 assert(MulAmt != 0 &&
31558 MulAmt != (VT == MVT::i64 ? UINT64_MAX : UINT32_MAX) &&
31559 "Both cases that could cause potential overflows should have "
31560 "already been handled.");
31561 int64_t SignMulAmt = C->getSExtValue();
31562 if ((SignMulAmt != INT64_MIN) && (SignMulAmt != INT64_MAX) &&
31563 (SignMulAmt != -INT64_MAX)) {
31564 int NumSign = SignMulAmt > 0 ? 1 : -1;
31565 bool IsPowerOf2_64PlusOne = isPowerOf2_64(NumSign * SignMulAmt - 1);
31566 bool IsPowerOf2_64MinusOne = isPowerOf2_64(NumSign * SignMulAmt + 1);
31567 if (IsPowerOf2_64PlusOne) {
31568 // (mul x, 2^N + 1) => (add (shl x, N), x)
31569 NewMul = DAG.getNode(
31570 ISD::ADD, DL, VT, N->getOperand(0),
31571 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31572 DAG.getConstant(Log2_64(NumSign * SignMulAmt - 1), DL,
31574 } else if (IsPowerOf2_64MinusOne) {
31575 // (mul x, 2^N - 1) => (sub (shl x, N), x)
31576 NewMul = DAG.getNode(
31578 DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
31579 DAG.getConstant(Log2_64(NumSign * SignMulAmt + 1), DL,
31583 // To negate, subtract the number from zero
31584 if ((IsPowerOf2_64PlusOne || IsPowerOf2_64MinusOne) && NumSign == -1)
NewMul =
    DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), NewMul);
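// For example, mul x, 17 takes the IsPowerOf2_64PlusOne path and becomes
// (add (shl x, 4), x); mul x, -15 takes the MinusOne path as 16*x - x and
// is then negated by the 0 - NewMul subtraction above.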
31591 // Do not add new nodes to DAG combiner worklist.
31592 DCI.CombineTo(N, NewMul, false);
31597 static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
31598 SDValue N0 = N->getOperand(0);
31599 SDValue N1 = N->getOperand(1);
31600 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
31601 EVT VT = N0.getValueType();
31603 // fold (shl (and (setcc_c), c1), c2) -> (and setcc_c, (c1 << c2))
31604 // since the result of setcc_c is all zero's or all ones.
31605 if (VT.isInteger() && !VT.isVector() &&
31606 N1C && N0.getOpcode() == ISD::AND &&
31607 N0.getOperand(1).getOpcode() == ISD::Constant) {
31608 SDValue N00 = N0.getOperand(0);
31609 APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
31610 Mask <<= N1C->getAPIntValue();
31611 bool MaskOK = false;
31612 // We can handle cases concerning bit-widening nodes containing setcc_c if
// we carefully interrogate the mask to make sure we are semantics
// preserving.
// The transform is not safe if the result of C1 << C2 exceeds the bitwidth
// of the underlying setcc_c operation if the setcc_c was zero extended.
31617 // Consider the following example:
31618 // zext(setcc_c) -> i32 0x0000FFFF
31619 // c1 -> i32 0x0000FFFF
31620 // c2 -> i32 0x00000001
31621 // (shl (and (setcc_c), c1), c2) -> i32 0x0001FFFE
31622 // (and setcc_c, (c1 << c2)) -> i32 0x0000FFFE
if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
  MaskOK = true;
} else if (N00.getOpcode() == ISD::SIGN_EXTEND &&
           N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
  MaskOK = true;
31628 } else if ((N00.getOpcode() == ISD::ZERO_EXTEND ||
31629 N00.getOpcode() == ISD::ANY_EXTEND) &&
31630 N00.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY) {
31631 MaskOK = Mask.isIntN(N00.getOperand(0).getValueSizeInBits());
31633 if (MaskOK && Mask != 0) {
31635 return DAG.getNode(ISD::AND, DL, VT, N00, DAG.getConstant(Mask, DL, VT));
31639 // Hardware support for vector shifts is sparse which makes us scalarize the
// vector operations in many cases. Also, on Sandy Bridge ADD is faster than
// SHL.
31642 // (shl V, 1) -> add V,V
31643 if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
31644 if (auto *N1SplatC = N1BV->getConstantSplatNode()) {
31645 assert(N0.getValueType().isVector() && "Invalid vector shift type");
31646 // We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of 1.
31649 if (N1SplatC->getAPIntValue() == 1)
31650 return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
31656 static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
31657 SDValue N0 = N->getOperand(0);
31658 SDValue N1 = N->getOperand(1);
31659 EVT VT = N0.getValueType();
31660 unsigned Size = VT.getSizeInBits();
31662 // fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
31663 // into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
31664 // into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
31665 // depending on sign of (SarConst - [56,48,32,24,16])
31667 // sexts in X86 are MOVs. The MOVs have the same code size
31668 // as above SHIFTs (only SHIFT on 1 has lower code size).
31669 // However the MOVs have 2 advantages to a SHIFT:
31670 // 1. MOVs can write to a register that differs from source
31671 // 2. MOVs accept memory operands
31673 if (!VT.isInteger() || VT.isVector() || N1.getOpcode() != ISD::Constant ||
31674 N0.getOpcode() != ISD::SHL || !N0.hasOneUse() ||
31675 N0.getOperand(1).getOpcode() != ISD::Constant)
31678 SDValue N00 = N0.getOperand(0);
31679 SDValue N01 = N0.getOperand(1);
31680 APInt ShlConst = (cast<ConstantSDNode>(N01))->getAPIntValue();
31681 APInt SarConst = (cast<ConstantSDNode>(N1))->getAPIntValue();
31682 EVT CVT = N1.getValueType();
if (SarConst.isNegative())
  return SDValue();

SDLoc DL(N);
31687 for (MVT SVT : MVT::integer_valuetypes()) {
31688 unsigned ShiftSize = SVT.getSizeInBits();
31689 // skipping types without corresponding sext/zext and
31690 // ShlConst that is not one of [56,48,32,24,16]
if (ShiftSize < 8 || ShiftSize > 64 || ShlConst != Size - ShiftSize)
  continue;
SDValue NN =
    DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N00, DAG.getValueType(SVT));
SarConst = SarConst - (Size - ShiftSize);
if (SarConst == 0)
  return NN;
else if (SarConst.isNegative())
31700 return DAG.getNode(ISD::SHL, DL, VT, NN,
31701 DAG.getConstant(-SarConst, DL, CVT));
31703 return DAG.getNode(ISD::SRA, DL, VT, NN,
31704 DAG.getConstant(SarConst, DL, CVT));
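// For example, on i32 (ashr (shl x, 24), 25) becomes
// (sra (sext_inreg x, i8), 1), i.e. movsbl followed by sarl $1; the
// sign-extending mov can write a different destination register and can
// take a memory operand, which the plain shift pair cannot.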
31709 /// \brief Returns a vector of 0s if the node in input is a vector logical
31710 /// shift by a constant amount which is known to be bigger than or equal
31711 /// to the vector element size in bits.
31712 static SDValue performShiftToAllZeros(SDNode *N, SelectionDAG &DAG,
31713 const X86Subtarget &Subtarget) {
31714 EVT VT = N->getValueType(0);
31716 if (VT != MVT::v2i64 && VT != MVT::v4i32 && VT != MVT::v8i16 &&
31717 (!Subtarget.hasInt256() ||
     (VT != MVT::v4i64 && VT != MVT::v8i32 && VT != MVT::v16i16)))
  return SDValue();

SDValue Amt = N->getOperand(1);
SDLoc DL(N);
31723 if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Amt))
31724 if (auto *AmtSplat = AmtBV->getConstantSplatNode()) {
31725 const APInt &ShiftAmt = AmtSplat->getAPIntValue();
31726 unsigned MaxAmount =
31727 VT.getSimpleVT().getScalarSizeInBits();
31729 // SSE2/AVX2 logical shifts always return a vector of 0s
31730 // if the shift amount is bigger than or equal to
31731 // the element size. The constant shift amount will be
31732 // encoded as a 8-bit immediate.
31733 if (ShiftAmt.trunc(8).uge(MaxAmount))
31734 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, DL);
31740 static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
31741 TargetLowering::DAGCombinerInfo &DCI,
31742 const X86Subtarget &Subtarget) {
31743 if (N->getOpcode() == ISD::SHL)
31744 if (SDValue V = combineShiftLeft(N, DAG))
31747 if (N->getOpcode() == ISD::SRA)
31748 if (SDValue V = combineShiftRightAlgebraic(N, DAG))
31751 // Try to fold this logical shift into a zero vector.
31752 if (N->getOpcode() != ISD::SRA)
31753 if (SDValue V = performShiftToAllZeros(N, DAG, Subtarget))
31759 static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
31760 TargetLowering::DAGCombinerInfo &DCI,
31761 const X86Subtarget &Subtarget) {
31762 unsigned Opcode = N->getOpcode();
31763 assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
31764 X86ISD::VSRLI == Opcode) &&
31765 "Unexpected shift opcode");
31766 bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
31767 EVT VT = N->getValueType(0);
31768 SDValue N0 = N->getOperand(0);
31769 SDValue N1 = N->getOperand(1);
31770 unsigned NumBitsPerElt = VT.getScalarSizeInBits();
31771 assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
31772 "Unexpected value type");
31774 // Out of range logical bit shifts are guaranteed to be zero.
31775 // Out of range arithmetic bit shifts splat the sign bit.
31776 APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
  if (LogicalShift)
    return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
  else
    ShiftVal = NumBitsPerElt - 1;
}
// Shift N0 by zero -> N0.
if (!ShiftVal)
  return N0;

// Shift zero -> zero.
31789 if (ISD::isBuildVectorAllZeros(N0.getNode()))
31790 return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
31792 // fold (VSRLI (VSRAI X, Y), 31) -> (VSRLI X, 31).
31793 // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
31794 // TODO - support other sra opcodes as needed.
31795 if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
31796 N0.getOpcode() == X86ISD::VSRAI)
31797 return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
31799 // We can decode 'whole byte' logical bit shifts as shuffles.
31800 if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31803 NonceMask.push_back(0);
31804 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31805 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31807 return SDValue(); // This routine will use CombineTo to replace N.
31810 // Constant Folding.
APInt UndefElts;
SmallVector<APInt, 32> EltBits;
31813 if (N->isOnlyUserOf(N0.getNode()) &&
31814 getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
31815 assert(EltBits.size() == VT.getVectorNumElements() &&
31816 "Unexpected shift value type");
31817 unsigned ShiftImm = ShiftVal.getZExtValue();
31818 for (APInt &Elt : EltBits) {
if (X86ISD::VSHLI == Opcode)
  Elt <<= ShiftImm;
else if (X86ISD::VSRAI == Opcode)
  Elt.ashrInPlace(ShiftImm);
else
  Elt.lshrInPlace(ShiftImm);
31826 return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
31832 static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
31833 TargetLowering::DAGCombinerInfo &DCI,
31834 const X86Subtarget &Subtarget) {
assert(((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
31837 (N->getOpcode() == X86ISD::PINSRW &&
31838 N->getValueType(0) == MVT::v8i16)) &&
31839 "Unexpected vector insertion");
31841 // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
31844 NonceMask.push_back(0);
31845 combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
31846 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
31851 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
31852 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
31853 /// OR -> CMPNEQSS.
31854 static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
31855 TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned opcode;
31859 // SSE1 supports CMP{eq|ne}SS, and SSE2 added CMP{eq|ne}SD, but
31860 // we're requiring SSE2 for both.
31861 if (Subtarget.hasSSE2() && isAndOrOfSetCCs(SDValue(N, 0U), opcode)) {
31862 SDValue N0 = N->getOperand(0);
31863 SDValue N1 = N->getOperand(1);
31864 SDValue CMP0 = N0->getOperand(1);
31865 SDValue CMP1 = N1->getOperand(1);
31868 // The SETCCs should both refer to the same CMP.
31869 if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
31872 SDValue CMP00 = CMP0->getOperand(0);
31873 SDValue CMP01 = CMP0->getOperand(1);
31874 EVT VT = CMP00.getValueType();
31876 if (VT == MVT::f32 || VT == MVT::f64) {
31877 bool ExpectingFlags = false;
31878 // Check for any users that want flags:
31879 for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
31880 !ExpectingFlags && UI != UE; ++UI)
31881 switch (UI->getOpcode()) {
31886 ExpectingFlags = true;
31888 case ISD::CopyToReg:
31889 case ISD::SIGN_EXTEND:
31890 case ISD::ZERO_EXTEND:
31891 case ISD::ANY_EXTEND:
31895 if (!ExpectingFlags) {
31896 enum X86::CondCode cc0 = (enum X86::CondCode)N0.getConstantOperandVal(0);
31897 enum X86::CondCode cc1 = (enum X86::CondCode)N1.getConstantOperandVal(0);
31899 if (cc1 == X86::COND_E || cc1 == X86::COND_NE) {
31900 X86::CondCode tmp = cc0;
31905 if ((cc0 == X86::COND_E && cc1 == X86::COND_NP) ||
31906 (cc0 == X86::COND_NE && cc1 == X86::COND_P)) {
31907 // FIXME: need symbolic constants for these magic numbers.
31908 // See X86ATTInstPrinter.cpp:printSSECC().
31909 unsigned x86cc = (cc0 == X86::COND_E) ? 0 : 4;
31910 if (Subtarget.hasAVX512()) {
31912 DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
31913 DAG.getConstant(x86cc, DL, MVT::i8));
31914 return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
31915 FSetCC, DAG.getIntPtrConstant(0, DL));
31917 SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
31918 CMP00.getValueType(), CMP00, CMP01,
31919 DAG.getConstant(x86cc, DL,
31922 bool is64BitFP = (CMP00.getValueType() == MVT::f64);
31923 MVT IntVT = is64BitFP ? MVT::i64 : MVT::i32;
31925 if (is64BitFP && !Subtarget.is64Bit()) {
31926 // On a 32-bit target, we cannot bitcast the 64-bit float to a
31927 // 64-bit integer, since that's not a legal type. Since
// OnesOrZeroesF is all ones or all zeroes, we don't need all the
31929 // bits, but can do this little dance to extract the lowest 32 bits
31930 // and work with those going forward.
31931 SDValue Vector64 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
31933 SDValue Vector32 = DAG.getBitcast(MVT::v4f32, Vector64);
31934 OnesOrZeroesF = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32,
31935 Vector32, DAG.getIntPtrConstant(0, DL));
31939 SDValue OnesOrZeroesI = DAG.getBitcast(IntVT, OnesOrZeroesF);
31940 SDValue ANDed = DAG.getNode(ISD::AND, DL, IntVT, OnesOrZeroesI,
31941 DAG.getConstant(1, DL, IntVT));
31942 SDValue OneBitOfTruth = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
31944 return OneBitOfTruth;
31952 /// Try to fold: (and (xor X, -1), Y) -> (andnp X, Y).
31953 static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
31954 assert(N->getOpcode() == ISD::AND);
31956 EVT VT = N->getValueType(0);
31957 SDValue N0 = N->getOperand(0);
31958 SDValue N1 = N->getOperand(1);
if (VT != MVT::v2i64 && VT != MVT::v4i64 && VT != MVT::v8i64)
  return SDValue();
31964 if (N0.getOpcode() == ISD::XOR &&
31965 ISD::isBuildVectorAllOnes(N0.getOperand(1).getNode()))
31966 return DAG.getNode(X86ISD::ANDNP, DL, VT, N0.getOperand(0), N1);
31968 if (N1.getOpcode() == ISD::XOR &&
31969 ISD::isBuildVectorAllOnes(N1.getOperand(1).getNode()))
31970 return DAG.getNode(X86ISD::ANDNP, DL, VT, N1.getOperand(0), N0);
31975 // On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
31976 // register. In most cases we actually compare or select YMM-sized registers
31977 // and mixing the two types creates horrible code. This method optimizes
31978 // some of the transition sequences.
31979 static SDValue WidenMaskArithmetic(SDNode *N, SelectionDAG &DAG,
31980 TargetLowering::DAGCombinerInfo &DCI,
31981 const X86Subtarget &Subtarget) {
31982 EVT VT = N->getValueType(0);
if (!VT.is256BitVector())
  return SDValue();
31986 assert((N->getOpcode() == ISD::ANY_EXTEND ||
31987 N->getOpcode() == ISD::ZERO_EXTEND ||
31988 N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
31990 SDValue Narrow = N->getOperand(0);
31991 EVT NarrowVT = Narrow->getValueType(0);
if (!NarrowVT.is128BitVector())
  return SDValue();

if (Narrow->getOpcode() != ISD::XOR &&
    Narrow->getOpcode() != ISD::AND &&
    Narrow->getOpcode() != ISD::OR)
  return SDValue();
32000 SDValue N0 = Narrow->getOperand(0);
32001 SDValue N1 = Narrow->getOperand(1);
32004 // The Left side has to be a trunc.
if (N0.getOpcode() != ISD::TRUNCATE)
  return SDValue();
32008 // The type of the truncated inputs.
32009 EVT WideVT = N0->getOperand(0)->getValueType(0);
32013 // The right side has to be a 'trunc' or a constant vector.
32014 bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE;
32015 ConstantSDNode *RHSConstSplat = nullptr;
32016 if (auto *RHSBV = dyn_cast<BuildVectorSDNode>(N1))
32017 RHSConstSplat = RHSBV->getConstantSplatNode();
32018 if (!RHSTrunc && !RHSConstSplat)
32021 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), WideVT))
  return SDValue();
32026 // Set N0 and N1 to hold the inputs to the new wide operation.
32027 N0 = N0->getOperand(0);
32028 if (RHSConstSplat) {
32029 N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, WideVT.getVectorElementType(),
32030 SDValue(RHSConstSplat, 0));
32031 N1 = DAG.getSplatBuildVector(WideVT, DL, N1);
32032 } else if (RHSTrunc) {
32033 N1 = N1->getOperand(0);
32036 // Generate the wide operation.
32037 SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, WideVT, N0, N1);
unsigned Opcode = N->getOpcode();
switch (Opcode) {
case ISD::ANY_EXTEND:
  return Op;
32042 case ISD::ZERO_EXTEND: {
32043 unsigned InBits = NarrowVT.getScalarSizeInBits();
32044 APInt Mask = APInt::getAllOnesValue(InBits);
32045 Mask = Mask.zext(VT.getScalarSizeInBits());
32046 return DAG.getNode(ISD::AND, DL, VT,
32047 Op, DAG.getConstant(Mask, DL, VT));
32049 case ISD::SIGN_EXTEND:
32050 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
32051 Op, DAG.getValueType(NarrowVT));
default:
  llvm_unreachable("Unexpected opcode");
}
32057 /// If both input operands of a logic op are being cast from floating point
32058 /// types, try to convert this into a floating point logic node to avoid
32059 /// unnecessary moves from SSE to integer registers.
32060 static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
32061 const X86Subtarget &Subtarget) {
32062 unsigned FPOpcode = ISD::DELETED_NODE;
32063 if (N->getOpcode() == ISD::AND)
32064 FPOpcode = X86ISD::FAND;
32065 else if (N->getOpcode() == ISD::OR)
32066 FPOpcode = X86ISD::FOR;
32067 else if (N->getOpcode() == ISD::XOR)
32068 FPOpcode = X86ISD::FXOR;
32070 assert(FPOpcode != ISD::DELETED_NODE &&
32071 "Unexpected input node for FP logic conversion");
32073 EVT VT = N->getValueType(0);
32074 SDValue N0 = N->getOperand(0);
32075 SDValue N1 = N->getOperand(1);
32077 if (N0.getOpcode() == ISD::BITCAST && N1.getOpcode() == ISD::BITCAST &&
32078 ((Subtarget.hasSSE1() && VT == MVT::i32) ||
32079 (Subtarget.hasSSE2() && VT == MVT::i64))) {
32080 SDValue N00 = N0.getOperand(0);
32081 SDValue N10 = N1.getOperand(0);
32082 EVT N00Type = N00.getValueType();
32083 EVT N10Type = N10.getValueType();
32084 if (N00Type.isFloatingPoint() && N10Type.isFloatingPoint()) {
32085 SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
32086 return DAG.getBitcast(VT, FPLogic);
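// For example:
//   %ai = bitcast float %a to i32
//   %bi = bitcast float %b to i32
//   %r  = and i32 %ai, %bi
// becomes (bitcast (FAND %a, %b)), which selects ANDPS and keeps both
// values in XMM registers instead of bouncing them through GPRs.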
32092 /// If this is a zero/all-bits result that is bitwise-anded with a low bits
32093 /// mask. (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
32094 /// with a shift-right to eliminate loading the vector constant mask value.
32095 static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
32096 const X86Subtarget &Subtarget) {
32097 SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
32098 SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
32099 EVT VT0 = Op0.getValueType();
32100 EVT VT1 = Op1.getValueType();
if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
  return SDValue();

APInt SplatVal;
if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal,
                                /*AllowShrink*/false) ||
    !SplatVal.isMask())
  return SDValue();

if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
  return SDValue();

unsigned EltBitWidth = VT0.getScalarSizeInBits();
if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
  return SDValue();

SDLoc DL(N);
unsigned ShiftVal = SplatVal.countTrailingOnes();
32120 SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
32121 SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
32122 return DAG.getBitcast(N->getValueType(0), Shift);
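// For example, a v4i32 compare produces all-ones/all-zeros lanes, so
// zero-extending it to 0/1 via (and (setcc ...), splat(1)) can instead be
// emitted as (VSRLI (setcc ...), 31), avoiding a constant-pool load for
// the splat of 1.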
32125 static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
32126 TargetLowering::DAGCombinerInfo &DCI,
32127 const X86Subtarget &Subtarget) {
if (DCI.isBeforeLegalizeOps())
  return SDValue();

if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
  return R;

if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
  return FPLogic;

if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
  return R;

if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
  return ShiftRight;
32143 EVT VT = N->getValueType(0);
32144 SDValue N0 = N->getOperand(0);
32145 SDValue N1 = N->getOperand(1);
32148 // Attempt to recursively combine a bitmask AND with shuffles.
32149 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
SmallVector<int, 1> NonceMask; // Just a placeholder.
32152 NonceMask.push_back(0);
32153 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
32154 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
32156 return SDValue(); // This routine will use CombineTo to replace N.
32159 // Create BEXTR instructions
32160 // BEXTR is ((X >> imm) & (2**size-1))
if (VT != MVT::i32 && VT != MVT::i64)
  return SDValue();

if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
  return SDValue();

if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
  return SDValue();
32169 ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
32170 ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
32171 if (MaskNode && ShiftNode) {
32172 uint64_t Mask = MaskNode->getZExtValue();
32173 uint64_t Shift = ShiftNode->getZExtValue();
32174 if (isMask_64(Mask)) {
32175 uint64_t MaskSize = countPopulation(Mask);
32176 if (Shift + MaskSize <= VT.getSizeInBits())
32177 return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
                   DAG.getConstant(Shift | (MaskSize << 8), DL,
                                   MVT::i8));
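// For example, with BMI available, (and (srl x, 4), 0xFFF) has Shift = 4
// and MaskSize = 12, so it becomes BEXTR with the control value
// 4 | (12 << 8) = 0x0C04 (start in bits 7:0, length in bits 15:8).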
// Try to fold:
//   (or (and (m, y), (pandn m, x)))
// into:
//   (vselect m, x, y)
32189 // As a special case, try to fold:
32190 // (or (and (m, (sub 0, x)), (pandn m, x)))
32192 // (sub (xor X, M), M)
32193 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
32194 const X86Subtarget &Subtarget) {
32195 assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
32197 SDValue N0 = N->getOperand(0);
32198 SDValue N1 = N->getOperand(1);
32199 EVT VT = N->getValueType(0);
32201 if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
      (VT.is256BitVector() && Subtarget.hasInt256())))
  return SDValue();

// Canonicalize AND to LHS.
if (N1.getOpcode() == ISD::AND)
  std::swap(N0, N1);

// TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
// ANDNP combine allows other combines to happen that prevent matching.
if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
  return SDValue();
32214 SDValue Mask = N1.getOperand(0);
SDValue X = N1.getOperand(1);
SDValue Y;
32217 if (N0.getOperand(0) == Mask)
32218 Y = N0.getOperand(1);
32219 if (N0.getOperand(1) == Mask)
32220 Y = N0.getOperand(0);
// Check to see if the mask appeared in both the AND and ANDNP.
if (!Y.getNode())
  return SDValue();
32226 // Validate that X, Y, and Mask are bitcasts, and see through them.
32227 Mask = peekThroughBitcasts(Mask);
32228 X = peekThroughBitcasts(X);
32229 Y = peekThroughBitcasts(Y);
32231 EVT MaskVT = Mask.getValueType();
32232 unsigned EltBits = MaskVT.getScalarSizeInBits();
32234 // TODO: Attempt to handle floating point cases as well?
if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits)
  return SDValue();
32241 // (or (and (M, (sub 0, X)), (pandn M, X)))
32242 // which is a special case of vselect:
32243 // (vselect M, (sub 0, X), X)
32245 // http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
32246 // We know that, if fNegate is 0 or 1:
32247 // (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
32249 // Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
32250 // ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
32251 // ( M ? -X : X) == ((X ^ M ) + (M & 1))
32252 // This lets us transform our vselect to:
32253 // (add (xor X, M), (and M, 1))
32255 // (sub (xor X, M), M)
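// Concretely, using the fact that each lane of M is all-ones or all-zeros:
//   M == -1:  (X ^ -1) - (-1) == ~X + 1 == -X   (the negated arm)
//   M ==  0:  (X ^  0) -  0   ==  X             (the pass-through arm)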
32256 if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT &&
32257 DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) {
32258 auto IsNegV = [](SDNode *N, SDValue V) {
32259 return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
32260 ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
32263 if (IsNegV(Y.getNode(), X))
32265 else if (IsNegV(X.getNode(), Y))
32269 SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
32270 SDValue SubOp2 = Mask;
32272 // If the negate was on the false side of the select, then
32273 // the operands of the SUB need to be swapped. PR 27251.
32274 // This is because the pattern being matched above is
32275 // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
32276 // but if the pattern matched was
32277 // (vselect M, X, (sub (0, X))), that is really negation of the pattern
32278 // above, -(vselect M, (sub 0, X), X), and therefore the replacement
32279 // pattern also needs to be a negation of the replacement pattern above.
32280 // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
32281 // sub accomplishes the negation of the replacement pattern.
32283 std::swap(SubOp1, SubOp2);
32285 SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
32286 return DAG.getBitcast(VT, Res);
32290 // PBLENDVB is only available on SSE 4.1.
32291 if (!Subtarget.hasSSE41())
32294 MVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
32296 X = DAG.getBitcast(BlendVT, X);
32297 Y = DAG.getBitcast(BlendVT, Y);
32298 Mask = DAG.getBitcast(BlendVT, Mask);
32299 Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X);
32300 return DAG.getBitcast(VT, Mask);
32303 // Helper function for combineOrCmpEqZeroToCtlzSrl
32307 // srl(ctlz x), log2(bitsize(x))
32308 // Input pattern is checked by caller.
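// For example, for a 32-bit x, lzcnt(x) is 32 only when x == 0, so
// (lzcnt(x) >> 5) is exactly the i1 value of (x == 0).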
32309 static SDValue lowerX86CmpEqZeroToCtlzSrl(SDValue Op, EVT ExtTy,
32310 SelectionDAG &DAG) {
32311 SDValue Cmp = Op.getOperand(1);
32312 EVT VT = Cmp.getOperand(0).getValueType();
  unsigned Log2b = Log2_32(VT.getSizeInBits());
  SDLoc dl(Op);
32315 SDValue Clz = DAG.getNode(ISD::CTLZ, dl, VT, Cmp->getOperand(0));
32316 // The result of the shift is true or false, and on X86, the 32-bit
32317 // encoding of shr and lzcnt is more desirable.
32318 SDValue Trunc = DAG.getZExtOrTrunc(Clz, dl, MVT::i32);
32319 SDValue Scc = DAG.getNode(ISD::SRL, dl, MVT::i32, Trunc,
32320 DAG.getConstant(Log2b, dl, VT));
32321 return DAG.getZExtOrTrunc(Scc, dl, ExtTy);
32324 // Try to transform:
//    zext(or(setcc(eq, (cmp x, 0)), setcc(eq, (cmp y, 0))))
// into:
//    srl(or(ctlz(x), ctlz(y)), log2(bitsize(x)))
32328 // Will also attempt to match more generic cases, eg:
32329 // zext(or(or(setcc(eq, cmp 0), setcc(eq, cmp 0)), setcc(eq, cmp 0)))
32330 // Only applies if the target supports the FastLZCNT feature.
32331 static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
32332 TargetLowering::DAGCombinerInfo &DCI,
32333 const X86Subtarget &Subtarget) {
32334 if (DCI.isBeforeLegalize() || !Subtarget.getTargetLowering()->isCtlzFast())
32337 auto isORCandidate = [](SDValue N) {
32338 return (N->getOpcode() == ISD::OR && N->hasOneUse());
32341 // Check the zero extend is extending to 32-bit or more. The code generated by
32342 // srl(ctlz) for 16-bit or less variants of the pattern would require extra
32343 // instructions to clear the upper bits.
32344 if (!N->hasOneUse() || !N->getSimpleValueType(0).bitsGE(MVT::i32) ||
32345 !isORCandidate(N->getOperand(0)))
32348 // Check the node matches: setcc(eq, cmp 0)
32349 auto isSetCCCandidate = [](SDValue N) {
32350 return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
32351 X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
32352 N->getOperand(1).getOpcode() == X86ISD::CMP &&
32353 isNullConstant(N->getOperand(1).getOperand(1)) &&
32354 N->getOperand(1).getValueType().bitsGE(MVT::i32);
32357 SDNode *OR = N->getOperand(0).getNode();
32358 SDValue LHS = OR->getOperand(0);
32359 SDValue RHS = OR->getOperand(1);
32361 // Save nodes matching or(or, setcc(eq, cmp 0)).
32362 SmallVector<SDNode *, 2> ORNodes;
32363 while (((isORCandidate(LHS) && isSetCCCandidate(RHS)) ||
32364 (isORCandidate(RHS) && isSetCCCandidate(LHS)))) {
32365 ORNodes.push_back(OR);
32366 OR = (LHS->getOpcode() == ISD::OR) ? LHS.getNode() : RHS.getNode();
32367 LHS = OR->getOperand(0);
32368 RHS = OR->getOperand(1);
32371 // The last OR node should match or(setcc(eq, cmp 0), setcc(eq, cmp 0)).
32372 if (!(isSetCCCandidate(LHS) && isSetCCCandidate(RHS)) ||
32373 !isORCandidate(SDValue(OR, 0)))
  // We have an or(setcc(eq, cmp 0), setcc(eq, cmp 0)) pattern, try to lower it
  // to:
  //   or(srl(ctlz),srl(ctlz)).
32379 // The dag combiner can then fold it into:
32380 // srl(or(ctlz, ctlz)).
32381 EVT VT = OR->getValueType(0);
32382 SDValue NewLHS = lowerX86CmpEqZeroToCtlzSrl(LHS, VT, DAG);
32383 SDValue Ret, NewRHS;
32384 if (NewLHS && (NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG)))
32385 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, NewLHS, NewRHS);
32390 // Try to lower nodes matching the or(or, setcc(eq, cmp 0)) pattern.
32391 while (ORNodes.size() > 0) {
32392 OR = ORNodes.pop_back_val();
32393 LHS = OR->getOperand(0);
32394 RHS = OR->getOperand(1);
32395 // Swap rhs with lhs to match or(setcc(eq, cmp, 0), or).
32396 if (RHS->getOpcode() == ISD::OR)
32397 std::swap(LHS, RHS);
32398 EVT VT = OR->getValueType(0);
32399 SDValue NewRHS = lowerX86CmpEqZeroToCtlzSrl(RHS, VT, DAG);
32402 Ret = DAG.getNode(ISD::OR, SDLoc(OR), VT, Ret, NewRHS);
32406 Ret = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret);
32411 static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
32412 TargetLowering::DAGCombinerInfo &DCI,
32413 const X86Subtarget &Subtarget) {
32414 if (DCI.isBeforeLegalizeOps())
32417 if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
32420 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
32423 if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
32426 SDValue N0 = N->getOperand(0);
32427 SDValue N1 = N->getOperand(1);
32428 EVT VT = N->getValueType(0);
32430 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
32433 // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
32434 bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
32436 // SHLD/SHRD instructions have lower register pressure, but on some
32437 // platforms they have higher latency than the equivalent
32438 // series of shifts/or that would otherwise be generated.
32439 // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
32440 // have higher latencies and we are not optimizing for size.
32441 if (!OptForSize && Subtarget.isSHLDSlow())
32444 if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
32446 if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
32448 if (!N0.hasOneUse() || !N1.hasOneUse())
32451 SDValue ShAmt0 = N0.getOperand(1);
32452 if (ShAmt0.getValueType() != MVT::i8)
32454 SDValue ShAmt1 = N1.getOperand(1);
32455 if (ShAmt1.getValueType() != MVT::i8)
32457 if (ShAmt0.getOpcode() == ISD::TRUNCATE)
32458 ShAmt0 = ShAmt0.getOperand(0);
32459 if (ShAmt1.getOpcode() == ISD::TRUNCATE)
32460 ShAmt1 = ShAmt1.getOperand(0);
32463 unsigned Opc = X86ISD::SHLD;
32464 SDValue Op0 = N0.getOperand(0);
32465 SDValue Op1 = N1.getOperand(0);
32466 if (ShAmt0.getOpcode() == ISD::SUB ||
32467 ShAmt0.getOpcode() == ISD::XOR) {
32468 Opc = X86ISD::SHRD;
32469 std::swap(Op0, Op1);
32470 std::swap(ShAmt0, ShAmt1);
32473 // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> SHLD( X, Y, C )
32474 // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> SHRD( X, Y, C )
32475 // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> SHLD( X, Y, C )
32476 // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> SHRD( X, Y, C )
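  // For example, with i32 operands and C == 5:
  //   (X << 5) | (Y >> 27)  -->  SHLD(X, Y, 5)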
32477 unsigned Bits = VT.getSizeInBits();
32478 if (ShAmt1.getOpcode() == ISD::SUB) {
32479 SDValue Sum = ShAmt1.getOperand(0);
32480 if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) {
32481 SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
32482 if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
32483 ShAmt1Op1 = ShAmt1Op1.getOperand(0);
32484 if (SumC->getSExtValue() == Bits && ShAmt1Op1 == ShAmt0)
32485 return DAG.getNode(Opc, DL, VT,
32487 DAG.getNode(ISD::TRUNCATE, DL,
32490 } else if (ConstantSDNode *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
32491 ConstantSDNode *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
32492 if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
32493 return DAG.getNode(Opc, DL, VT,
32494 N0.getOperand(0), N1.getOperand(0),
32495 DAG.getNode(ISD::TRUNCATE, DL,
32497 } else if (ShAmt1.getOpcode() == ISD::XOR) {
32498 SDValue Mask = ShAmt1.getOperand(1);
32499 if (ConstantSDNode *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
32500 unsigned InnerShift = (X86ISD::SHLD == Opc ? ISD::SRL : ISD::SHL);
32501 SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
32502 if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
32503 ShAmt1Op0 = ShAmt1Op0.getOperand(0);
32504 if (MaskC->getSExtValue() == (Bits - 1) && ShAmt1Op0 == ShAmt0) {
32505 if (Op1.getOpcode() == InnerShift &&
32506 isa<ConstantSDNode>(Op1.getOperand(1)) &&
32507 Op1.getConstantOperandVal(1) == 1) {
32508 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32509 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32511 // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
32512 if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
32513 Op1.getOperand(0) == Op1.getOperand(1)) {
32514 return DAG.getNode(Opc, DL, VT, Op0, Op1.getOperand(0),
32515 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, ShAmt0));
32524 /// Generate NEG and CMOV for integer abs.
32525 static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
32526 EVT VT = N->getValueType(0);
32528 // Since X86 does not have CMOV for 8-bit integer, we don't convert
32529 // 8-bit integer abs to NEG and CMOV.
32530 if (VT.isInteger() && VT.getSizeInBits() == 8)
32533 SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);
  SDLoc DL(N);
32537 // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
32538 // and change it to SUB and CMOV.
32539 if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
32540 N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
32541 N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
32542 auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
32543 if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
32544 // Generate SUB & CMOV.
32545 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
32546 DAG.getConstant(0, DL, VT), N0.getOperand(0));
32547 SDValue Ops[] = {N0.getOperand(0), Neg,
32548 DAG.getConstant(X86::COND_GE, DL, MVT::i8),
32549 SDValue(Neg.getNode(), 1)};
32550 return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
32556 /// Try to turn tests against the signbit in the form of:
///    XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
///    SETGT(X, -1)
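/// The truncated shift yields the sign bit of X, and XORing it with 1 gives 1
/// exactly when X is non-negative, i.e. when X > -1, which is what the single
/// SETGT computes.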
32560 static SDValue foldXorTruncShiftIntoCmp(SDNode *N, SelectionDAG &DAG) {
32561 // This is only worth doing if the output type is i8 or i1.
32562 EVT ResultType = N->getValueType(0);
32563 if (ResultType != MVT::i8 && ResultType != MVT::i1)
32566 SDValue N0 = N->getOperand(0);
32567 SDValue N1 = N->getOperand(1);
32569 // We should be performing an xor against a truncated shift.
32570 if (N0.getOpcode() != ISD::TRUNCATE || !N0.hasOneUse())
32573 // Make sure we are performing an xor against one.
32574 if (!isOneConstant(N1))
32577 // SetCC on x86 zero extends so only act on this if it's a logical shift.
32578 SDValue Shift = N0.getOperand(0);
32579 if (Shift.getOpcode() != ISD::SRL || !Shift.hasOneUse())
32582 // Make sure we are truncating from one of i16, i32 or i64.
32583 EVT ShiftTy = Shift.getValueType();
32584 if (ShiftTy != MVT::i16 && ShiftTy != MVT::i32 && ShiftTy != MVT::i64)
32587 // Make sure the shift amount extracts the sign bit.
32588 if (!isa<ConstantSDNode>(Shift.getOperand(1)) ||
32589 Shift.getConstantOperandVal(1) != ShiftTy.getSizeInBits() - 1)
32592 // Create a greater-than comparison against -1.
32593 // N.B. Using SETGE against 0 works but we want a canonical looking
  // comparison, using SETGT matches up with what TranslateX86CC does.
  SDLoc DL(N);
  SDValue ShiftOp = Shift.getOperand(0);
32597 EVT ShiftOpTy = ShiftOp.getValueType();
32598 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32599 EVT SetCCResultType = TLI.getSetCCResultType(DAG.getDataLayout(),
32600 *DAG.getContext(), ResultType);
32601 SDValue Cond = DAG.getSetCC(DL, SetCCResultType, ShiftOp,
32602 DAG.getConstant(-1, DL, ShiftOpTy), ISD::SETGT);
32603 if (SetCCResultType != ResultType)
32604 Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, ResultType, Cond);
32608 /// Turn vector tests of the signbit in the form of:
///   xor (sra X, elt_size(X)-1), -1
/// into:
///   pcmpgt X, -1
///
32613 /// This should be called before type legalization because the pattern may not
32614 /// persist after that.
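/// For example, for v4i32 X:
///   xor (sra X, 31), -1  ==  pcmpgt X, -1
/// since the arithmetic shift smears the sign bit across each lane and the
/// xor with all-ones inverts it.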
32615 static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
32616 const X86Subtarget &Subtarget) {
32617 EVT VT = N->getValueType(0);
32618 if (!VT.isSimple())
32621 switch (VT.getSimpleVT().SimpleTy) {
32622 default: return SDValue();
32625 case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
32626 case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
32630 case MVT::v4i64: if (!Subtarget.hasAVX2()) return SDValue(); break;
32633 // There must be a shift right algebraic before the xor, and the xor must be a
32634 // 'not' operation.
32635 SDValue Shift = N->getOperand(0);
32636 SDValue Ones = N->getOperand(1);
32637 if (Shift.getOpcode() != ISD::SRA || !Shift.hasOneUse() ||
32638 !ISD::isBuildVectorAllOnes(Ones.getNode()))
32641 // The shift should be smearing the sign bit across each vector element.
32642 auto *ShiftBV = dyn_cast<BuildVectorSDNode>(Shift.getOperand(1));
32646 EVT ShiftEltTy = Shift.getValueType().getVectorElementType();
32647 auto *ShiftAmt = ShiftBV->getConstantSplatNode();
32648 if (!ShiftAmt || ShiftAmt->getZExtValue() != ShiftEltTy.getSizeInBits() - 1)
32651 // Create a greater-than comparison against -1. We don't use the more obvious
32652 // greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
32653 return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
32656 /// Check if truncation with saturation form type \p SrcVT to \p DstVT
32657 /// is valid for the given \p Subtarget.
32658 static bool isSATValidOnAVX512Subtarget(EVT SrcVT, EVT DstVT,
32659 const X86Subtarget &Subtarget) {
32660 if (!Subtarget.hasAVX512())
32663 // FIXME: Scalar type may be supported if we move it to vector register.
32664 if (!SrcVT.isVector() || !SrcVT.isSimple() || SrcVT.getSizeInBits() > 512)
32667 EVT SrcElVT = SrcVT.getScalarType();
32668 EVT DstElVT = DstVT.getScalarType();
32669 if (SrcElVT.getSizeInBits() < 16 || SrcElVT.getSizeInBits() > 64)
32671 if (DstElVT.getSizeInBits() < 8 || DstElVT.getSizeInBits() > 32)
32673 if (SrcVT.is512BitVector() || Subtarget.hasVLX())
32674 return SrcElVT.getSizeInBits() >= 32 || Subtarget.hasBWI();
32678 /// Detect a pattern of truncation with saturation:
32679 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
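/// For example, (trunc (umin x, 65535)) to vXi16 clamps x to [0, 65535] and is
/// an unsigned saturating truncate (the VPMOVUS* family on AVX512).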
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
32682 static SDValue detectUSatPattern(SDValue In, EVT VT) {
  if (In.getOpcode() != ISD::UMIN)
    return SDValue();

  // Saturation with truncation. We truncate from InVT to VT.
32687 assert(In.getScalarValueSizeInBits() > VT.getScalarSizeInBits() &&
32688 "Unexpected types for truncate operation");
  APInt C;
  if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C,
32692 /*AllowShrink*/false)) {
32693 // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
32694 // the element size of the destination type.
32695 return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
32701 /// Detect a pattern of truncation with saturation:
32702 /// (truncate (umin (x, unsigned_max_of_dest_type)) to dest_type).
32703 /// The types should allow to use VPMOVUS* instruction on AVX512.
/// Return the source value to be truncated or SDValue() if the pattern was not
/// matched.
32706 static SDValue detectAVX512USatPattern(SDValue In, EVT VT,
32707 const X86Subtarget &Subtarget) {
32708 if (!isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32710 return detectUSatPattern(In, VT);
static SDValue
combineTruncateWithUSat(SDValue In, EVT VT, SDLoc &DL, SelectionDAG &DAG,
32715 const X86Subtarget &Subtarget) {
32716 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32717 if (!TLI.isTypeLegal(In.getValueType()) || !TLI.isTypeLegal(VT))
32719 if (auto USatVal = detectUSatPattern(In, VT))
32720 if (isSATValidOnAVX512Subtarget(In.getValueType(), VT, Subtarget))
32721 return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
32725 /// This function detects the AVG pattern between vectors of unsigned i8/i16,
/// which is c = (a + b + 1) / 2, and replaces this operation with the efficient
32727 /// X86ISD::AVG instruction.
32728 static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
32729 const X86Subtarget &Subtarget,
32731 if (!VT.isVector() || !VT.isSimple())
32733 EVT InVT = In.getValueType();
32734 unsigned NumElems = VT.getVectorNumElements();
32736 EVT ScalarVT = VT.getVectorElementType();
32737 if (!((ScalarVT == MVT::i8 || ScalarVT == MVT::i16) &&
32738 isPowerOf2_32(NumElems)))
32741 // InScalarVT is the intermediate type in AVG pattern and it should be greater
32742 // than the original input type (i8/i16).
32743 EVT InScalarVT = InVT.getVectorElementType();
32744 if (InScalarVT.getSizeInBits() <= ScalarVT.getSizeInBits())
32747 if (!Subtarget.hasSSE2())
32749 if (Subtarget.hasBWI()) {
32750 if (VT.getSizeInBits() > 512)
32752 } else if (Subtarget.hasAVX2()) {
32753 if (VT.getSizeInBits() > 256)
32756 if (VT.getSizeInBits() > 128)
32760 // Detect the following pattern:
32762 // %1 = zext <N x i8> %a to <N x i32>
32763 // %2 = zext <N x i8> %b to <N x i32>
32764 // %3 = add nuw nsw <N x i32> %1, <i32 1 x N>
32765 // %4 = add nuw nsw <N x i32> %3, %2
  //   %5 = lshr <N x i32> %4, <i32 1 x N>
32767 // %6 = trunc <N x i32> %5 to <N x i8>
32769 // In AVX512, the last instruction can also be a trunc store.
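  // In other words the sequence computes c = (a + b + 1) >> 1 in a wider type,
  // which is exactly the rounding average that pavgb/pavgw produce.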
32771 if (In.getOpcode() != ISD::SRL)
32774 // A lambda checking the given SDValue is a constant vector and each element
32775 // is in the range [Min, Max].
32776 auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
32777 BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
32778 if (!BV || !BV->isConstant())
32780 for (SDValue Op : V->ops()) {
32781 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
32784 uint64_t Val = C->getZExtValue();
32785 if (Val < Min || Val > Max)
32791 // Check if each element of the vector is left-shifted by one.
32792 auto LHS = In.getOperand(0);
32793 auto RHS = In.getOperand(1);
32794 if (!IsConstVectorInRange(RHS, 1, 1))
32796 if (LHS.getOpcode() != ISD::ADD)
32799 // Detect a pattern of a + b + 1 where the order doesn't matter.
32800 SDValue Operands[3];
32801 Operands[0] = LHS.getOperand(0);
32802 Operands[1] = LHS.getOperand(1);
32804 // Take care of the case when one of the operands is a constant vector whose
32805 // element is in the range [1, 256].
32806 if (IsConstVectorInRange(Operands[1], 1, ScalarVT == MVT::i8 ? 256 : 65536) &&
32807 Operands[0].getOpcode() == ISD::ZERO_EXTEND &&
32808 Operands[0].getOperand(0).getValueType() == VT) {
32809 // The pattern is detected. Subtract one from the constant vector, then
32810 // demote it and emit X86ISD::AVG instruction.
32811 SDValue VecOnes = DAG.getConstant(1, DL, InVT);
32812 Operands[1] = DAG.getNode(ISD::SUB, DL, InVT, Operands[1], VecOnes);
32813 Operands[1] = DAG.getNode(ISD::TRUNCATE, DL, VT, Operands[1]);
32814 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32818 if (Operands[0].getOpcode() == ISD::ADD)
32819 std::swap(Operands[0], Operands[1]);
32820 else if (Operands[1].getOpcode() != ISD::ADD)
32822 Operands[2] = Operands[1].getOperand(0);
32823 Operands[1] = Operands[1].getOperand(1);
32825 // Now we have three operands of two additions. Check that one of them is a
32826 // constant vector with ones, and the other two are promoted from i8/i16.
32827 for (int i = 0; i < 3; ++i) {
32828 if (!IsConstVectorInRange(Operands[i], 1, 1))
32830 std::swap(Operands[i], Operands[2]);
32832 // Check if Operands[0] and Operands[1] are results of type promotion.
32833 for (int j = 0; j < 2; ++j)
32834 if (Operands[j].getOpcode() != ISD::ZERO_EXTEND ||
32835 Operands[j].getOperand(0).getValueType() != VT)
32838 // The pattern is detected, emit X86ISD::AVG instruction.
32839 return DAG.getNode(X86ISD::AVG, DL, VT, Operands[0].getOperand(0),
32840 Operands[1].getOperand(0));
32846 static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
32847 TargetLowering::DAGCombinerInfo &DCI,
32848 const X86Subtarget &Subtarget) {
32849 LoadSDNode *Ld = cast<LoadSDNode>(N);
32850 EVT RegVT = Ld->getValueType(0);
32851 EVT MemVT = Ld->getMemoryVT();
32853 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
32855 // For chips with slow 32-byte unaligned loads, break the 32-byte operation
32856 // into two 16-byte operations. Also split non-temporal aligned loads on
32857 // pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
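  // The split below produces two 16-byte loads whose chains are joined by a
  // TokenFactor and whose values are re-inserted into a 256-bit vector.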
32858 ISD::LoadExtType Ext = Ld->getExtensionType();
  bool Fast;
  unsigned AddressSpace = Ld->getAddressSpace();
32861 unsigned Alignment = Ld->getAlignment();
32862 if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
32863 Ext == ISD::NON_EXTLOAD &&
32864 ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
32865 (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
32866 AddressSpace, Alignment, &Fast) && !Fast))) {
32867 unsigned NumElems = RegVT.getVectorNumElements();
32871 SDValue Ptr = Ld->getBasePtr();
32873 EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
32876 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32877 Alignment, Ld->getMemOperand()->getFlags());
32879 Ptr = DAG.getMemBasePlusOffset(Ptr, 16, dl);
32881 DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
32882 std::min(16U, Alignment), Ld->getMemOperand()->getFlags());
32883 SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
32885 Load2.getValue(1));
32887 SDValue NewVec = DAG.getUNDEF(RegVT);
32888 NewVec = insert128BitVector(NewVec, Load1, 0, DAG, dl);
32889 NewVec = insert128BitVector(NewVec, Load2, NumElems / 2, DAG, dl);
32890 return DCI.CombineTo(N, NewVec, TF, true);
32896 /// If V is a build vector of boolean constants and exactly one of those
32897 /// constants is true, return the operand index of that true element.
32898 /// Otherwise, return -1.
32899 static int getOneTrueElt(SDValue V) {
32900 // This needs to be a build vector of booleans.
32901 // TODO: Checking for the i1 type matches the IR definition for the mask,
32902 // but the mask check could be loosened to i8 or other types. That might
32903 // also require checking more than 'allOnesValue'; eg, the x86 HW
32904 // instructions only require that the MSB is set for each mask element.
32905 // The ISD::MSTORE comments/definition do not specify how the mask operand
32907 auto *BV = dyn_cast<BuildVectorSDNode>(V);
32908 if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
32911 int TrueIndex = -1;
32912 unsigned NumElts = BV->getValueType(0).getVectorNumElements();
32913 for (unsigned i = 0; i < NumElts; ++i) {
32914 const SDValue &Op = BV->getOperand(i);
32917 auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
32920 if (ConstNode->getAPIntValue().isAllOnesValue()) {
32921 // If we already found a one, this is too many.
32922 if (TrueIndex >= 0)
32930 /// Given a masked memory load/store operation, return true if it has one mask
32931 /// bit set. If it has one mask bit set, then also return the memory address of
32932 /// the scalar element to load/store, the vector index to insert/extract that
32933 /// scalar element, and the alignment for the scalar memory access.
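/// For example, a v4f32 operation whose mask is <0,0,1,0> touches only element
/// 2, so Addr becomes BasePtr + 8, Index becomes 2, and the alignment is
/// clamped with MinAlign so it remains correct for the offset element.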
32934 static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
32935 SelectionDAG &DAG, SDValue &Addr,
32936 SDValue &Index, unsigned &Alignment) {
32937 int TrueMaskElt = getOneTrueElt(MaskedOp->getMask());
32938 if (TrueMaskElt < 0)
32941 // Get the address of the one scalar element that is specified by the mask
32942 // using the appropriate offset from the base pointer.
32943 EVT EltVT = MaskedOp->getMemoryVT().getVectorElementType();
32944 Addr = MaskedOp->getBasePtr();
32945 if (TrueMaskElt != 0) {
32946 unsigned Offset = TrueMaskElt * EltVT.getStoreSize();
32947 Addr = DAG.getMemBasePlusOffset(Addr, Offset, SDLoc(MaskedOp));
32950 Index = DAG.getIntPtrConstant(TrueMaskElt, SDLoc(MaskedOp));
32951 Alignment = MinAlign(MaskedOp->getAlignment(), EltVT.getStoreSize());
32955 /// If exactly one element of the mask is set for a non-extending masked load,
32956 /// it is a scalar load and vector insert.
32957 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
32958 /// mask have already been optimized in IR, so we don't bother with those here.
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32961 TargetLowering::DAGCombinerInfo &DCI) {
32962 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
32963 // However, some target hooks may need to be added to know when the transform
32964 // is profitable. Endianness would also have to be considered.
32966 SDValue Addr, VecIndex;
32967 unsigned Alignment;
32968 if (!getParamsForOneTrueMaskedElt(ML, DAG, Addr, VecIndex, Alignment))
32971 // Load the one scalar element that is specified by the mask using the
32972 // appropriate offset from the base pointer.
32974 EVT VT = ML->getValueType(0);
32975 EVT EltVT = VT.getVectorElementType();
32977 DAG.getLoad(EltVT, DL, ML->getChain(), Addr, ML->getPointerInfo(),
32978 Alignment, ML->getMemOperand()->getFlags());
32980 // Insert the loaded element into the appropriate place in the vector.
32981 SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, ML->getSrc0(),
32983 return DCI.CombineTo(ML, Insert, Load.getValue(1), true);
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
32988 TargetLowering::DAGCombinerInfo &DCI) {
32989 if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
32993 EVT VT = ML->getValueType(0);
32995 // If we are loading the first and last elements of a vector, it is safe and
32996 // always faster to load the whole vector. Replace the masked load with a
32997 // vector load and select.
32998 unsigned NumElts = VT.getVectorNumElements();
32999 BuildVectorSDNode *MaskBV = cast<BuildVectorSDNode>(ML->getMask());
33000 bool LoadFirstElt = !isNullConstant(MaskBV->getOperand(0));
33001 bool LoadLastElt = !isNullConstant(MaskBV->getOperand(NumElts - 1));
33002 if (LoadFirstElt && LoadLastElt) {
33003 SDValue VecLd = DAG.getLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33004 ML->getMemOperand());
33005 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), VecLd, ML->getSrc0());
33006 return DCI.CombineTo(ML, Blend, VecLd.getValue(1), true);
33009 // Convert a masked load with a constant mask into a masked load and a select.
33010 // This allows the select operation to use a faster kind of select instruction
33011 // (for example, vblendvps -> vblendps).
33013 // Don't try this if the pass-through operand is already undefined. That would
33014 // cause an infinite loop because that's what we're about to create.
33015 if (ML->getSrc0().isUndef())
33018 // The new masked load has an undef pass-through operand. The select uses the
33019 // original pass-through operand.
33020 SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
33021 ML->getMask(), DAG.getUNDEF(VT),
33022 ML->getMemoryVT(), ML->getMemOperand(),
33023 ML->getExtensionType());
33024 SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML, ML->getSrc0());
33026 return DCI.CombineTo(ML, Blend, NewML.getValue(1), true);
33029 static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
33030 TargetLowering::DAGCombinerInfo &DCI,
33031 const X86Subtarget &Subtarget) {
33032 MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
33034 // TODO: Expanding load with constant mask may be optimized as well.
33035 if (Mld->isExpandingLoad())
33038 if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
33039 if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
33041 // TODO: Do some AVX512 subsets benefit from this transform?
33042 if (!Subtarget.hasAVX512())
33043 if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
33047 if (Mld->getExtensionType() != ISD::SEXTLOAD)
33050 // Resolve extending loads.
33051 EVT VT = Mld->getValueType(0);
33052 unsigned NumElems = VT.getVectorNumElements();
33053 EVT LdVT = Mld->getMemoryVT();
33056 assert(LdVT != VT && "Cannot extend to the same type");
33057 unsigned ToSz = VT.getScalarSizeInBits();
33058 unsigned FromSz = LdVT.getScalarSizeInBits();
33059 // From/To sizes and ElemCount must be pow of two.
33060 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33061 "Unexpected size for extending masked load");
33063 unsigned SizeRatio = ToSz / FromSz;
33064 assert(SizeRatio * NumElems * FromSz == VT.getSizeInBits());
33066 // Create a type on which we perform the shuffle.
33067 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33068 LdVT.getScalarType(), NumElems*SizeRatio);
33069 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33071 // Convert Src0 value.
33072 SDValue WideSrc0 = DAG.getBitcast(WideVecVT, Mld->getSrc0());
33073 if (!Mld->getSrc0().isUndef()) {
33074 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33075 for (unsigned i = 0; i != NumElems; ++i)
33076 ShuffleVec[i] = i * SizeRatio;
33078 // Can't shuffle using an illegal type.
33079 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33080 "WideVecVT should be legal");
33081 WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
33082 DAG.getUNDEF(WideVecVT), ShuffleVec);
33084 // Prepare the new mask.
33086 SDValue Mask = Mld->getMask();
33087 if (Mask.getValueType() == VT) {
33088 // Mask and original value have the same type.
33089 NewMask = DAG.getBitcast(WideVecVT, Mask);
33090 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33091 for (unsigned i = 0; i != NumElems; ++i)
33092 ShuffleVec[i] = i * SizeRatio;
33093 for (unsigned i = NumElems; i != NumElems * SizeRatio; ++i)
33094 ShuffleVec[i] = NumElems * SizeRatio;
33095 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33096 DAG.getConstant(0, dl, WideVecVT),
33099 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33100 unsigned WidenNumElts = NumElems*SizeRatio;
33101 unsigned MaskNumElts = VT.getVectorNumElements();
33102 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33105 unsigned NumConcat = WidenNumElts / MaskNumElts;
33106 SmallVector<SDValue, 16> Ops(NumConcat);
33107 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33109 for (unsigned i = 1; i != NumConcat; ++i)
33112 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33115 SDValue WideLd = DAG.getMaskedLoad(WideVecVT, dl, Mld->getChain(),
33116 Mld->getBasePtr(), NewMask, WideSrc0,
33117 Mld->getMemoryVT(), Mld->getMemOperand(),
33119 SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
33120 return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
33123 /// If exactly one element of the mask is set for a non-truncating masked store,
33124 /// it is a vector extract and scalar store.
33125 /// Note: It is expected that the degenerate cases of an all-zeros or all-ones
33126 /// mask have already been optimized in IR, so we don't bother with those here.
33127 static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
33128 SelectionDAG &DAG) {
33129 // TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
33130 // However, some target hooks may need to be added to know when the transform
33131 // is profitable. Endianness would also have to be considered.
33133 SDValue Addr, VecIndex;
33134 unsigned Alignment;
33135 if (!getParamsForOneTrueMaskedElt(MS, DAG, Addr, VecIndex, Alignment))
33138 // Extract the one scalar element that is actually being stored.
33140 EVT VT = MS->getValue().getValueType();
33141 EVT EltVT = VT.getVectorElementType();
33142 SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
33143 MS->getValue(), VecIndex);
33145 // Store that element at the appropriate offset from the base pointer.
33146 return DAG.getStore(MS->getChain(), DL, Extract, Addr, MS->getPointerInfo(),
33147 Alignment, MS->getMemOperand()->getFlags());
33150 static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
33151 const X86Subtarget &Subtarget) {
33152 MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
33154 if (Mst->isCompressingStore())
33157 if (!Mst->isTruncatingStore())
33158 return reduceMaskedStoreToScalarStore(Mst, DAG);
33160 // Resolve truncating stores.
33161 EVT VT = Mst->getValue().getValueType();
33162 unsigned NumElems = VT.getVectorNumElements();
33163 EVT StVT = Mst->getMemoryVT();
33166 assert(StVT != VT && "Cannot truncate to the same type");
33167 unsigned FromSz = VT.getScalarSizeInBits();
33168 unsigned ToSz = StVT.getScalarSizeInBits();
33170 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33172 // The truncating store is legal in some cases. For example
33173 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33174 // are designated for truncate store.
33175 // In this case we don't need any further transformations.
33176 if (TLI.isTruncStoreLegal(VT, StVT))
33179 // From/To sizes and ElemCount must be pow of two.
33180 assert (isPowerOf2_32(NumElems * FromSz * ToSz) &&
33181 "Unexpected size for truncating masked store");
33182 // We are going to use the original vector elt for storing.
33183 // Accumulated smaller vector elements must be a multiple of the store size.
33184 assert (((NumElems * FromSz) % ToSz) == 0 &&
33185 "Unexpected ratio for truncating masked store");
33187 unsigned SizeRatio = FromSz / ToSz;
33188 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33190 // Create a type on which we perform the shuffle.
33191 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33192 StVT.getScalarType(), NumElems*SizeRatio);
33194 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33196 SDValue WideVec = DAG.getBitcast(WideVecVT, Mst->getValue());
33197 SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
33198 for (unsigned i = 0; i != NumElems; ++i)
33199 ShuffleVec[i] = i * SizeRatio;
33201 // Can't shuffle using an illegal type.
33202 assert(DAG.getTargetLoweringInfo().isTypeLegal(WideVecVT) &&
33203 "WideVecVT should be legal");
33205 SDValue TruncatedVal = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33206 DAG.getUNDEF(WideVecVT),
33210 SDValue Mask = Mst->getMask();
33211 if (Mask.getValueType() == VT) {
33212 // Mask and original value have the same type.
33213 NewMask = DAG.getBitcast(WideVecVT, Mask);
33214 for (unsigned i = 0; i != NumElems; ++i)
33215 ShuffleVec[i] = i * SizeRatio;
33216 for (unsigned i = NumElems; i != NumElems*SizeRatio; ++i)
33217 ShuffleVec[i] = NumElems*SizeRatio;
33218 NewMask = DAG.getVectorShuffle(WideVecVT, dl, NewMask,
33219 DAG.getConstant(0, dl, WideVecVT),
33222 assert(Mask.getValueType().getVectorElementType() == MVT::i1);
33223 unsigned WidenNumElts = NumElems*SizeRatio;
33224 unsigned MaskNumElts = VT.getVectorNumElements();
33225 EVT NewMaskVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
33228 unsigned NumConcat = WidenNumElts / MaskNumElts;
33229 SmallVector<SDValue, 16> Ops(NumConcat);
33230 SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
33232 for (unsigned i = 1; i != NumConcat; ++i)
33235 NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
33238 return DAG.getMaskedStore(Mst->getChain(), dl, TruncatedVal,
33239 Mst->getBasePtr(), NewMask, StVT,
33240 Mst->getMemOperand(), false);
33243 static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
33244 const X86Subtarget &Subtarget) {
33245 StoreSDNode *St = cast<StoreSDNode>(N);
33246 EVT VT = St->getValue().getValueType();
33247 EVT StVT = St->getMemoryVT();
33249 SDValue StoredVal = St->getOperand(1);
33250 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33252 // If we are saving a concatenation of two XMM registers and 32-byte stores
33253 // are slow, such as on Sandy Bridge, perform two 16-byte stores.
  bool Fast;
  unsigned AddressSpace = St->getAddressSpace();
33256 unsigned Alignment = St->getAlignment();
33257 if (VT.is256BitVector() && StVT == VT &&
33258 TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
                             AddressSpace, Alignment, &Fast) &&
      !Fast) {
33261 unsigned NumElems = VT.getVectorNumElements();
33265 SDValue Value0 = extract128BitVector(StoredVal, 0, DAG, dl);
33266 SDValue Value1 = extract128BitVector(StoredVal, NumElems / 2, DAG, dl);
33268 SDValue Ptr0 = St->getBasePtr();
33269 SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, 16, dl);
33272 DAG.getStore(St->getChain(), dl, Value0, Ptr0, St->getPointerInfo(),
33273 Alignment, St->getMemOperand()->getFlags());
33275 DAG.getStore(St->getChain(), dl, Value1, Ptr1, St->getPointerInfo(),
33276 std::min(16U, Alignment), St->getMemOperand()->getFlags());
33277 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
33280 // Optimize trunc store (of multiple scalars) to shuffle and store.
33281 // First, pack all of the elements in one place. Next, store to memory
33282 // in fewer chunks.
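  // For example, a truncating store of v8i32 to v8i16 shuffles the low i16
  // half of each element to the front of the register and then stores the low
  // 128 bits with one or more wide integer stores.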
33283 if (St->isTruncatingStore() && VT.isVector()) {
33284 // Check if we can detect an AVG pattern from the truncation. If yes,
33285 // replace the trunc store by a normal store with the result of X86ISD::AVG
33287 if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
33289 return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
33290 St->getPointerInfo(), St->getAlignment(),
33291 St->getMemOperand()->getFlags());
33294 detectAVX512USatPattern(St->getValue(), St->getMemoryVT(), Subtarget))
33295 return EmitTruncSStore(false /* Unsigned saturation */, St->getChain(),
33296 dl, Val, St->getBasePtr(),
33297 St->getMemoryVT(), St->getMemOperand(), DAG);
33299 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33300 unsigned NumElems = VT.getVectorNumElements();
33301 assert(StVT != VT && "Cannot truncate to the same type");
33302 unsigned FromSz = VT.getScalarSizeInBits();
33303 unsigned ToSz = StVT.getScalarSizeInBits();
33305 // The truncating store is legal in some cases. For example
33306 // vpmovqb, vpmovqw, vpmovqd, vpmovdb, vpmovdw
33307 // are designated for truncate store.
33308 // In this case we don't need any further transformations.
33309 if (TLI.isTruncStoreLegalOrCustom(VT, StVT))
33312 // From, To sizes and ElemCount must be pow of two
33313 if (!isPowerOf2_32(NumElems * FromSz * ToSz)) return SDValue();
33314 // We are going to use the original vector elt for storing.
33315 // Accumulated smaller vector elements must be a multiple of the store size.
33316 if (0 != (NumElems * FromSz) % ToSz) return SDValue();
33318 unsigned SizeRatio = FromSz / ToSz;
33320 assert(SizeRatio * NumElems * ToSz == VT.getSizeInBits());
33322 // Create a type on which we perform the shuffle
33323 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(),
33324 StVT.getScalarType(), NumElems*SizeRatio);
33326 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
33328 SDValue WideVec = DAG.getBitcast(WideVecVT, St->getValue());
33329 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
33330 for (unsigned i = 0; i != NumElems; ++i)
33331 ShuffleVec[i] = i * SizeRatio;
33333 // Can't shuffle using an illegal type.
33334 if (!TLI.isTypeLegal(WideVecVT))
33337 SDValue Shuff = DAG.getVectorShuffle(WideVecVT, dl, WideVec,
33338 DAG.getUNDEF(WideVecVT),
33340 // At this point all of the data is stored at the bottom of the
33341 // register. We now need to save it to mem.
33343 // Find the largest store unit
33344 MVT StoreType = MVT::i8;
33345 for (MVT Tp : MVT::integer_valuetypes()) {
33346 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToSz)
33350 // On 32bit systems, we can't save 64bit integers. Try bitcasting to F64.
33351 if (TLI.isTypeLegal(MVT::f64) && StoreType.getSizeInBits() < 64 &&
33352 (64 <= NumElems * ToSz))
33353 StoreType = MVT::f64;
33355 // Bitcast the original vector into a vector of store-size units
33356 EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
33357 StoreType, VT.getSizeInBits()/StoreType.getSizeInBits());
33358 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
33359 SDValue ShuffWide = DAG.getBitcast(StoreVecVT, Shuff);
33360 SmallVector<SDValue, 8> Chains;
33361 SDValue Ptr = St->getBasePtr();
33363 // Perform one or more big stores into memory.
33364 for (unsigned i=0, e=(ToSz*NumElems)/StoreType.getSizeInBits(); i!=e; ++i) {
33365 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
33366 StoreType, ShuffWide,
33367 DAG.getIntPtrConstant(i, dl));
33369 DAG.getStore(St->getChain(), dl, SubVec, Ptr, St->getPointerInfo(),
33370 St->getAlignment(), St->getMemOperand()->getFlags());
33371 Ptr = DAG.getMemBasePlusOffset(Ptr, StoreType.getStoreSize(), dl);
33372 Chains.push_back(Ch);
33375 return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
33378 // Turn load->store of MMX types into GPR load/stores. This avoids clobbering
33379 // the FP state in cases where an emms may be missing.
33380 // A preferable solution to the general problem is to figure out the right
33381 // places to insert EMMS. This qualifies as a quick hack.
33383 // Similarly, turn load->store of i64 into double load/stores in 32-bit mode.
33384 if (VT.getSizeInBits() != 64)
33387 const Function *F = DAG.getMachineFunction().getFunction();
33388 bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
33390 !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
33391 if ((VT.isVector() ||
33392 (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit())) &&
33393 isa<LoadSDNode>(St->getValue()) &&
33394 !cast<LoadSDNode>(St->getValue())->isVolatile() &&
33395 St->getChain().hasOneUse() && !St->isVolatile()) {
33396 SDNode* LdVal = St->getValue().getNode();
33397 LoadSDNode *Ld = nullptr;
33398 int TokenFactorIndex = -1;
33399 SmallVector<SDValue, 8> Ops;
33400 SDNode* ChainVal = St->getChain().getNode();
33401 // Must be a store of a load. We currently handle two cases: the load
33402 // is a direct child, and it's under an intervening TokenFactor. It is
33403 // possible to dig deeper under nested TokenFactors.
33404 if (ChainVal == LdVal)
33405 Ld = cast<LoadSDNode>(St->getChain());
33406 else if (St->getValue().hasOneUse() &&
33407 ChainVal->getOpcode() == ISD::TokenFactor) {
33408 for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
33409 if (ChainVal->getOperand(i).getNode() == LdVal) {
33410 TokenFactorIndex = i;
33411 Ld = cast<LoadSDNode>(St->getValue());
33413 Ops.push_back(ChainVal->getOperand(i));
33417 if (!Ld || !ISD::isNormalLoad(Ld))
33420 // If this is not the MMX case, i.e. we are just turning i64 load/store
33421 // into f64 load/store, avoid the transformation if there are multiple
33422 // uses of the loaded value.
33423 if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
33428 // If we are a 64-bit capable x86, lower to a single movq load/store pair.
33429 // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
33431 if (Subtarget.is64Bit() || F64IsLegal) {
33432 MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
33433 SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
33434 Ld->getPointerInfo(), Ld->getAlignment(),
33435 Ld->getMemOperand()->getFlags());
33436 // Make sure new load is placed in same chain order.
33437 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
33438 if (TokenFactorIndex >= 0) {
33439 Ops.push_back(NewChain);
33440 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33442 return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
33443 St->getPointerInfo(), St->getAlignment(),
33444 St->getMemOperand()->getFlags());
33447 // Otherwise, lower to two pairs of 32-bit loads / stores.
33448 SDValue LoAddr = Ld->getBasePtr();
33449 SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
33451 SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
33452 Ld->getPointerInfo(), Ld->getAlignment(),
33453 Ld->getMemOperand()->getFlags());
33454 SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
33455 Ld->getPointerInfo().getWithOffset(4),
33456 MinAlign(Ld->getAlignment(), 4),
33457 Ld->getMemOperand()->getFlags());
33458 // Make sure new loads are placed in same chain order.
33459 SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
33460 NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
33462 if (TokenFactorIndex >= 0) {
33463 Ops.push_back(NewChain);
33464 NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
33467 LoAddr = St->getBasePtr();
33468 HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
33471 DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
33472 St->getAlignment(), St->getMemOperand()->getFlags());
33473 SDValue HiSt = DAG.getStore(
33474 NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
33475 MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
33476 return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
33479 // This is similar to the above case, but here we handle a scalar 64-bit
33480 // integer store that is extracted from a vector on a 32-bit target.
33481 // If we have SSE2, then we can treat it like a floating-point double
33482 // to get past legalization. The execution dependencies fixup pass will
33483 // choose the optimal machine instruction for the store if this really is
33484 // an integer or v2f32 rather than an f64.
33485 if (VT == MVT::i64 && F64IsLegal && !Subtarget.is64Bit() &&
33486 St->getOperand(1).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
33487 SDValue OldExtract = St->getOperand(1);
33488 SDValue ExtOp0 = OldExtract.getOperand(0);
33489 unsigned VecSize = ExtOp0.getValueSizeInBits();
33490 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, VecSize / 64);
33491 SDValue BitCast = DAG.getBitcast(VecVT, ExtOp0);
33492 SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
33493 BitCast, OldExtract.getOperand(1));
33494 return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
33495 St->getPointerInfo(), St->getAlignment(),
33496 St->getMemOperand()->getFlags());
33502 /// Return 'true' if this vector operation is "horizontal"
33503 /// and return the operands for the horizontal operation in LHS and RHS. A
33504 /// horizontal operation performs the binary operation on successive elements
33505 /// of its first operand, then on successive elements of its second operand,
33506 /// returning the resulting values in a vector. For example, if
33507 /// A = < float a0, float a1, float a2, float a3 >
33509 /// B = < float b0, float b1, float b2, float b3 >
33510 /// then the result of doing a horizontal operation on A and B is
33511 /// A horizontal-op B = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >.
33512 /// In short, LHS and RHS are inspected to see if LHS op RHS is of the form
33513 /// A horizontal-op B, for some already available A and B, and if so then LHS is
33514 /// set to A, RHS to B, and the routine returns 'true'.
33515 /// Note that the binary operation should have the property that if one of the
33516 /// operands is UNDEF then the result is UNDEF.
33517 static bool isHorizontalBinOp(SDValue &LHS, SDValue &RHS, bool IsCommutative) {
33518 // Look for the following pattern: if
33519 // A = < float a0, float a1, float a2, float a3 >
33520 // B = < float b0, float b1, float b2, float b3 >
33522 // LHS = VECTOR_SHUFFLE A, B, <0, 2, 4, 6>
33523 // RHS = VECTOR_SHUFFLE A, B, <1, 3, 5, 7>
33524 // then LHS op RHS = < a0 op a1, a2 op a3, b0 op b1, b2 op b3 >
33525 // which is A horizontal-op B.
33527 // At least one of the operands should be a vector shuffle.
33528 if (LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
33529 RHS.getOpcode() != ISD::VECTOR_SHUFFLE)
33532 MVT VT = LHS.getSimpleValueType();
33534 assert((VT.is128BitVector() || VT.is256BitVector()) &&
33535 "Unsupported vector type for horizontal add/sub");
33537 // Handle 128 and 256-bit vector lengths. AVX defines horizontal add/sub to
33538 // operate independently on 128-bit lanes.
33539 unsigned NumElts = VT.getVectorNumElements();
33540 unsigned NumLanes = VT.getSizeInBits()/128;
33541 unsigned NumLaneElts = NumElts / NumLanes;
33542 assert((NumLaneElts % 2 == 0) &&
33543 "Vector type should have an even number of elements in each lane");
33544 unsigned HalfLaneElts = NumLaneElts/2;
33546 // View LHS in the form
33547 // LHS = VECTOR_SHUFFLE A, B, LMask
33548 // If LHS is not a shuffle then pretend it is the shuffle
33549 // LHS = VECTOR_SHUFFLE LHS, undef, <0, 1, ..., N-1>
33550 // NOTE: in what follows a default initialized SDValue represents an UNDEF of
33553 SmallVector<int, 16> LMask(NumElts);
33554 if (LHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33555 if (!LHS.getOperand(0).isUndef())
33556 A = LHS.getOperand(0);
33557 if (!LHS.getOperand(1).isUndef())
33558 B = LHS.getOperand(1);
33559 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(LHS.getNode())->getMask();
33560 std::copy(Mask.begin(), Mask.end(), LMask.begin());
33562 if (!LHS.isUndef())
33564 for (unsigned i = 0; i != NumElts; ++i)
33568 // Likewise, view RHS in the form
33569 // RHS = VECTOR_SHUFFLE C, D, RMask
33571 SmallVector<int, 16> RMask(NumElts);
33572 if (RHS.getOpcode() == ISD::VECTOR_SHUFFLE) {
33573 if (!RHS.getOperand(0).isUndef())
33574 C = RHS.getOperand(0);
33575 if (!RHS.getOperand(1).isUndef())
33576 D = RHS.getOperand(1);
33577 ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(RHS.getNode())->getMask();
33578 std::copy(Mask.begin(), Mask.end(), RMask.begin());
33580 if (!RHS.isUndef())
33582 for (unsigned i = 0; i != NumElts; ++i)
33586 // Check that the shuffles are both shuffling the same vectors.
33587 if (!(A == C && B == D) && !(A == D && B == C))
33590 // If everything is UNDEF then bail out: it would be better to fold to UNDEF.
33591 if (!A.getNode() && !B.getNode())
33594 // If A and B occur in reverse order in RHS, then "swap" them (which means
33595 // rewriting the mask).
33597 ShuffleVectorSDNode::commuteMask(RMask);
33599 // At this point LHS and RHS are equivalent to
33600 // LHS = VECTOR_SHUFFLE A, B, LMask
33601 // RHS = VECTOR_SHUFFLE A, B, RMask
33602 // Check that the masks correspond to performing a horizontal operation.
33603 for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
33604 for (unsigned i = 0; i != NumLaneElts; ++i) {
33605 int LIdx = LMask[i+l], RIdx = RMask[i+l];
33607 // Ignore any UNDEF components.
33608 if (LIdx < 0 || RIdx < 0 ||
33609 (!A.getNode() && (LIdx < (int)NumElts || RIdx < (int)NumElts)) ||
33610 (!B.getNode() && (LIdx >= (int)NumElts || RIdx >= (int)NumElts)))
33613 // Check that successive elements are being operated on. If not, this is
33614 // not a horizontal operation.
33615 unsigned Src = (i/HalfLaneElts); // each lane is split between srcs
33616 int Index = 2*(i%HalfLaneElts) + NumElts*Src + l;
33617 if (!(LIdx == Index && RIdx == Index + 1) &&
33618 !(IsCommutative && LIdx == Index + 1 && RIdx == Index))
33623 LHS = A.getNode() ? A : B; // If A is 'UNDEF', use B for it.
33624 RHS = B.getNode() ? B : A; // If B is 'UNDEF', use A for it.
33628 /// Do target-specific dag combines on floating-point adds/subs.
33629 static SDValue combineFaddFsub(SDNode *N, SelectionDAG &DAG,
33630 const X86Subtarget &Subtarget) {
33631 EVT VT = N->getValueType(0);
33632 SDValue LHS = N->getOperand(0);
33633 SDValue RHS = N->getOperand(1);
33634 bool IsFadd = N->getOpcode() == ISD::FADD;
33635 assert((IsFadd || N->getOpcode() == ISD::FSUB) && "Wrong opcode");
33637 // Try to synthesize horizontal add/sub from adds/subs of shuffles.
33638 if (((Subtarget.hasSSE3() && (VT == MVT::v4f32 || VT == MVT::v2f64)) ||
33639 (Subtarget.hasFp256() && (VT == MVT::v8f32 || VT == MVT::v4f64))) &&
33640 isHorizontalBinOp(LHS, RHS, IsFadd)) {
33641 auto NewOpcode = IsFadd ? X86ISD::FHADD : X86ISD::FHSUB;
33642 return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
33647 /// Attempt to pre-truncate inputs to arithmetic ops if it will simplify
33649 /// e.g. TRUNC( BINOP( X, Y ) ) --> BINOP( TRUNC( X ), TRUNC( Y ) )
33650 static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
                                          const X86Subtarget &Subtarget,
                                          SDLoc &DL) {
33653 assert(N->getOpcode() == ISD::TRUNCATE && "Wrong opcode");
33654 SDValue Src = N->getOperand(0);
33655 unsigned Opcode = Src.getOpcode();
33656 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
33658 EVT VT = N->getValueType(0);
33659 EVT SrcVT = Src.getValueType();
33661 auto IsRepeatedOpOrFreeTruncation = [VT](SDValue Op0, SDValue Op1) {
33662 unsigned TruncSizeInBits = VT.getScalarSizeInBits();
33664 // Repeated operand, so we are only trading one output truncation for
33665 // one input truncation.
33669 // See if either operand has been extended from a smaller/equal size to
33670 // the truncation size, allowing a truncation to combine with the extend.
33671 unsigned Opcode0 = Op0.getOpcode();
33672 if ((Opcode0 == ISD::ANY_EXTEND || Opcode0 == ISD::SIGN_EXTEND ||
33673 Opcode0 == ISD::ZERO_EXTEND) &&
33674 Op0.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
33677 unsigned Opcode1 = Op1.getOpcode();
33678 if ((Opcode1 == ISD::ANY_EXTEND || Opcode1 == ISD::SIGN_EXTEND ||
33679 Opcode1 == ISD::ZERO_EXTEND) &&
33680 Op1.getOperand(0).getScalarValueSizeInBits() <= TruncSizeInBits)
    // See if either operand is a single use constant which can be constant
    // folded.
33685 SDValue BC0 = peekThroughOneUseBitcasts(Op0);
33686 SDValue BC1 = peekThroughOneUseBitcasts(Op1);
33687 return ISD::isBuildVectorOfConstantSDNodes(BC0.getNode()) ||
33688 ISD::isBuildVectorOfConstantSDNodes(BC1.getNode());
33691 auto TruncateArithmetic = [&](SDValue N0, SDValue N1) {
33692 SDValue Trunc0 = DAG.getNode(ISD::TRUNCATE, DL, VT, N0);
33693 SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
33694 return DAG.getNode(Opcode, DL, VT, Trunc0, Trunc1);
33697 // Don't combine if the operation has other uses.
33698 if (!N->isOnlyUserOf(Src.getNode()))
33701 // Only support vector truncation for now.
33702 // TODO: i64 scalar math would benefit as well.
33703 if (!VT.isVector())
  // In most cases it's only worth pre-truncating if we're only facing the cost
33707 // of one truncation.
33708 // i.e. if one of the inputs will constant fold or the input is repeated.
33713 SDValue Op0 = Src.getOperand(0);
33714 SDValue Op1 = Src.getOperand(1);
33715 if (TLI.isOperationLegalOrPromote(Opcode, VT) &&
33716 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33717 return TruncateArithmetic(Op0, Op1);
  // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
33723 // better to truncate if we have the chance.
33724 if (SrcVT.getScalarType() == MVT::i64 && TLI.isOperationLegal(Opcode, VT) &&
33725 !TLI.isOperationLegal(Opcode, SrcVT))
33726 return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
33729 SDValue Op0 = Src.getOperand(0);
33730 SDValue Op1 = Src.getOperand(1);
33731 if (TLI.isOperationLegal(Opcode, VT) &&
33732 IsRepeatedOpOrFreeTruncation(Op0, Op1))
33733 return TruncateArithmetic(Op0, Op1);
33741 /// Truncate a group of v4i32 into v16i8/v8i16 using X86ISD::PACKUS.
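/// Each round first masks the inputs to the low bits that survive the
/// truncation and then packs pairs of registers into one register of
/// half-width elements (e.g. two v4i32 -> one v8i16), so no saturation occurs.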
static SDValue
combineVectorTruncationWithPACKUS(SDNode *N, SelectionDAG &DAG,
33744 SmallVector<SDValue, 8> &Regs) {
33745 assert(Regs.size() > 0 && (Regs[0].getValueType() == MVT::v4i32 ||
33746 Regs[0].getValueType() == MVT::v2i64));
33747 EVT OutVT = N->getValueType(0);
33748 EVT OutSVT = OutVT.getVectorElementType();
33749 EVT InVT = Regs[0].getValueType();
33750 EVT InSVT = InVT.getVectorElementType();
33753 // First, use mask to unset all bits that won't appear in the result.
33754 assert((OutSVT == MVT::i8 || OutSVT == MVT::i16) &&
33755 "OutSVT can only be either i8 or i16.");
33757 APInt::getLowBitsSet(InSVT.getSizeInBits(), OutSVT.getSizeInBits());
33758 SDValue MaskVal = DAG.getConstant(Mask, DL, InVT);
33759 for (auto &Reg : Regs)
33760 Reg = DAG.getNode(ISD::AND, DL, InVT, MaskVal, Reg);
33762 MVT UnpackedVT, PackedVT;
33763 if (OutSVT == MVT::i8) {
33764 UnpackedVT = MVT::v8i16;
33765 PackedVT = MVT::v16i8;
33767 UnpackedVT = MVT::v4i32;
33768 PackedVT = MVT::v8i16;
33771 // In each iteration, truncate the type by a half size.
33772 auto RegNum = Regs.size();
33773 for (unsigned j = 1, e = InSVT.getSizeInBits() / OutSVT.getSizeInBits();
33774 j < e; j *= 2, RegNum /= 2) {
33775 for (unsigned i = 0; i < RegNum; i++)
33776 Regs[i] = DAG.getBitcast(UnpackedVT, Regs[i]);
33777 for (unsigned i = 0; i < RegNum / 2; i++)
33778 Regs[i] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[i * 2],
33782 // If the type of the result is v8i8, we need to do one more X86ISD::PACKUS, and
33783 // then extract a subvector as the result since v8i8 is not a legal type.
33784 if (OutVT == MVT::v8i8) {
33785 Regs[0] = DAG.getNode(X86ISD::PACKUS, DL, PackedVT, Regs[0], Regs[0]);
33786 Regs[0] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OutVT, Regs[0],
33787 DAG.getIntPtrConstant(0, DL));
33789 } else if (RegNum > 1) {
33790 Regs.resize(RegNum);
33791 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33796 /// Truncate a group of v4i32 into v8i16 using X86ISD::PACKSS.
33798 combineVectorTruncationWithPACKSS(SDNode *N, const X86Subtarget &Subtarget,
33800 SmallVector<SDValue, 8> &Regs) {
33801 assert(Regs.size() > 0 && Regs[0].getValueType() == MVT::v4i32);
33802 EVT OutVT = N->getValueType(0);
33805 // Shift left by 16 bits, then arithmetic-shift right by 16 bits.
33806 SDValue ShAmt = DAG.getConstant(16, DL, MVT::i32);
33807 for (auto &Reg : Regs) {
33808 Reg = getTargetVShiftNode(X86ISD::VSHLI, DL, MVT::v4i32, Reg, ShAmt,
33810 Reg = getTargetVShiftNode(X86ISD::VSRAI, DL, MVT::v4i32, Reg, ShAmt,
33814 for (unsigned i = 0, e = Regs.size() / 2; i < e; i++)
33815 Regs[i] = DAG.getNode(X86ISD::PACKSS, DL, MVT::v8i16, Regs[i * 2],
33818 if (Regs.size() > 2) {
33819 Regs.resize(Regs.size() / 2);
33820 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Regs);
33825 /// This function transforms truncation from vXi32/vXi64 to vXi8/vXi16 into
33826 /// X86ISD::PACKUS/X86ISD::PACKSS operations. We do it here because after type
33827 /// legalization the truncation will be translated into a BUILD_VECTOR with each
33828 /// element extracted from a vector and then truncated, and it is
33829 /// difficult to do this optimization on that form.
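/// A rough example (assuming SSE4.1): a v8i32 -> v8i16 truncation is split
/// into two v4i32 halves, each half is masked to its low 16 bits, and the
/// halves are then packed back together with PACKUS.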
33830 static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
33831 const X86Subtarget &Subtarget) {
33832 EVT OutVT = N->getValueType(0);
33833 if (!OutVT.isVector())
33836 SDValue In = N->getOperand(0);
33837 if (!In.getValueType().isSimple())
33840 EVT InVT = In.getValueType();
33841 unsigned NumElems = OutVT.getVectorNumElements();
33843 // TODO: On AVX2, the behavior of X86ISD::PACKUS is different from that on
33844 // SSE2, and we need to take care of it specially.
33845 // AVX512 provides vpmovdb.
33846 if (!Subtarget.hasSSE2() || Subtarget.hasAVX2())
33849 EVT OutSVT = OutVT.getVectorElementType();
33850 EVT InSVT = InVT.getVectorElementType();
33851 if (!((InSVT == MVT::i32 || InSVT == MVT::i64) &&
33852 (OutSVT == MVT::i8 || OutSVT == MVT::i16) && isPowerOf2_32(NumElems) &&
33857 // SSSE3's pshufb results in fewer instructions in the cases below.
33857 if (Subtarget.hasSSSE3() && NumElems == 8 &&
33858 ((OutSVT == MVT::i8 && InSVT != MVT::i64) ||
33859 (InSVT == MVT::i32 && OutSVT == MVT::i16)))
33864 // Split a long vector into vectors of legal type.
33865 unsigned RegNum = InVT.getSizeInBits() / 128;
33866 SmallVector<SDValue, 8> SubVec(RegNum);
33867 unsigned NumSubRegElts = 128 / InSVT.getSizeInBits();
33868 EVT SubRegVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubRegElts);
33870 for (unsigned i = 0; i < RegNum; i++)
33871 SubVec[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubRegVT, In,
33872 DAG.getIntPtrConstant(i * NumSubRegElts, DL));
33874 // SSE2 provides PACKUS for only 2 x v8i16 -> v16i8 and SSE4.1 provides PACKUS
33875 // for 2 x v4i32 -> v8i16. For SSSE3 and below, we need to use PACKSS to
33876 // truncate 2 x v4i32 to v8i16.
33877 if (Subtarget.hasSSE41() || OutSVT == MVT::i8)
33878 return combineVectorTruncationWithPACKUS(N, DAG, SubVec);
33879 else if (InSVT == MVT::i32)
33880 return combineVectorTruncationWithPACKSS(N, Subtarget, DAG, SubVec);
33885 /// This function transforms a vector truncation of 'all or none' bit values
33886 /// (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into X86ISD::PACKSS operations.
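/// For example, a v8i32 comparison result (each lane all-ones or all-zeros)
/// can be truncated to v8i16 with a single PACKSSDW of its two 128-bit halves,
/// since signed saturation preserves 0 and -1 exactly.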
33887 static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
33889 const X86Subtarget &Subtarget) {
33890 // Requires SSE2 but AVX512 has fast truncate.
33891 if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
33894 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple())
33897 SDValue In = N->getOperand(0);
33898 if (!In.getValueType().isSimple())
33901 MVT VT = N->getValueType(0).getSimpleVT();
33902 MVT SVT = VT.getScalarType();
33904 MVT InVT = In.getValueType().getSimpleVT();
33905 MVT InSVT = InVT.getScalarType();
33907 // Use PACKSS if the input is a splatted sign bit.
33908 // e.g. Comparison result, sext_in_reg, etc.
33909 unsigned NumSignBits = DAG.ComputeNumSignBits(In);
33910 if (NumSignBits != InSVT.getSizeInBits())
33913 // Check we have a truncation suited for PACKSS.
33914 if (!VT.is128BitVector() && !VT.is256BitVector())
33916 if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
33918 if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
33921 return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
33924 static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
33925 const X86Subtarget &Subtarget) {
33926 EVT VT = N->getValueType(0);
33927 SDValue Src = N->getOperand(0);
33930 // Attempt to pre-truncate inputs to arithmetic ops instead.
33931 if (SDValue V = combineTruncatedArithmetic(N, DAG, Subtarget, DL))
33934 // Try to detect AVG pattern first.
33935 if (SDValue Avg = detectAVGPattern(Src, VT, DAG, Subtarget, DL))
33938 // Try to combine truncation with unsigned saturation.
33939 if (SDValue Val = combineTruncateWithUSat(Src, VT, DL, DAG, Subtarget))
33942 // The bitcast source is a direct MMX result.
33943 // Detect bitcasts from x86mmx to i32.
33944 if (Src.getOpcode() == ISD::BITCAST && VT == MVT::i32) {
33945 SDValue BCSrc = Src.getOperand(0);
33946 if (BCSrc.getValueType() == MVT::x86mmx)
33947 return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
33950 // Try to truncate extended sign bits with PACKSS.
33951 if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
33954 return combineVectorTruncation(N, DAG, Subtarget);
33957 /// Returns the negated value if the node \p N flips the sign of an FP value.
33959 /// An FP-negation node may have different forms: FNEG(x) or FXOR(x, 0x80000000).
33960 /// AVX512F does not have FXOR, so FNEG is lowered as
33961 /// (bitcast (xor (bitcast x), (bitcast ConstantFP(0x80000000)))).
33962 /// In this case we look through all the bitcasts.
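/// For instance, on f32 the sign mask is 0x80000000, so an FNEG of a v4f32
/// value may appear as (v4f32 bitcast (xor (v4i32 bitcast X), splat(0x80000000))).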
33963 static SDValue isFNEG(SDNode *N) {
33964 if (N->getOpcode() == ISD::FNEG)
33965 return N->getOperand(0);
33967 SDValue Op = peekThroughBitcasts(SDValue(N, 0));
33968 if (Op.getOpcode() != X86ISD::FXOR && Op.getOpcode() != ISD::XOR)
33971 SDValue Op1 = peekThroughBitcasts(Op.getOperand(1));
33972 if (!Op1.getValueType().isFloatingPoint())
33975 SDValue Op0 = peekThroughBitcasts(Op.getOperand(0));
33977 unsigned EltBits = Op1.getScalarValueSizeInBits();
33978 auto isSignMask = [&](const ConstantFP *C) {
33979 return C->getValueAPF().bitcastToAPInt() == APInt::getSignMask(EltBits);
33982 // There is more than one way to represent the same constant on
33983 // the different X86 targets. The type of the node may also depend on size.
33984 // - load scalar value and broadcast
33985 // - BUILD_VECTOR node
33986 // - load from a constant pool.
33987 // We check all variants here.
33988 if (Op1.getOpcode() == X86ISD::VBROADCAST) {
33989 if (auto *C = getTargetConstantFromNode(Op1.getOperand(0)))
33990 if (isSignMask(cast<ConstantFP>(C)))
33993 } else if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op1)) {
33994 if (ConstantFPSDNode *CN = BV->getConstantFPSplatNode())
33995 if (isSignMask(CN->getConstantFPValue()))
33998 } else if (auto *C = getTargetConstantFromNode(Op1)) {
33999 if (C->getType()->isVectorTy()) {
34000 if (auto *SplatV = C->getSplatValue())
34001 if (isSignMask(cast<ConstantFP>(SplatV)))
34003 } else if (auto *FPConst = dyn_cast<ConstantFP>(C))
34004 if (isSignMask(FPConst))
34010 /// Do target-specific dag combines on floating point negations.
34011 static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
34012 const X86Subtarget &Subtarget) {
34013 EVT OrigVT = N->getValueType(0);
34014 SDValue Arg = isFNEG(N);
34015 assert(Arg.getNode() && "N is expected to be an FNEG node");
34017 EVT VT = Arg.getValueType();
34018 EVT SVT = VT.getScalarType();
34021 // Let legalize expand this if it isn't a legal type yet.
34022 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34025 // If we're negating a FMUL node on a target with FMA, then we can avoid the
34026 // use of a constant by performing (-0 - A*B) instead.
34027 // FIXME: Check rounding control flags as well once it becomes available.
34028 if (Arg.getOpcode() == ISD::FMUL && (SVT == MVT::f32 || SVT == MVT::f64) &&
34029 Arg->getFlags().hasNoSignedZeros() && Subtarget.hasAnyFMA()) {
34030 SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
34031 SDValue NewNode = DAG.getNode(X86ISD::FNMSUB, DL, VT, Arg.getOperand(0),
34032 Arg.getOperand(1), Zero);
34033 return DAG.getBitcast(OrigVT, NewNode);
34036 // If we're negating an FMA node, then we can adjust the
34037 // instruction to include the extra negation.
34038 unsigned NewOpcode = 0;
34039 if (Arg.hasOneUse()) {
34040 switch (Arg.getOpcode()) {
34041 case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
34042 case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
34043 case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
34044 case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
34045 case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
34046 case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
34047 case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
34048 case X86ISD::FNMSUB_RND: NewOpcode = X86ISD::FMADD_RND; break;
34049 // We can't handle a scalar intrinsic node here because it would only
34050 // invert one element and not the whole vector. But we could try to handle
34051 // a negation of the lower element only.
34055 return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT,
34056 Arg.getNode()->ops()));
34061 static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
34062 const X86Subtarget &Subtarget) {
34063 MVT VT = N->getSimpleValueType(0);
34064 // If we have integer vector types available, use the integer opcodes.
34065 if (VT.isVector() && Subtarget.hasSSE2()) {
34068 MVT IntVT = MVT::getVectorVT(MVT::i64, VT.getSizeInBits() / 64);
34070 SDValue Op0 = DAG.getBitcast(IntVT, N->getOperand(0));
34071 SDValue Op1 = DAG.getBitcast(IntVT, N->getOperand(1));
34072 unsigned IntOpcode;
34073 switch (N->getOpcode()) {
34074 default: llvm_unreachable("Unexpected FP logic op");
34075 case X86ISD::FOR: IntOpcode = ISD::OR; break;
34076 case X86ISD::FXOR: IntOpcode = ISD::XOR; break;
34077 case X86ISD::FAND: IntOpcode = ISD::AND; break;
34078 case X86ISD::FANDN: IntOpcode = X86ISD::ANDNP; break;
34080 SDValue IntOp = DAG.getNode(IntOpcode, dl, IntVT, Op0, Op1);
34081 return DAG.getBitcast(VT, IntOp);
34086 static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
34087 TargetLowering::DAGCombinerInfo &DCI,
34088 const X86Subtarget &Subtarget) {
34089 if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
34092 if (DCI.isBeforeLegalizeOps())
34095 if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
34098 if (Subtarget.hasCMov())
34099 if (SDValue RV = combineIntegerAbs(N, DAG))
34102 if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
34106 return combineFneg(N, DAG, Subtarget);
34111 static bool isNullFPScalarOrVectorConst(SDValue V) {
34112 return isNullFPConstant(V) || ISD::isBuildVectorAllZeros(V.getNode());
34115 /// If a value is a scalar FP zero or a vector FP zero (potentially including
34116 /// undefined elements), return a zero constant that may be used to fold away
34117 /// that value. In the case of a vector, the returned constant will not contain
34118 /// undefined elements even if the input parameter does. This makes it suitable
34119 /// to be used as a replacement operand with operations (eg, bitwise-and) where
34120 /// an undef should not propagate.
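/// For example, a v4f32 build_vector <0.0, undef, 0.0, undef> is replaced by
/// a fully-defined all-zeros v4f32 constant.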
34121 static SDValue getNullFPConstForNullVal(SDValue V, SelectionDAG &DAG,
34122 const X86Subtarget &Subtarget) {
34123 if (!isNullFPScalarOrVectorConst(V))
34126 if (V.getValueType().isVector())
34127 return getZeroVector(V.getSimpleValueType(), Subtarget, DAG, SDLoc(V));
34132 static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
34133 const X86Subtarget &Subtarget) {
34134 SDValue N0 = N->getOperand(0);
34135 SDValue N1 = N->getOperand(1);
34136 EVT VT = N->getValueType(0);
34139 // Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
34140 if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
34141 (VT == MVT::f64 && Subtarget.hasSSE2())))
34144 auto isAllOnesConstantFP = [](SDValue V) {
34145 auto *C = dyn_cast<ConstantFPSDNode>(V);
34146 return C && C->getConstantFPValue()->isAllOnesValue();
34149 // fand (fxor X, -1), Y --> fandn X, Y
34150 if (N0.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N0.getOperand(1)))
34151 return DAG.getNode(X86ISD::FANDN, DL, VT, N0.getOperand(0), N1);
34153 // fand X, (fxor Y, -1) --> fandn Y, X
34154 if (N1.getOpcode() == X86ISD::FXOR && isAllOnesConstantFP(N1.getOperand(1)))
34155 return DAG.getNode(X86ISD::FANDN, DL, VT, N1.getOperand(0), N0);
34160 /// Do target-specific dag combines on X86ISD::FAND nodes.
34161 static SDValue combineFAnd(SDNode *N, SelectionDAG &DAG,
34162 const X86Subtarget &Subtarget) {
34163 // FAND(0.0, x) -> 0.0
34164 if (SDValue V = getNullFPConstForNullVal(N->getOperand(0), DAG, Subtarget))
34167 // FAND(x, 0.0) -> 0.0
34168 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34171 if (SDValue V = combineFAndFNotToFAndn(N, DAG, Subtarget))
34174 return lowerX86FPLogicOp(N, DAG, Subtarget);
34177 /// Do target-specific dag combines on X86ISD::FANDN nodes.
34178 static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
34179 const X86Subtarget &Subtarget) {
34180 // FANDN(0.0, x) -> x
34181 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34182 return N->getOperand(1);
34184 // FANDN(x, 0.0) -> 0.0
34185 if (SDValue V = getNullFPConstForNullVal(N->getOperand(1), DAG, Subtarget))
34188 return lowerX86FPLogicOp(N, DAG, Subtarget);
34191 /// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
34192 static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
34193 const X86Subtarget &Subtarget) {
34194 assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
34196 // F[X]OR(0.0, x) -> x
34197 if (isNullFPScalarOrVectorConst(N->getOperand(0)))
34198 return N->getOperand(1);
34200 // F[X]OR(x, 0.0) -> x
34201 if (isNullFPScalarOrVectorConst(N->getOperand(1)))
34202 return N->getOperand(0);
34205 if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
34208 return lowerX86FPLogicOp(N, DAG, Subtarget);
34211 /// Do target-specific dag combines on X86ISD::FMIN and X86ISD::FMAX nodes.
34212 static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
34213 assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
34215 // Only perform optimizations if UnsafeMath is used.
34216 if (!DAG.getTarget().Options.UnsafeFPMath)
34219 // If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
34220 // into FMAXC and FMINC, which are commutative operations.
34221 unsigned NewOp = 0;
34222 switch (N->getOpcode()) {
34223 default: llvm_unreachable("unknown opcode");
34224 case X86ISD::FMIN: NewOp = X86ISD::FMINC; break;
34225 case X86ISD::FMAX: NewOp = X86ISD::FMAXC; break;
34228 return DAG.getNode(NewOp, SDLoc(N), N->getValueType(0),
34229 N->getOperand(0), N->getOperand(1));
34232 static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
34233 const X86Subtarget &Subtarget) {
34234 if (Subtarget.useSoftFloat())
34237 // TODO: Check for global or instruction-level "nnan". In that case, we
34238 // should be able to lower to FMAX/FMIN alone.
34239 // TODO: If an operand is already known to be a NaN or not a NaN, this
34240 // should be an optional swap and FMAX/FMIN.
34242 EVT VT = N->getValueType(0);
34243 if (!((Subtarget.hasSSE1() && (VT == MVT::f32 || VT == MVT::v4f32)) ||
34244 (Subtarget.hasSSE2() && (VT == MVT::f64 || VT == MVT::v2f64)) ||
34245 (Subtarget.hasAVX() && (VT == MVT::v8f32 || VT == MVT::v4f64))))
34248 // This takes at least 3 instructions, so favor a library call when operating
34249 // on a scalar and minimizing code size.
34250 if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
34253 SDValue Op0 = N->getOperand(0);
34254 SDValue Op1 = N->getOperand(1);
34256 EVT SetCCType = DAG.getTargetLoweringInfo().getSetCCResultType(
34257 DAG.getDataLayout(), *DAG.getContext(), VT);
34259 // There are 4 possibilities involving NaN inputs, and these are the required
34260 // outputs:
34261 //                    Op1
34262 //                Num     NaN
34263 //             ----------------
34264 //        Num  |  Max  |  Op0 |
34265 // Op0         ----------------
34266 //        NaN  |  Op1  |  NaN |
34267 //             ----------------
34269 // The SSE FP max/min instructions were not designed for this case, but rather
34270 // to implement:
34271 // Min = Op1 < Op0 ? Op1 : Op0
34272 // Max = Op1 > Op0 ? Op1 : Op0
34274 // So they always return Op0 if either input is a NaN. However, we can still
34275 // use those instructions for fmaxnum by selecting away a NaN input.
34277 // If either operand is NaN, the 2nd source operand (Op0) is passed through.
34278 auto MinMaxOp = N->getOpcode() == ISD::FMAXNUM ? X86ISD::FMAX : X86ISD::FMIN;
34279 SDValue MinOrMax = DAG.getNode(MinMaxOp, DL, VT, Op1, Op0);
34280 SDValue IsOp0Nan = DAG.getSetCC(DL, SetCCType , Op0, Op0, ISD::SETUO);
34282 // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands
34283 // are NaN, the NaN value of Op1 is the result.
34284 return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax);
34287 /// Do target-specific dag combines on X86ISD::ANDNP nodes.
34288 static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
34289 TargetLowering::DAGCombinerInfo &DCI,
34290 const X86Subtarget &Subtarget) {
34291 // ANDNP(0, x) -> x
34292 if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
34293 return N->getOperand(1);
34295 // ANDNP(x, 0) -> 0
34296 if (ISD::isBuildVectorAllZeros(N->getOperand(1).getNode()))
34297 return getZeroVector(N->getSimpleValueType(0), Subtarget, DAG, SDLoc(N));
34299 EVT VT = N->getValueType(0);
34301 // Attempt to recursively combine a bitmask ANDNP with shuffles.
34302 if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
34304 SmallVector<int, 1> NonceMask; // Just a placeholder.
34305 NonceMask.push_back(0);
34306 if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
34307 /*Depth*/ 1, /*HasVarMask*/ false, DAG,
34309 return SDValue(); // This routine will use CombineTo to replace N.
34315 static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
34316 TargetLowering::DAGCombinerInfo &DCI) {
34317 // BT ignores high bits in the bit index operand.
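// For example, if the index operand is i32, only its low 5 bits are demanded,
// so a mask such as (and Idx, 31) feeding the index can be simplified away.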
34318 SDValue Op1 = N->getOperand(1);
34319 if (Op1.hasOneUse()) {
34320 unsigned BitWidth = Op1.getValueSizeInBits();
34321 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
34323 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
34324 !DCI.isBeforeLegalizeOps());
34325 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34326 if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
34327 TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
34328 DCI.CommitTargetLoweringOpt(TLO);
34333 static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
34334 const X86Subtarget &Subtarget) {
34335 EVT VT = N->getValueType(0);
34336 if (!VT.isVector())
34339 SDValue N0 = N->getOperand(0);
34340 SDValue N1 = N->getOperand(1);
34341 EVT ExtraVT = cast<VTSDNode>(N1)->getVT();
34344 // The SIGN_EXTEND_INREG to v4i64 is an expensive operation on
34345 // both SSE and AVX2 since there is no sign-extended shift right
34346 // operation on a vector with 64-bit elements.
34347 //(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
34348 // (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
34349 if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
34350 N0.getOpcode() == ISD::SIGN_EXTEND)) {
34351 SDValue N00 = N0.getOperand(0);
34353 // EXTLOAD has a better solution on AVX2:
34354 // it may be replaced with an X86ISD::VSEXT node.
34355 if (N00.getOpcode() == ISD::LOAD && Subtarget.hasInt256())
34356 if (!ISD::isNormalLoad(N00.getNode()))
34359 if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
34360 SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
34362 return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
34368 /// sext(add_nsw(x, C)) --> add(sext(x), C_sext)
34369 /// zext(add_nuw(x, C)) --> add(zext(x), C_zext)
34370 /// Promoting a sign/zero extension ahead of a no overflow 'add' exposes
34371 /// opportunities to combine math ops, use an LEA, or use a complex addressing
34372 /// mode. This can eliminate extend, add, and shift instructions.
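/// For instance (illustrative), (i64 sext (add nsw (i32 X), 5)) becomes
/// (i64 add (sext X), 5), where the extended constant is free and the add may
/// later fold into an LEA together with a subsequent add or shift.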
34373 static SDValue promoteExtBeforeAdd(SDNode *Ext, SelectionDAG &DAG,
34374 const X86Subtarget &Subtarget) {
34375 if (Ext->getOpcode() != ISD::SIGN_EXTEND &&
34376 Ext->getOpcode() != ISD::ZERO_EXTEND)
34379 // TODO: This should be valid for other integer types.
34380 EVT VT = Ext->getValueType(0);
34381 if (VT != MVT::i64)
34384 SDValue Add = Ext->getOperand(0);
34385 if (Add.getOpcode() != ISD::ADD)
34388 bool Sext = Ext->getOpcode() == ISD::SIGN_EXTEND;
34389 bool NSW = Add->getFlags().hasNoSignedWrap();
34390 bool NUW = Add->getFlags().hasNoUnsignedWrap();
34392 // We need an 'add nsw' feeding into the 'sext' or an 'add nuw' feeding
34393 // into the 'zext'.
34394 if ((Sext && !NSW) || (!Sext && !NUW))
34397 // Having a constant operand to the 'add' ensures that we are not increasing
34398 // the instruction count because the constant is extended for free below.
34399 // A constant operand can also become the displacement field of an LEA.
34400 auto *AddOp1 = dyn_cast<ConstantSDNode>(Add.getOperand(1));
34404 // Don't make the 'add' bigger if there's no hope of combining it with some
34405 // other 'add' or 'shl' instruction.
34406 // TODO: It may be profitable to generate simpler LEA instructions in place
34407 // of single 'add' instructions, but the cost model for selecting an LEA
34408 // currently has a high threshold.
34409 bool HasLEAPotential = false;
34410 for (auto *User : Ext->uses()) {
34411 if (User->getOpcode() == ISD::ADD || User->getOpcode() == ISD::SHL) {
34412 HasLEAPotential = true;
34416 if (!HasLEAPotential)
34419 // Everything looks good, so pull the '{s|z}ext' ahead of the 'add'.
34420 int64_t AddConstant = Sext ? AddOp1->getSExtValue() : AddOp1->getZExtValue();
34421 SDValue AddOp0 = Add.getOperand(0);
34422 SDValue NewExt = DAG.getNode(Ext->getOpcode(), SDLoc(Ext), VT, AddOp0);
34423 SDValue NewConstant = DAG.getConstant(AddConstant, SDLoc(Add), VT);
34425 // The wider add is guaranteed to not wrap because both operands are
34426 // sign/zero-extended, so the no-wrap flags remain valid on the wider add.
34427 SDNodeFlags Flags;
34428 Flags.setNoSignedWrap(NSW);
34429 Flags.setNoUnsignedWrap(NUW);
34430 return DAG.getNode(ISD::ADD, SDLoc(Add), VT, NewExt, NewConstant, Flags);
34433 /// (i8,i32 {s/z}ext ({s/u}divrem (i8 x, i8 y)) ->
34434 /// (i8,i32 ({s/u}divrem_sext_hreg (i8 x, i8 y)
34435 /// This exposes the {s/z}ext to the sdivrem lowering, so that it directly
34436 /// extends from AH (which we otherwise need to do contortions to access).
34437 static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
34438 SDValue N0 = N->getOperand(0);
34439 auto OpcodeN = N->getOpcode();
34440 auto OpcodeN0 = N0.getOpcode();
34441 if (!((OpcodeN == ISD::SIGN_EXTEND && OpcodeN0 == ISD::SDIVREM) ||
34442 (OpcodeN == ISD::ZERO_EXTEND && OpcodeN0 == ISD::UDIVREM)))
34445 EVT VT = N->getValueType(0);
34446 EVT InVT = N0.getValueType();
34447 if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
34450 SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
34451 auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
34452 : X86ISD::UDIVREM8_ZEXT_HREG;
34453 SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
34455 DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
34456 return R.getValue(1);
34459 /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
34460 /// ZERO_EXTEND_VECTOR_INREG. This requires splitting (or concatenating
34461 /// with UNDEFs) the input into vectors of the same size as the target type,
34462 /// which then extend their lowest elements.
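/// A rough example: on an SSE4.1 target without AVX2, (v8i32 zext (v8i16 X))
/// is split into two ZERO_EXTEND_VECTOR_INREG nodes (one per v4i16 half,
/// widened with UNDEFs to v8i16) whose v4i32 results are concatenated.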
34463 static SDValue combineToExtendVectorInReg(SDNode *N, SelectionDAG &DAG,
34464 TargetLowering::DAGCombinerInfo &DCI,
34465 const X86Subtarget &Subtarget) {
34466 unsigned Opcode = N->getOpcode();
34467 if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND)
34469 if (!DCI.isBeforeLegalizeOps())
34471 if (!Subtarget.hasSSE2())
34474 SDValue N0 = N->getOperand(0);
34475 EVT VT = N->getValueType(0);
34476 EVT SVT = VT.getScalarType();
34477 EVT InVT = N0.getValueType();
34478 EVT InSVT = InVT.getScalarType();
34480 // Input type must be a vector and we must be extending legal integer types.
34481 if (!VT.isVector())
34483 if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16)
34485 if (InSVT != MVT::i32 && InSVT != MVT::i16 && InSVT != MVT::i8)
34488 // On AVX2+ targets, if the input/output types are both legal then we will be
34489 // able to use SIGN_EXTEND/ZERO_EXTEND directly.
34490 if (Subtarget.hasInt256() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
34491 DAG.getTargetLoweringInfo().isTypeLegal(InVT))
34496 auto ExtendVecSize = [&DAG](const SDLoc &DL, SDValue N, unsigned Size) {
34497 EVT InVT = N.getValueType();
34498 EVT OutVT = EVT::getVectorVT(*DAG.getContext(), InVT.getScalarType(),
34499 Size / InVT.getScalarSizeInBits());
34500 SmallVector<SDValue, 8> Opnds(Size / InVT.getSizeInBits(),
34501 DAG.getUNDEF(InVT));
34503 return DAG.getNode(ISD::CONCAT_VECTORS, DL, OutVT, Opnds);
34506 // If target-size is less than 128-bits, extend to a type that would extend
34507 // to 128 bits, extend that and extract the original target vector.
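// e.g. (v4i16 sext (v4i8 X)) is widened to a v8i8 input, sign-extended to
// v8i16, and the low v4i16 subvector is then extracted as the result.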
34508 if (VT.getSizeInBits() < 128 && !(128 % VT.getSizeInBits())) {
34509 unsigned Scale = 128 / VT.getSizeInBits();
34511 EVT::getVectorVT(*DAG.getContext(), SVT, 128 / SVT.getSizeInBits());
34512 SDValue Ex = ExtendVecSize(DL, N0, Scale * InVT.getSizeInBits());
34513 SDValue SExt = DAG.getNode(Opcode, DL, ExVT, Ex);
34514 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SExt,
34515 DAG.getIntPtrConstant(0, DL));
34518 // If target-size is 128-bits (or 256-bits on AVX2 target), then convert to
34519 // ISD::*_EXTEND_VECTOR_INREG which ensures lowering to X86ISD::V*EXT.
34520 // Also use this if we don't have SSE41 to allow the legalizer to do its job.
34521 if (!Subtarget.hasSSE41() || VT.is128BitVector() ||
34522 (VT.is256BitVector() && Subtarget.hasInt256()) ||
34523 (VT.is512BitVector() && Subtarget.hasAVX512())) {
34524 SDValue ExOp = ExtendVecSize(DL, N0, VT.getSizeInBits());
34525 return Opcode == ISD::SIGN_EXTEND
34526 ? DAG.getSignExtendVectorInReg(ExOp, DL, VT)
34527 : DAG.getZeroExtendVectorInReg(ExOp, DL, VT);
34530 auto SplitAndExtendInReg = [&](unsigned SplitSize) {
34531 unsigned NumVecs = VT.getSizeInBits() / SplitSize;
34532 unsigned NumSubElts = SplitSize / SVT.getSizeInBits();
34533 EVT SubVT = EVT::getVectorVT(*DAG.getContext(), SVT, NumSubElts);
34534 EVT InSubVT = EVT::getVectorVT(*DAG.getContext(), InSVT, NumSubElts);
34536 SmallVector<SDValue, 8> Opnds;
34537 for (unsigned i = 0, Offset = 0; i != NumVecs; ++i, Offset += NumSubElts) {
34538 SDValue SrcVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InSubVT, N0,
34539 DAG.getIntPtrConstant(Offset, DL));
34540 SrcVec = ExtendVecSize(DL, SrcVec, SplitSize);
34541 SrcVec = Opcode == ISD::SIGN_EXTEND
34542 ? DAG.getSignExtendVectorInReg(SrcVec, DL, SubVT)
34543 : DAG.getZeroExtendVectorInReg(SrcVec, DL, SubVT);
34544 Opnds.push_back(SrcVec);
34546 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Opnds);
34549 // On pre-AVX2 targets, split into 128-bit nodes of
34550 // ISD::*_EXTEND_VECTOR_INREG.
34551 if (!Subtarget.hasInt256() && !(VT.getSizeInBits() % 128))
34552 return SplitAndExtendInReg(128);
34554 // On pre-AVX512 targets, split into 256-bit nodes of
34555 // ISD::*_EXTEND_VECTOR_INREG.
34556 if (!Subtarget.hasAVX512() && !(VT.getSizeInBits() % 256))
34557 return SplitAndExtendInReg(256);
34562 static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
34563 TargetLowering::DAGCombinerInfo &DCI,
34564 const X86Subtarget &Subtarget) {
34565 SDValue N0 = N->getOperand(0);
34566 EVT VT = N->getValueType(0);
34567 EVT InVT = N0.getValueType();
34570 if (SDValue DivRem8 = getDivRem8(N, DAG))
34573 if (!DCI.isBeforeLegalizeOps()) {
34574 if (InVT == MVT::i1) {
34575 SDValue Zero = DAG.getConstant(0, DL, VT);
34576 SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
34577 return DAG.getSelect(DL, VT, N0, AllOnes, Zero);
34582 if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
34583 isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
34584 // Inverting and sign-extending a boolean is the same as zero-extending and
34585 // subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is efficiently
34586 // lowered with an LEA or a DEC. This is the same as: select Bool, 0, -1.
34587 // sext (xor Bool, -1) --> sub (zext Bool), 1
34588 SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
34589 return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
34592 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34595 if (Subtarget.hasAVX() && VT.is256BitVector())
34596 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34599 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34605 static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
34606 const X86Subtarget &Subtarget) {
34608 EVT VT = N->getValueType(0);
34610 // Let legalize expand this if it isn't a legal type yet.
34611 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
34614 EVT ScalarVT = VT.getScalarType();
34615 if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
34618 SDValue A = N->getOperand(0);
34619 SDValue B = N->getOperand(1);
34620 SDValue C = N->getOperand(2);
34622 auto invertIfNegative = [](SDValue &V) {
34623 if (SDValue NegVal = isFNEG(V.getNode())) {
34630 // Do not convert the passthru input of scalar intrinsics.
34631 // FIXME: We could allow negations of the lower element only.
34632 bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
34633 bool NegB = invertIfNegative(B);
34634 bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
34636 // The multiplication is negated when NegA xor NegB is true.
34637 bool NegMul = (NegA != NegB);
34639 unsigned NewOpcode;
34640 if (NegMul)
34641 NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
34642 else
34643 NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
34646 if (N->getOpcode() == X86ISD::FMADD_RND) {
34647 switch (NewOpcode) {
34648 case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
34649 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
34650 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
34651 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
34653 } else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
34654 switch (NewOpcode) {
34655 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
34656 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
34657 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
34658 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
34660 } else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
34661 switch (NewOpcode) {
34662 case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
34663 case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
34664 case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
34665 case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
34668 assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
34669 "Unexpected opcode!");
34670 return DAG.getNode(NewOpcode, dl, VT, A, B, C);
34673 return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
34676 static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
34677 TargetLowering::DAGCombinerInfo &DCI,
34678 const X86Subtarget &Subtarget) {
34679 // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
34680 // (and (i32 x86isd::setcc_carry), 1)
34681 // This eliminates the zext. This transformation is necessary because
34682 // ISD::SETCC is always legalized to i8.
34684 SDValue N0 = N->getOperand(0);
34685 EVT VT = N->getValueType(0);
34687 if (N0.getOpcode() == ISD::AND &&
34689 N0.getOperand(0).hasOneUse()) {
34690 SDValue N00 = N0.getOperand(0);
34691 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34692 if (!isOneConstant(N0.getOperand(1)))
34694 return DAG.getNode(ISD::AND, dl, VT,
34695 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34696 N00.getOperand(0), N00.getOperand(1)),
34697 DAG.getConstant(1, dl, VT));
34701 if (N0.getOpcode() == ISD::TRUNCATE &&
34703 N0.getOperand(0).hasOneUse()) {
34704 SDValue N00 = N0.getOperand(0);
34705 if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
34706 return DAG.getNode(ISD::AND, dl, VT,
34707 DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
34708 N00.getOperand(0), N00.getOperand(1)),
34709 DAG.getConstant(1, dl, VT));
34713 if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
34716 if (VT.is256BitVector())
34717 if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
34720 if (SDValue DivRem8 = getDivRem8(N, DAG))
34723 if (SDValue NewAdd = promoteExtBeforeAdd(N, DAG, Subtarget))
34726 if (SDValue R = combineOrCmpEqZeroToCtlzSrl(N, DAG, DCI, Subtarget))
34732 /// Try to map a 128-bit or larger integer comparison to vector instructions
34733 /// before type legalization splits it up into chunks.
34734 static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
34735 const X86Subtarget &Subtarget) {
34736 ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
34737 assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
34739 // We're looking for an oversized integer equality comparison, but ignore a
34740 // comparison with zero because that gets special treatment in EmitTest().
34741 SDValue X = SetCC->getOperand(0);
34742 SDValue Y = SetCC->getOperand(1);
34743 EVT OpVT = X.getValueType();
34744 unsigned OpSize = OpVT.getSizeInBits();
34745 if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
34748 // Bail out if we know that this is not really just an oversized integer.
34749 if (peekThroughBitcasts(X).getValueType() == MVT::f128 ||
34750 peekThroughBitcasts(Y).getValueType() == MVT::f128)
34753 // TODO: Use PXOR + PTEST for SSE4.1 or later?
34754 // TODO: Add support for AVX-512.
34755 EVT VT = SetCC->getValueType(0);
34757 if ((OpSize == 128 && Subtarget.hasSSE2()) ||
34758 (OpSize == 256 && Subtarget.hasAVX2())) {
34759 EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
34760 SDValue VecX = DAG.getBitcast(VecVT, X);
34761 SDValue VecY = DAG.getBitcast(VecVT, Y);
34763 // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
34764 // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
34765 // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
34766 // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
34767 // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
34768 SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
34769 SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
34770 SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
34772 return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
34778 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
34779 const X86Subtarget &Subtarget) {
34780 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
34781 SDValue LHS = N->getOperand(0);
34782 SDValue RHS = N->getOperand(1);
34783 EVT VT = N->getValueType(0);
34786 if (CC == ISD::SETNE || CC == ISD::SETEQ) {
34787 EVT OpVT = LHS.getValueType();
34788 // 0-x == y --> x+y == 0
34789 // 0-x != y --> x+y != 0
34790 if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
34792 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
34793 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34795 // x == 0-y --> x+y == 0
34796 // x != 0-y --> x+y != 0
34797 if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
34799 SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
34800 return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
34803 if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
34807 if (VT.getScalarType() == MVT::i1 &&
34808 (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
34809 bool IsSEXT0 =
34810 (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34811 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34812 bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34814 if (!IsSEXT0 || !IsVZero1) {
34815 // Swap the operands and update the condition code.
34816 std::swap(LHS, RHS);
34817 CC = ISD::getSetCCSwappedOperands(CC);
34819 IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
34820 (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
34821 IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
34824 if (IsSEXT0 && IsVZero1) {
34825 assert(VT == LHS.getOperand(0).getValueType() &&
34826 "Uexpected operand type");
34827 if (CC == ISD::SETGT)
34828 return DAG.getConstant(0, DL, VT);
34829 if (CC == ISD::SETLE)
34830 return DAG.getConstant(1, DL, VT);
34831 if (CC == ISD::SETEQ || CC == ISD::SETGE)
34832 return DAG.getNOT(DL, LHS.getOperand(0), VT);
34834 assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
34835 "Unexpected condition code!");
34836 return LHS.getOperand(0);
34840 // For an SSE1-only target, lower a comparison of v4f32 to X86ISD::CMPP early
34841 // to avoid scalarization via legalization because v4i32 is not a legal type.
34842 if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32 &&
34843 LHS.getValueType() == MVT::v4f32)
34844 return LowerVSETCC(SDValue(N, 0), Subtarget, DAG);
34849 static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
34851 // Gather and Scatter instructions use k-registers for masks. The type of
34852 // the masks is v*i1. So the mask will be truncated anyway.
34853 // The SIGN_EXTEND_INREG may be dropped.
34854 SDValue Mask = N->getOperand(2);
34855 if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
34856 SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
34857 NewOps[2] = Mask.getOperand(0);
34858 DAG.UpdateNodeOperands(N, NewOps);
34863 // Optimize RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
34864 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
34865 const X86Subtarget &Subtarget) {
34867 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
34868 SDValue EFLAGS = N->getOperand(1);
34870 // Try to simplify the EFLAGS and condition code operands.
34871 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
34872 return getSETCC(CC, Flags, DL, DAG);
34877 /// Optimize branch condition evaluation.
34878 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
34879 const X86Subtarget &Subtarget) {
34881 SDValue EFLAGS = N->getOperand(3);
34882 X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(2));
34884 // Try to simplify the EFLAGS and condition code operands.
34885 // Make sure to not keep references to operands, as combineSetCCEFLAGS can
34886 // RAUW them under us.
34887 if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
34888 SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
34889 return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
34890 N->getOperand(1), Cond, Flags);
34896 static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
34897 SelectionDAG &DAG) {
34898 // Take advantage of vector comparisons producing 0 or -1 in each lane to
34899 // optimize away operation when it's from a constant.
34901 // The general transformation is:
34902 // UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
34903 // AND(VECTOR_CMP(x,y), constant2)
34904 // constant2 = UNARYOP(constant)
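// For instance (illustrative), sint_to_fp (and (setcc X, Y), <i32 1,1,1,1>)
// becomes and (setcc X, Y), bitcast(<float 1.0,1.0,1.0,1.0>): lanes where the
// compare is false stay all-zero, which is also the bit pattern of 0.0f.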
34906 // Early exit if this isn't a vector operation, the operand of the
34907 // unary operation isn't a bitwise AND, or if the sizes of the operations
34908 // aren't the same.
34909 EVT VT = N->getValueType(0);
34910 if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
34911 N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
34912 VT.getSizeInBits() != N->getOperand(0)->getValueType(0).getSizeInBits())
34915 // Now check that the other operand of the AND is a constant. We could
34916 // make the transformation for non-constant splats as well, but it's unclear
34917 // that would be a benefit as it would not eliminate any operations, just
34918 // perform one more step in scalar code before moving to the vector unit.
34919 if (BuildVectorSDNode *BV =
34920 dyn_cast<BuildVectorSDNode>(N->getOperand(0)->getOperand(1))) {
34921 // Bail out if the vector isn't a constant.
34922 if (!BV->isConstant())
34925 // Everything checks out. Build up the new and improved node.
34927 EVT IntVT = BV->getValueType(0);
34928 // Create a new constant of the appropriate type for the transformed
34930 SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
34931 // The AND node needs bitcasts to/from an integer vector type around it.
34932 SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
34933 SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
34934 N->getOperand(0)->getOperand(0), MaskConst);
34935 SDValue Res = DAG.getBitcast(VT, NewAnd);
34942 static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
34943 const X86Subtarget &Subtarget) {
34944 SDValue Op0 = N->getOperand(0);
34945 EVT VT = N->getValueType(0);
34946 EVT InVT = Op0.getValueType();
34947 EVT InSVT = InVT.getScalarType();
34948 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
34950 // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
34951 // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
34952 if (InVT.isVector() && (InSVT == MVT::i8 || InSVT == MVT::i16)) {
34954 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34955 InVT.getVectorNumElements());
34956 SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
34958 if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
34959 return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
34961 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34964 // Since UINT_TO_FP is legal (it's marked custom), the DAG combiner won't
34965 // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
34966 // the optimization here.
34967 if (DAG.SignBitIsZero(Op0))
34968 return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
34973 static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
34974 const X86Subtarget &Subtarget) {
34975 // First try to optimize away the conversion entirely when it's
34976 // conditionally from a constant. Vectors only.
34977 if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
34980 // Now move on to more general possibilities.
34981 SDValue Op0 = N->getOperand(0);
34982 EVT VT = N->getValueType(0);
34983 EVT InVT = Op0.getValueType();
34984 EVT InSVT = InVT.getScalarType();
34986 // SINT_TO_FP(vXi1) -> SINT_TO_FP(SEXT(vXi1 to vXi32))
34987 // SINT_TO_FP(vXi8) -> SINT_TO_FP(SEXT(vXi8 to vXi32))
34988 // SINT_TO_FP(vXi16) -> SINT_TO_FP(SEXT(vXi16 to vXi32))
34989 if (InVT.isVector() &&
34990 (InSVT == MVT::i8 || InSVT == MVT::i16 ||
34991 (InSVT == MVT::i1 && !DAG.getTargetLoweringInfo().isTypeLegal(InVT)))) {
34993 EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
34994 InVT.getVectorNumElements());
34995 SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
34996 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
34999 // Without AVX512DQ we only support i64 to float scalar conversion. For both
35000 // vectors and scalars, see if we know that the upper bits are all the sign
35001 // bit, in which case we can truncate the input to i32 and convert from that.
35002 if (InVT.getScalarSizeInBits() > 32 && !Subtarget.hasDQI()) {
35003 unsigned BitWidth = InVT.getScalarSizeInBits();
35004 unsigned NumSignBits = DAG.ComputeNumSignBits(Op0);
35005 if (NumSignBits >= (BitWidth - 31)) {
35006 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), 32);
35007 if (InVT.isVector())
35008 TruncVT = EVT::getVectorVT(*DAG.getContext(), TruncVT,
35009 InVT.getVectorNumElements());
35011 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
35012 return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
35016 // Transform (SINT_TO_FP (i64 ...)) into an x87 operation if we have
35017 // a 32-bit target where SSE doesn't support i64->FP operations.
35018 if (!Subtarget.useSoftFloat() && Op0.getOpcode() == ISD::LOAD) {
35019 LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
35020 EVT LdVT = Ld->getValueType(0);
35022 // This transformation is not supported if the result type is f16 or f128.
35023 if (VT == MVT::f16 || VT == MVT::f128)
35026 if (!Ld->isVolatile() && !VT.isVector() &&
35027 ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
35028 !Subtarget.is64Bit() && LdVT == MVT::i64) {
35029 SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
35030 SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
35031 DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
35038 static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
35039 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35040 MVT VT = N->getSimpleValueType(0);
35041 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35042 return DAG.getNode(X86ISD::SBB, SDLoc(N), VTs,
35043 N->getOperand(0), N->getOperand(1),
35050 // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
35051 static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
35052 X86TargetLowering::DAGCombinerInfo &DCI) {
35053 // If the LHS and RHS of the ADC node are zero, then it can't overflow and
35054 // the result is either zero or one (depending on the input carry bit).
35055 // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
35056 if (X86::isZeroNode(N->getOperand(0)) &&
35057 X86::isZeroNode(N->getOperand(1)) &&
35058 // We don't have a good way to replace an EFLAGS use, so only do this when
35060 SDValue(N, 1).use_empty()) {
35062 EVT VT = N->getValueType(0);
35063 SDValue CarryOut = DAG.getConstant(0, DL, N->getValueType(1));
35064 SDValue Res1 = DAG.getNode(ISD::AND, DL, VT,
35065 DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35066 DAG.getConstant(X86::COND_B, DL,
35069 DAG.getConstant(1, DL, VT));
35070 return DCI.CombineTo(N, Res1, CarryOut);
35073 if (SDValue Flags = combineCarryThroughADD(N->getOperand(2))) {
35074 MVT VT = N->getSimpleValueType(0);
35075 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35076 return DAG.getNode(X86ISD::ADC, SDLoc(N), VTs,
35077 N->getOperand(0), N->getOperand(1),
35084 /// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
35085 /// which is more useful than 0/1 in some cases.
35086 static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
35088 // "Condition code B" is also known as "the carry flag" (CF).
35089 SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
35090 SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
35091 MVT VT = N->getSimpleValueType(0);
35093 return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
35095 assert(VT == MVT::i1 && "Unexpected type for SETCC node");
35096 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
35099 /// If this is an add or subtract where one operand is produced by a cmp+setcc,
35100 /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
35101 /// with CMP+{ADC, SBB}.
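/// For example, 'X + (Z != 0)' can be emitted as 'cmp Z, 1' followed by
/// 'sbb X, -1', using the carry produced by the compare instead of a setcc.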
35102 static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
35103 bool IsSub = N->getOpcode() == ISD::SUB;
35104 SDValue X = N->getOperand(0);
35105 SDValue Y = N->getOperand(1);
35107 // If this is an add, canonicalize a zext operand to the RHS.
35108 // TODO: Incomplete? What if both sides are zexts?
35109 if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
35110 Y.getOpcode() != ISD::ZERO_EXTEND)
35113 // Look through a one-use zext.
35114 bool PeekedThroughZext = false;
35115 if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
35116 Y = Y.getOperand(0);
35117 PeekedThroughZext = true;
35120 // If this is an add, canonicalize a setcc operand to the RHS.
35121 // TODO: Incomplete? What if both sides are setcc?
35122 // TODO: Should we allow peeking through a zext of the other operand?
35123 if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
35124 Y.getOpcode() != X86ISD::SETCC)
35127 if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
35131 EVT VT = N->getValueType(0);
35132 X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
35134 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35135 // the general case below.
35136 auto *ConstantX = dyn_cast<ConstantSDNode>(X);
35138 if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) ||
35139 (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) {
35140 // This is a complicated way to get -1 or 0 from the carry flag:
35141 // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35142 // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax
35143 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35144 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35148 if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) ||
35149 (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) {
35150 SDValue EFLAGS = Y->getOperand(1);
35151 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35152 EFLAGS.getValueType().isInteger() &&
35153 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35154 // Swap the operands of a SUB, and we have the same pattern as above.
35155 // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB
35156 // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB
35157 SDValue NewSub = DAG.getNode(
35158 X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
35159 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35160 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35161 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35162 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35168 if (CC == X86::COND_B) {
35169 // X + SETB Z --> X + (mask SBB Z, Z)
35170 // X - SETB Z --> X - (mask SBB Z, Z)
35171 // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
35172 SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
35173 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35174 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35175 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35178 if (CC == X86::COND_A) {
35179 SDValue EFLAGS = Y->getOperand(1);
35180 // Try to convert COND_A into COND_B in an attempt to facilitate
35181 // materializing "setb reg".
35183 // Do not flip "e > c", where "c" is a constant, because the Cmp instruction
35184 // cannot take an immediate as its first operand.
35186 if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
35187 EFLAGS.getValueType().isInteger() &&
35188 !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
35189 SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
35190 EFLAGS.getNode()->getVTList(),
35191 EFLAGS.getOperand(1), EFLAGS.getOperand(0));
35192 SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
35193 SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
35194 if (SBB.getValueSizeInBits() != VT.getSizeInBits())
35195 SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
35196 return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
35200 if (CC != X86::COND_E && CC != X86::COND_NE)
35203 SDValue Cmp = Y.getOperand(1);
35204 if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
35205 !X86::isZeroNode(Cmp.getOperand(1)) ||
35206 !Cmp.getOperand(0).getValueType().isInteger())
35209 SDValue Z = Cmp.getOperand(0);
35210 EVT ZVT = Z.getValueType();
35212 // If X is -1 or 0, then we have an opportunity to avoid constants required in
35213 // the general case below.
35215 // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with
35217 // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z)
35218 // -1 + (Z == 0) --> sbb %eax, %eax, (neg Z)
35219 if ((IsSub && CC == X86::COND_NE && ConstantX->isNullValue()) ||
35220 (!IsSub && CC == X86::COND_E && ConstantX->isAllOnesValue())) {
35221 SDValue Zero = DAG.getConstant(0, DL, ZVT);
35222 SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
35223 SDValue Neg = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, Z);
35224 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35225 DAG.getConstant(X86::COND_B, DL, MVT::i8),
35226 SDValue(Neg.getNode(), 1));
35229 // cmp with 1 sets the carry flag when Z == 0, so create 0 or -1 using 'sbb'
35230 // with fake operands:
35231 // 0 - (Z == 0) --> sbb %eax, %eax, (cmp Z, 1)
35232 // -1 + (Z != 0) --> sbb %eax, %eax, (cmp Z, 1)
35233 if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
35234 (!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
35235 SDValue One = DAG.getConstant(1, DL, ZVT);
35236 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35237 return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
35238 DAG.getConstant(X86::COND_B, DL, MVT::i8), Cmp1);
35242 // (cmp Z, 1) sets the carry flag if Z is 0.
35243 SDValue One = DAG.getConstant(1, DL, ZVT);
35244 SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
35246 // Add the flags type for ADC/SBB nodes.
35247 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
35249 // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
35250 // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
35251 if (CC == X86::COND_NE)
35252 return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
35253 DAG.getConstant(-1ULL, DL, VT), Cmp1);
35255 // X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
35256 // X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
35257 return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
35258 DAG.getConstant(0, DL, VT), Cmp1);
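/// Attempt to replace a vector-reduction add of a narrowable multiply with
/// X86ISD::VPMADDWD. Roughly: add(mul(X, Y), Phi), where X and Y fit in i16,
/// is rewritten as add(concat(vpmaddwd(trunc X, trunc Y), 0), Phi).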
35261 static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
35262 const X86Subtarget &Subtarget) {
35263 SDValue MulOp = N->getOperand(0);
35264 SDValue Phi = N->getOperand(1);
35266 if (MulOp.getOpcode() != ISD::MUL)
35267 std::swap(MulOp, Phi);
35268 if (MulOp.getOpcode() != ISD::MUL)
35272 if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode) || Mode == MULU16)
35275 EVT VT = N->getValueType(0);
35277 unsigned RegSize = 128;
35278 if (Subtarget.hasBWI())
35280 else if (Subtarget.hasAVX2())
35282 unsigned VectorSize = VT.getVectorNumElements() * 16;
35283 // If the vector size is less than 128, or greater than the supported RegSize,
35284 // do not use PMADD.
35285 if (VectorSize < 128 || VectorSize > RegSize)
35289 EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
35290 VT.getVectorNumElements());
35291 EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
35292 VT.getVectorNumElements() / 2);
35294 // Shrink the operands of mul.
35295 SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
35296 SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
35298 // Madd vector size is half of the original vector size
35299 SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
35300 // Fill the rest of the output with 0
35301 SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
35302 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
35303 return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
35306 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
35307 const X86Subtarget &Subtarget) {
35309 EVT VT = N->getValueType(0);
35310 SDValue Op0 = N->getOperand(0);
35311 SDValue Op1 = N->getOperand(1);
35313 // TODO: There's nothing special about i32, any integer type above i16 should
35314 // work just as well.
35315 if (!VT.isVector() || !VT.isSimple() ||
35316 !(VT.getVectorElementType() == MVT::i32))
35319 unsigned RegSize = 128;
35320 if (Subtarget.hasBWI())
35321 RegSize = 512;
35322 else if (Subtarget.hasAVX2())
35323 RegSize = 256;
35325 // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
35326 // TODO: We should be able to handle larger vectors by splitting them before
35327 // feeding them into several SADs, and then reducing over those.
35328 if (VT.getSizeInBits() / 4 > RegSize)
35331 // We know N is a reduction add, which means one of its operands is a phi.
35332 // To match SAD, we need the other operand to be a vector select.
35333 SDValue SelectOp, Phi;
35334 if (Op0.getOpcode() == ISD::VSELECT) {
35335 SelectOp = Op0;
35336 Phi = Op1;
35337 } else if (Op1.getOpcode() == ISD::VSELECT) {
35338 SelectOp = Op1;
35339 Phi = Op0;
35340 } else
35341 return SDValue();
35343 // Check whether we have an abs-diff pattern feeding into the select.
35344 if (!detectZextAbsDiff(SelectOp, Op0, Op1))
35345 return SDValue();
35347 // SAD pattern detected. Now build a SAD instruction and an addition for
35348 // reduction. Note that the number of elements of the result of SAD is less
35349 // than the number of elements of its input. Therefore, we can only update
35350 // part of the elements in the reduction vector.
35351 SDValue Sad = createPSADBW(DAG, Op0, Op1, DL);
35353 // The output of PSADBW is a vector of i64.
35354 // We need to turn the vector of i64 into a vector of i32.
35355 // If the reduction vector is at least as wide as the psadbw result, just
35356 // bitcast. If it's narrower, truncate - the high i32 of each i64 is zero
35357 // anyway.
35358 MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
35359 if (VT.getSizeInBits() >= ResVT.getSizeInBits())
35360 Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
35361 else
35362 Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
35364 if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
35365 // Update part of elements of the reduction vector. This is done by first
35366 // extracting a sub-vector from it, updating this sub-vector, and inserting
35368 SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
35369 DAG.getIntPtrConstant(0, DL));
35370 SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
35371 return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
35372 DAG.getIntPtrConstant(0, DL));
35374 return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
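// Editor's note (illustrative sketch, not from the original source): a v16i32
// reduction of zext(abs(sub(a, b))) over v16i8 inputs becomes a single PSADBW
// whose v2i64 result is reinterpreted as i32 lanes and added into the low
// lanes of the reduction phi via the extract/insert_subvector path above.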
35377 /// Convert vector increment or decrement to sub/add with an all-ones constant:
35378 /// add X, <1, 1...> --> sub X, <-1, -1...>
35379 /// sub X, <1, 1...> --> add X, <-1, -1...>
35380 /// The all-ones vector constant can be materialized using a pcmpeq instruction
35381 /// that is commonly recognized as an idiom (has no register dependency), so
35382 /// that's better/smaller than loading a splat 1 constant.
35383 static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
35384 assert((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
35385 "Unexpected opcode for increment/decrement transform");
35387 // Pseudo-legality check: getOnesVector() expects one of these types, so bail
35388 // out and wait for legalization if we have an unsupported vector length.
35389 EVT VT = N->getValueType(0);
35390 if (!VT.is128BitVector() && !VT.is256BitVector() && !VT.is512BitVector())
35391 return SDValue();
35393 SDNode *N1 = N->getOperand(1).getNode();
35394 APInt SplatVal;
35395 if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) ||
35396 !SplatVal.isOneValue())
35397 return SDValue();
35399 SDValue AllOnesVec = getOnesVector(VT, DAG, SDLoc(N));
35400 unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
35401 return DAG.getNode(NewOpcode, SDLoc(N), VT, N->getOperand(0), AllOnesVec);
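// Editor's note (illustrative, not from the original source): e.g. a v4i32
// 'add X, <1,1,1,1>' is rewritten to 'sub X, <-1,-1,-1,-1>', where the
// all-ones operand can be materialized with 'pcmpeqd %xmm1, %xmm1' instead of
// loading a splat-1 constant.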
35404 static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
35405 const X86Subtarget &Subtarget) {
35406 const SDNodeFlags Flags = N->getFlags();
35407 if (Flags.hasVectorReduction()) {
35408 if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
35409 return Sad;
35410 if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
35411 return MAdd;
35412 }
35413 EVT VT = N->getValueType(0);
35414 SDValue Op0 = N->getOperand(0);
35415 SDValue Op1 = N->getOperand(1);
35417 // Try to synthesize horizontal adds from adds of shuffles.
35418 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35419 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35420 isHorizontalBinOp(Op0, Op1, true))
35421 return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
35423 if (SDValue V = combineIncDecVector(N, DAG))
35424 return V;
35426 return combineAddOrSubToADCOrSBB(N, DAG);
35429 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
35430 const X86Subtarget &Subtarget) {
35431 SDValue Op0 = N->getOperand(0);
35432 SDValue Op1 = N->getOperand(1);
35434 // X86 can't encode an immediate LHS of a sub. See if we can push the
35435 // negation into a preceding instruction.
35436 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op0)) {
35437 // If the RHS of the sub is an XOR with one use and a constant, invert the
35438 // immediate. Then add one to the LHS of the sub so we can turn
35439 // X-Y -> X+~Y+1, saving one register.
35440 if (Op1->hasOneUse() && Op1.getOpcode() == ISD::XOR &&
35441 isa<ConstantSDNode>(Op1.getOperand(1))) {
35442 APInt XorC = cast<ConstantSDNode>(Op1.getOperand(1))->getAPIntValue();
35443 EVT VT = Op0.getValueType();
35444 SDValue NewXor = DAG.getNode(ISD::XOR, SDLoc(Op1), VT,
35445 Op1->getOperand(0),
35446 DAG.getConstant(~XorC, SDLoc(Op1), VT));
35447 return DAG.getNode(ISD::ADD, SDLoc(N), VT, NewXor,
35448 DAG.getConstant(C->getAPIntValue() + 1, SDLoc(N), VT));
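// Editor's note (worked example, not from the original source): with C == 5
// and Op1 == (x ^ 3), '5 - (x ^ 3)' becomes '(x ^ ~3) + 6', using the identity
// -Y == ~Y + 1 so no immediate has to sit on the LHS of the sub.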
35452 // Try to synthesize horizontal subs from subs of shuffles.
35453 EVT VT = N->getValueType(0);
35454 if (((Subtarget.hasSSSE3() && (VT == MVT::v8i16 || VT == MVT::v4i32)) ||
35455 (Subtarget.hasInt256() && (VT == MVT::v16i16 || VT == MVT::v8i32))) &&
35456 isHorizontalBinOp(Op0, Op1, false))
35457 return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
35459 if (SDValue V = combineIncDecVector(N, DAG))
35460 return V;
35462 return combineAddOrSubToADCOrSBB(N, DAG);
35465 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
35466 TargetLowering::DAGCombinerInfo &DCI,
35467 const X86Subtarget &Subtarget) {
35468 if (DCI.isBeforeLegalize())
35469 return SDValue();
35471 SDLoc DL(N);
35472 unsigned Opcode = N->getOpcode();
35473 MVT VT = N->getSimpleValueType(0);
35474 MVT SVT = VT.getVectorElementType();
35475 unsigned NumElts = VT.getVectorNumElements();
35476 unsigned EltSizeInBits = SVT.getSizeInBits();
35478 SDValue Op = N->getOperand(0);
35479 MVT OpVT = Op.getSimpleValueType();
35480 MVT OpEltVT = OpVT.getVectorElementType();
35481 unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
35482 unsigned InputBits = OpEltSizeInBits * NumElts;
35484 // Perform any constant folding.
35485 // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
35486 APInt UndefElts;
35487 SmallVector<APInt, 64> EltBits;
35488 if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
35489 APInt Undefs(NumElts, 0);
35490 SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
35491 bool IsZEXT =
35492 (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
35493 for (unsigned i = 0; i != NumElts; ++i) {
35494 if (UndefElts[i]) {
35495 Undefs.setBit(i);
35496 continue;
35497 }
35498 Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
35499 : EltBits[i].sextOrTrunc(EltSizeInBits);
35501 return getConstVector(Vals, Undefs, VT, DAG, DL);
35504 // (vzext (bitcast (vzext x))) -> (vzext x)
35505 // TODO: (vsext (bitcast (vsext x))) -> (vsext x)
35506 SDValue V = peekThroughBitcasts(Op);
35507 if (Opcode == X86ISD::VZEXT && V != Op && V.getOpcode() == X86ISD::VZEXT) {
35508 MVT InnerVT = V.getSimpleValueType();
35509 MVT InnerEltVT = InnerVT.getVectorElementType();
35511 // If the element sizes match exactly, we can just do one larger vzext. This
35512 // is always an exact type match as vzext operates on integer types.
35513 if (OpEltVT == InnerEltVT) {
35514 assert(OpVT == InnerVT && "Types must match for vzext!");
35515 return DAG.getNode(X86ISD::VZEXT, DL, VT, V.getOperand(0));
35518 // The only other way we can combine them is if only a single element of the
35519 // inner vzext is used in the input to the outer vzext.
35520 if (InnerEltVT.getSizeInBits() < InputBits)
35521 return SDValue();
35523 // In this case, the inner vzext is completely dead because we're going to
35524 // only look at bits inside of the low element. Just do the outer vzext on
35525 // a bitcast of the input to the inner.
35526 return DAG.getNode(X86ISD::VZEXT, DL, VT, DAG.getBitcast(OpVT, V));
35529 // Check if we can bypass extracting and re-inserting an element of an input
35530 // vector. Essentially:
35531 // (bitcast (scalar_to_vector (extract_vector_elt x))) -> (bitcast x)
35532 // TODO: Add X86ISD::VSEXT support
35533 if (Opcode == X86ISD::VZEXT &&
35534 V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
35535 V.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
35536 V.getOperand(0).getSimpleValueType().getSizeInBits() == InputBits) {
35537 SDValue ExtractedV = V.getOperand(0);
35538 SDValue OrigV = ExtractedV.getOperand(0);
35539 if (isNullConstant(ExtractedV.getOperand(1))) {
35540 MVT OrigVT = OrigV.getSimpleValueType();
35541 // Extract a subvector if necessary...
35542 if (OrigVT.getSizeInBits() > OpVT.getSizeInBits()) {
35543 int Ratio = OrigVT.getSizeInBits() / OpVT.getSizeInBits();
35544 OrigVT = MVT::getVectorVT(OrigVT.getVectorElementType(),
35545 OrigVT.getVectorNumElements() / Ratio);
35546 OrigV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigVT, OrigV,
35547 DAG.getIntPtrConstant(0, DL));
35549 Op = DAG.getBitcast(OpVT, OrigV);
35550 return DAG.getNode(X86ISD::VZEXT, DL, VT, Op);
35557 /// Canonicalize (LSUB p, 1) -> (LADD p, -1).
35558 static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
35559 const X86Subtarget &Subtarget) {
35560 SDValue Chain = N->getOperand(0);
35561 SDValue LHS = N->getOperand(1);
35562 SDValue RHS = N->getOperand(2);
35563 MVT VT = RHS.getSimpleValueType();
35564 SDLoc DL(N);
35566 auto *C = dyn_cast<ConstantSDNode>(RHS);
35567 if (!C || C->getZExtValue() != 1)
35568 return SDValue();
35570 RHS = DAG.getConstant(-1, DL, VT);
35571 MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
35572 return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
35573 DAG.getVTList(MVT::i32, MVT::Other),
35574 {Chain, LHS, RHS}, VT, MMO);
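// Editor's note (illustrative, not from the original source): e.g. an atomic
// 'lock sub $1, (%rdi)' is rewritten here to the equivalent 'lock add $-1,
// (%rdi)' (an LADD node), matching the canonicalization described above.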
35577 // TEST (AND a, b), (AND a, b) -> TEST a, b
35578 static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
35579 SDValue Op0 = N->getOperand(0);
35580 SDValue Op1 = N->getOperand(1);
35582 if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
35583 return SDValue();
35585 EVT VT = N->getValueType(0);
35586 SDLoc DL(N);
35588 return DAG.getNode(X86ISD::TESTM, DL, VT,
35589 Op0->getOperand(0), Op0->getOperand(1));
35592 static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
35593 const X86Subtarget &Subtarget) {
35594 MVT VT = N->getSimpleValueType(0);
35595 SDLoc DL(N);
35597 if (N->getOperand(0) == N->getOperand(1)) {
35598 if (N->getOpcode() == X86ISD::PCMPEQ)
35599 return getOnesVector(VT, DAG, DL);
35600 if (N->getOpcode() == X86ISD::PCMPGT)
35601 return getZeroVector(VT, Subtarget, DAG, DL);
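// Editor's note (illustrative, not from the original source): comparing a
// vector against itself needs no compare at all: 'pcmpeq X, X' is the all-ones
// vector and 'pcmpgt X, X' is the zero vector, which is what is returned above.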
35607 static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
35608 TargetLowering::DAGCombinerInfo &DCI,
35609 const X86Subtarget &Subtarget) {
35610 if (DCI.isBeforeLegalizeOps())
35611 return SDValue();
35613 SDLoc dl(N);
35614 SDValue Vec = N->getOperand(0);
35615 SDValue SubVec = N->getOperand(1);
35616 SDValue Idx = N->getOperand(2);
35618 unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
35619 MVT OpVT = N->getSimpleValueType(0);
35620 MVT SubVecVT = SubVec.getSimpleValueType();
35622 // If this is an insert of an extract, combine to a shuffle. Don't do this
35623 // if the insert or extract can be represented with a subvector operation.
35624 if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
35625 SubVec.getOperand(0).getSimpleValueType() == OpVT &&
35626 (IdxVal != 0 || !Vec.isUndef())) {
35627 int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
35628 if (ExtIdxVal != 0) {
35629 int VecNumElts = OpVT.getVectorNumElements();
35630 int SubVecNumElts = SubVecVT.getVectorNumElements();
35631 SmallVector<int, 64> Mask(VecNumElts);
35632 // First create an identity shuffle mask.
35633 for (int i = 0; i != VecNumElts; ++i)
35634 Mask[i] = i;
35635 // Now insert the extracted portion.
35636 for (int i = 0; i != SubVecNumElts; ++i)
35637 Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
35639 return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
35643 // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
35644 // load:
35645 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35646 //                   (load16 addr + 16), Elts/2)
35647 // --> load32 addr
35648 // or:
35649 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35650 //                   (load32 addr + 32), Elts/2)
35651 // --> load64 addr
35652 // or a 16-byte or 32-byte broadcast:
35653 // (insert_subvector (insert_subvector undef, (load16 addr), 0),
35654 //                   (load16 addr), Elts/2)
35655 // --> X86SubVBroadcast(load16 addr)
35656 // or:
35657 // (insert_subvector (insert_subvector undef, (load32 addr), 0),
35658 //                   (load32 addr), Elts/2)
35659 // --> X86SubVBroadcast(load32 addr)
35660 if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
35661 Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
35662 OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
35663 auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
35664 if (Idx2 && Idx2->getZExtValue() == 0) {
35665 SDValue SubVec2 = Vec.getOperand(1);
35666 // If needed, look through bitcasts to get to the load.
35667 if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
35668 bool Fast;
35669 unsigned Alignment = FirstLd->getAlignment();
35670 unsigned AS = FirstLd->getAddressSpace();
35671 const X86TargetLowering *TLI = Subtarget.getTargetLowering();
35672 if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
35673 OpVT, AS, Alignment, &Fast) && Fast) {
35674 SDValue Ops[] = {SubVec2, SubVec};
35675 if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG,
35680 // If lower/upper loads are the same and the only users of the load, then
35681 // lower to a VBROADCASTF128/VBROADCASTI128/etc.
35682 if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
35683 if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
35684 SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
35685 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
35688 // If this is a subv_broadcast insert into both halves, use a larger
35689 // subv_broadcast.
35690 if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
35691 return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
35692 SubVec.getOperand(0));
35701 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
35702 DAGCombinerInfo &DCI) const {
35703 SelectionDAG &DAG = DCI.DAG;
35704 switch (N->getOpcode()) {
35705 default: break;
35706 case ISD::EXTRACT_VECTOR_ELT:
35707 return combineExtractVectorElt(N, DAG, DCI, Subtarget);
35708 case X86ISD::PEXTRW:
35709 case X86ISD::PEXTRB:
35710 return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
35711 case ISD::INSERT_SUBVECTOR:
35712 return combineInsertSubvector(N, DAG, DCI, Subtarget);
35713 case ISD::VSELECT:
35714 case ISD::SELECT:
35715 case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
35716 case ISD::BITCAST: return combineBitcast(N, DAG, DCI, Subtarget);
35717 case X86ISD::CMOV: return combineCMov(N, DAG, DCI, Subtarget);
35718 case ISD::ADD: return combineAdd(N, DAG, Subtarget);
35719 case ISD::SUB: return combineSub(N, DAG, Subtarget);
35720 case X86ISD::SBB: return combineSBB(N, DAG);
35721 case X86ISD::ADC: return combineADC(N, DAG, DCI);
35722 case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
35723 case ISD::SHL:
35724 case ISD::SRA:
35725 case ISD::SRL: return combineShift(N, DAG, DCI, Subtarget);
35726 case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
35727 case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
35728 case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
35729 case ISD::LOAD: return combineLoad(N, DAG, DCI, Subtarget);
35730 case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
35731 case ISD::STORE: return combineStore(N, DAG, Subtarget);
35732 case ISD::MSTORE: return combineMaskedStore(N, DAG, Subtarget);
35733 case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, Subtarget);
35734 case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
35735 case ISD::FADD:
35736 case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
35737 case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
35738 case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
35739 case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
35740 case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
35741 case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
35742 case X86ISD::FXOR:
35743 case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
35744 case X86ISD::FMIN:
35745 case X86ISD::FMAX: return combineFMinFMax(N, DAG);
35746 case ISD::FMINNUM:
35747 case ISD::FMAXNUM: return combineFMinNumFMaxNum(N, DAG, Subtarget);
35748 case X86ISD::BT: return combineBT(N, DAG, DCI);
35749 case ISD::ANY_EXTEND:
35750 case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
35751 case ISD::SIGN_EXTEND: return combineSext(N, DAG, DCI, Subtarget);
35752 case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
35753 case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
35754 case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
35755 case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
35756 case X86ISD::VSHLI:
35757 case X86ISD::VSRAI:
35758 case X86ISD::VSRLI:
35759 return combineVectorShiftImm(N, DAG, DCI, Subtarget);
35760 case ISD::SIGN_EXTEND_VECTOR_INREG:
35761 case ISD::ZERO_EXTEND_VECTOR_INREG:
35762 case X86ISD::VSEXT:
35763 case X86ISD::VZEXT: return combineVSZext(N, DAG, DCI, Subtarget);
35764 case X86ISD::PINSRB:
35765 case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
35766 case X86ISD::SHUFP: // Handle all target specific shuffles
35767 case X86ISD::INSERTPS:
35768 case X86ISD::EXTRQI:
35769 case X86ISD::INSERTQI:
35770 case X86ISD::PALIGNR:
35771 case X86ISD::VSHLDQ:
35772 case X86ISD::VSRLDQ:
35773 case X86ISD::BLENDI:
35774 case X86ISD::UNPCKH:
35775 case X86ISD::UNPCKL:
35776 case X86ISD::MOVHLPS:
35777 case X86ISD::MOVLHPS:
35778 case X86ISD::PSHUFB:
35779 case X86ISD::PSHUFD:
35780 case X86ISD::PSHUFHW:
35781 case X86ISD::PSHUFLW:
35782 case X86ISD::MOVSHDUP:
35783 case X86ISD::MOVSLDUP:
35784 case X86ISD::MOVDDUP:
35785 case X86ISD::MOVSS:
35786 case X86ISD::MOVSD:
35787 case X86ISD::VPPERM:
35788 case X86ISD::VPERMI:
35789 case X86ISD::VPERMV:
35790 case X86ISD::VPERMV3:
35791 case X86ISD::VPERMIV3:
35792 case X86ISD::VPERMIL2:
35793 case X86ISD::VPERMILPI:
35794 case X86ISD::VPERMILPV:
35795 case X86ISD::VPERM2X128:
35796 case X86ISD::VZEXT_MOVL:
35797 case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI, Subtarget);
35798 case X86ISD::FMADD:
35799 case X86ISD::FMADD_RND:
35800 case X86ISD::FMADDS1_RND:
35801 case X86ISD::FMADDS3_RND:
35802 case ISD::FMA: return combineFMA(N, DAG, Subtarget);
35803 case ISD::MGATHER:
35804 case ISD::MSCATTER: return combineGatherScatter(N, DAG);
35805 case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
35806 case X86ISD::TESTM: return combineTestM(N, DAG);
35807 case X86ISD::PCMPEQ:
35808 case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
35814 /// Return true if the target has native support for the specified value type
35815 /// and it is 'desirable' to use the type for the given node type. e.g. On x86
35816 /// i16 is legal, but undesirable since i16 instruction encodings are longer and
35817 /// some i16 instructions are slow.
35818 bool X86TargetLowering::isTypeDesirableForOp(unsigned Opc, EVT VT) const {
35819 if (!isTypeLegal(VT))
35820 return false;
35821 if (VT != MVT::i16)
35822 return true;
35828 case ISD::SIGN_EXTEND:
35829 case ISD::ZERO_EXTEND:
35830 case ISD::ANY_EXTEND:
35843 /// This function checks if any of the users of EFLAGS copies the EFLAGS. We
35844 /// know that the code that lowers COPY of EFLAGS has to use the stack, and if
35845 /// we don't adjust the stack we clobber the first frame index.
35846 /// See X86InstrInfo::copyPhysReg.
35847 static bool hasCopyImplyingStackAdjustment(const MachineFunction &MF) {
35848 const MachineRegisterInfo &MRI = MF.getRegInfo();
35849 return any_of(MRI.reg_instructions(X86::EFLAGS),
35850 [](const MachineInstr &RI) { return RI.isCopy(); });
35853 void X86TargetLowering::finalizeLowering(MachineFunction &MF) const {
35854 if (hasCopyImplyingStackAdjustment(MF)) {
35855 MachineFrameInfo &MFI = MF.getFrameInfo();
35856 MFI.setHasCopyImplyingStackAdjustment(true);
35859 TargetLoweringBase::finalizeLowering(MF);
35862 /// This method queries the target whether it is beneficial for dag combiner to
35863 /// promote the specified node. If true, it should return the desired promotion
35864 /// type by reference.
35865 bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
35866 EVT VT = Op.getValueType();
35867 if (VT != MVT::i16)
35870 bool Promote = false;
35871 bool Commute = false;
35872 switch (Op.getOpcode()) {
35874 case ISD::SIGN_EXTEND:
35875 case ISD::ZERO_EXTEND:
35876 case ISD::ANY_EXTEND:
35881 SDValue N0 = Op.getOperand(0);
35882 // Look out for (store (shl (load), x)).
35883 if (MayFoldLoad(N0) && MayFoldIntoStore(Op))
35896 SDValue N0 = Op.getOperand(0);
35897 SDValue N1 = Op.getOperand(1);
35898 if (!Commute && MayFoldLoad(N1))
35900 // Avoid disabling potential load folding opportunities.
35901 if (MayFoldLoad(N0) && (!isa<ConstantSDNode>(N1) || MayFoldIntoStore(Op)))
35903 if (MayFoldLoad(N1) && (!isa<ConstantSDNode>(N0) || MayFoldIntoStore(Op)))
35913 //===----------------------------------------------------------------------===//
35914 // X86 Inline Assembly Support
35915 //===----------------------------------------------------------------------===//
35917 // Helper to match a string separated by whitespace.
35918 static bool matchAsm(StringRef S, ArrayRef<const char *> Pieces) {
35919 S = S.substr(S.find_first_not_of(" \t")); // Skip leading whitespace.
35921 for (StringRef Piece : Pieces) {
35922 if (!S.startswith(Piece)) // Check if the piece matches.
35923 return false;
35925 S = S.substr(Piece.size());
35926 StringRef::size_type Pos = S.find_first_not_of(" \t");
35927 if (Pos == 0) // We matched a prefix.
35928 return false;
35930 S = S.substr(Pos);
35936 static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
35938 if (AsmPieces.size() == 3 || AsmPieces.size() == 4) {
35939 if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{cc}") &&
35940 std::count(AsmPieces.begin(), AsmPieces.end(), "~{flags}") &&
35941 std::count(AsmPieces.begin(), AsmPieces.end(), "~{fpsr}")) {
35943 if (AsmPieces.size() == 3)
35944 return true;
35945 else if (std::count(AsmPieces.begin(), AsmPieces.end(), "~{dirflag}"))
35946 return true;
35952 bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
35953 InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
35955 const std::string &AsmStr = IA->getAsmString();
35957 IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
35958 if (!Ty || Ty->getBitWidth() % 16 != 0)
35959 return false;
35961 // TODO: should remove alternatives from the asmstring: "foo {a|b}" -> "foo a"
35962 SmallVector<StringRef, 4> AsmPieces;
35963 SplitString(AsmStr, AsmPieces, ";\n");
35965 switch (AsmPieces.size()) {
35966 default: return false;
35967 case 1:
35968 // FIXME: this should verify that we are targeting a 486 or better. If not,
35969 // we will turn this bswap into something that will be lowered to logical
35970 // ops instead of emitting the bswap asm. For now, we don't support 486 or
35971 // lower so don't worry about this.
35973 if (matchAsm(AsmPieces[0], {"bswap", "$0"}) ||
35974 matchAsm(AsmPieces[0], {"bswapl", "$0"}) ||
35975 matchAsm(AsmPieces[0], {"bswapq", "$0"}) ||
35976 matchAsm(AsmPieces[0], {"bswap", "${0:q}"}) ||
35977 matchAsm(AsmPieces[0], {"bswapl", "${0:q}"}) ||
35978 matchAsm(AsmPieces[0], {"bswapq", "${0:q}"})) {
35979 // No need to check constraints, nothing other than the equivalent of
35980 // "=r,0" would be valid here.
35981 return IntrinsicLowering::LowerToByteSwap(CI);
35984 // rorw $$8, ${0:w} --> llvm.bswap.i16
35985 if (CI->getType()->isIntegerTy(16) &&
35986 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
35987 (matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) ||
35988 matchAsm(AsmPieces[0], {"rolw", "$$8,", "${0:w}"}))) {
35990 StringRef ConstraintsStr = IA->getConstraintString();
35991 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
35992 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
35993 if (clobbersFlagRegisters(AsmPieces))
35994 return IntrinsicLowering::LowerToByteSwap(CI);
35996 break;
35997 case 3:
35998 if (CI->getType()->isIntegerTy(32) &&
35999 IA->getConstraintString().compare(0, 5, "=r,0,") == 0 &&
36000 matchAsm(AsmPieces[0], {"rorw", "$$8,", "${0:w}"}) &&
36001 matchAsm(AsmPieces[1], {"rorl", "$$16,", "$0"}) &&
36002 matchAsm(AsmPieces[2], {"rorw", "$$8,", "${0:w}"})) {
36004 StringRef ConstraintsStr = IA->getConstraintString();
36005 SplitString(StringRef(ConstraintsStr).substr(5), AsmPieces, ",");
36006 array_pod_sort(AsmPieces.begin(), AsmPieces.end());
36007 if (clobbersFlagRegisters(AsmPieces))
36008 return IntrinsicLowering::LowerToByteSwap(CI);
36011 if (CI->getType()->isIntegerTy(64)) {
36012 InlineAsm::ConstraintInfoVector Constraints = IA->ParseConstraints();
36013 if (Constraints.size() >= 2 &&
36014 Constraints[0].Codes.size() == 1 && Constraints[0].Codes[0] == "A" &&
36015 Constraints[1].Codes.size() == 1 && Constraints[1].Codes[0] == "0") {
36016 // bswap %eax / bswap %edx / xchgl %eax, %edx -> llvm.bswap.i64
36017 if (matchAsm(AsmPieces[0], {"bswap", "%eax"}) &&
36018 matchAsm(AsmPieces[1], {"bswap", "%edx"}) &&
36019 matchAsm(AsmPieces[2], {"xchgl", "%eax,", "%edx"}))
36020 return IntrinsicLowering::LowerToByteSwap(CI);
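// Editor's note (illustrative, not from the original source): e.g.
//   asm("bswap $0" : "=r"(v) : "0"(v))
// on an i32 value is replaced by a call to llvm.bswap.i32 above, while the
// EAX/EDX "A"-constraint sequence just matched maps to llvm.bswap.i64.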
36028 /// Given a constraint letter, return the type of constraint for this target.
36029 X86TargetLowering::ConstraintType
36030 X86TargetLowering::getConstraintType(StringRef Constraint) const {
36031 if (Constraint.size() == 1) {
36032 switch (Constraint[0]) {
36044 return C_RegisterClass;
36045 case 'k': // AVX512 masking registers.
36069 else if (Constraint.size() == 2) {
36070 switch (Constraint[0]) {
36074 switch (Constraint[1]) {
36082 return TargetLowering::getConstraintType(Constraint);
36085 /// Examine constraint type and operand type and determine a weight value.
36086 /// This object must already have been set up with the operand type
36087 /// and the current alternative constraint selected.
36088 TargetLowering::ConstraintWeight
36089 X86TargetLowering::getSingleConstraintMatchWeight(
36090 AsmOperandInfo &info, const char *constraint) const {
36091 ConstraintWeight weight = CW_Invalid;
36092 Value *CallOperandVal = info.CallOperandVal;
36093 // If we don't have a value, we can't do a match,
36094 // but allow it at the lowest weight.
36095 if (!CallOperandVal)
36096 return CW_Default;
36097 Type *type = CallOperandVal->getType();
36098 // Look at the constraint type.
36099 switch (*constraint) {
36101 weight = TargetLowering::getSingleConstraintMatchWeight(info, constraint);
36113 if (CallOperandVal->getType()->isIntegerTy())
36114 weight = CW_SpecificReg;
36119 if (type->isFloatingPointTy())
36120 weight = CW_SpecificReg;
36123 if (type->isX86_MMXTy() && Subtarget.hasMMX())
36124 weight = CW_SpecificReg;
36127 // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
36128 if (constraint[1] == 'k') {
36129 // Support for 'Yk' (similarly to the 'k' variant below).
36130 weight = CW_SpecificReg;
36133 // Else fall through (handle "Y" constraint).
36136 if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
36137 weight = CW_Register;
36140 if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
36141 ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasFp256()))
36142 weight = CW_Register;
36145 // Enable conditional vector operations using %k<#> registers.
36146 weight = CW_SpecificReg;
36149 if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
36150 if (C->getZExtValue() <= 31)
36151 weight = CW_Constant;
36155 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36156 if (C->getZExtValue() <= 63)
36157 weight = CW_Constant;
36161 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36162 if ((C->getSExtValue() >= -0x80) && (C->getSExtValue() <= 0x7f))
36163 weight = CW_Constant;
36167 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36168 if ((C->getZExtValue() == 0xff) || (C->getZExtValue() == 0xffff))
36169 weight = CW_Constant;
36173 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36174 if (C->getZExtValue() <= 3)
36175 weight = CW_Constant;
36179 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36180 if (C->getZExtValue() <= 0xff)
36181 weight = CW_Constant;
36186 if (isa<ConstantFP>(CallOperandVal)) {
36187 weight = CW_Constant;
36191 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36192 if ((C->getSExtValue() >= -0x80000000LL) &&
36193 (C->getSExtValue() <= 0x7fffffffLL))
36194 weight = CW_Constant;
36198 if (ConstantInt *C = dyn_cast<ConstantInt>(CallOperandVal)) {
36199 if (C->getZExtValue() <= 0xffffffff)
36200 weight = CW_Constant;
36207 /// Try to replace an X constraint, which matches anything, with another that
36208 /// has more specific requirements based on the type of the corresponding
36209 /// operand.
36210 const char *X86TargetLowering::
36211 LowerXConstraint(EVT ConstraintVT) const {
36212 // FP X constraints get lowered to SSE1/2 registers if available, otherwise
36213 // 'f' like normal targets.
36214 if (ConstraintVT.isFloatingPoint()) {
36215 if (Subtarget.hasSSE2())
36216 return "Y";
36217 if (Subtarget.hasSSE1())
36218 return "x";
36221 return TargetLowering::LowerXConstraint(ConstraintVT);
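// Editor's note (illustrative, not from the original source): an "X"
// constraint on a floating-point operand is thus rewritten to "Y" when SSE2 is
// available, to "x" with only SSE1, and otherwise handled by the generic code.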
36224 /// Lower the specified operand into the Ops vector.
36225 /// If it is invalid, don't add anything to Ops.
36226 void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
36227 std::string &Constraint,
36228 std::vector<SDValue>&Ops,
36229 SelectionDAG &DAG) const {
36230 SDValue Result;
36232 // Only support length 1 constraints for now.
36233 if (Constraint.length() > 1) return;
36235 char ConstraintLetter = Constraint[0];
36236 switch (ConstraintLetter) {
36239 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36240 if (C->getZExtValue() <= 31) {
36241 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36242 Op.getValueType());
36248 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36249 if (C->getZExtValue() <= 63) {
36250 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36251 Op.getValueType());
36257 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36258 if (isInt<8>(C->getSExtValue())) {
36259 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36260 Op.getValueType());
36266 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36267 if (C->getZExtValue() == 0xff || C->getZExtValue() == 0xffff ||
36268 (Subtarget.is64Bit() && C->getZExtValue() == 0xffffffff)) {
36269 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op),
36270 Op.getValueType());
36276 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36277 if (C->getZExtValue() <= 3) {
36278 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36279 Op.getValueType());
36285 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36286 if (C->getZExtValue() <= 255) {
36287 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36288 Op.getValueType());
36294 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36295 if (C->getZExtValue() <= 127) {
36296 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36297 Op.getValueType());
36303 // 32-bit signed value
36304 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36305 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36306 C->getSExtValue())) {
36307 // Widen to 64 bits here to get it sign extended.
36308 Result = DAG.getTargetConstant(C->getSExtValue(), SDLoc(Op), MVT::i64);
36311 // FIXME gcc accepts some relocatable values here too, but only in certain
36312 // memory models; it's complicated.
36317 // 32-bit unsigned value
36318 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
36319 if (ConstantInt::isValueValidForType(Type::getInt32Ty(*DAG.getContext()),
36320 C->getZExtValue())) {
36321 Result = DAG.getTargetConstant(C->getZExtValue(), SDLoc(Op),
36322 Op.getValueType());
36326 // FIXME gcc accepts some relocatable values here too, but only in certain
36327 // memory models; it's complicated.
36331 // Literal immediates are always ok.
36332 if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(Op)) {
36333 // Widen to 64 bits here to get it sign extended.
36334 Result = DAG.getTargetConstant(CST->getSExtValue(), SDLoc(Op), MVT::i64);
36338 // In any sort of PIC mode, addresses need to be computed at runtime by
36339 // adding in a register or some sort of table lookup. These can't
36340 // be used as immediates.
36341 if (Subtarget.isPICStyleGOT() || Subtarget.isPICStyleStubPIC())
36344 // If we are in non-pic codegen mode, we allow the address of a global (with
36345 // an optional displacement) to be used with 'i'.
36346 GlobalAddressSDNode *GA = nullptr;
36347 int64_t Offset = 0;
36349 // Match either (GA), (GA+C), (GA+C1+C2), etc.
36351 if ((GA = dyn_cast<GlobalAddressSDNode>(Op))) {
36352 Offset += GA->getOffset();
36354 } else if (Op.getOpcode() == ISD::ADD) {
36355 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36356 Offset += C->getZExtValue();
36357 Op = Op.getOperand(0);
36360 } else if (Op.getOpcode() == ISD::SUB) {
36361 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
36362 Offset += -C->getZExtValue();
36363 Op = Op.getOperand(0);
36368 // Otherwise, this isn't something we can handle, reject it.
36372 const GlobalValue *GV = GA->getGlobal();
36373 // If we require an extra load to get this address, as in PIC mode, we
36374 // can't accept it.
36375 if (isGlobalStubReference(Subtarget.classifyGlobalReference(GV)))
36378 Result = DAG.getTargetGlobalAddress(GV, SDLoc(Op),
36379 GA->getValueType(0), Offset);
36384 if (Result.getNode()) {
36385 Ops.push_back(Result);
36388 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
36391 /// Check if \p RC is a general purpose register class.
36392 /// I.e., GR* or one of their variants.
36393 static bool isGRClass(const TargetRegisterClass &RC) {
36394 return RC.hasSuperClassEq(&X86::GR8RegClass) ||
36395 RC.hasSuperClassEq(&X86::GR16RegClass) ||
36396 RC.hasSuperClassEq(&X86::GR32RegClass) ||
36397 RC.hasSuperClassEq(&X86::GR64RegClass) ||
36398 RC.hasSuperClassEq(&X86::LOW32_ADDR_ACCESS_RBPRegClass);
36401 /// Check if \p RC is a vector register class.
36402 /// I.e., FR* / VR* or one of their variants.
36403 static bool isFRClass(const TargetRegisterClass &RC) {
36404 return RC.hasSuperClassEq(&X86::FR32XRegClass) ||
36405 RC.hasSuperClassEq(&X86::FR64XRegClass) ||
36406 RC.hasSuperClassEq(&X86::VR128XRegClass) ||
36407 RC.hasSuperClassEq(&X86::VR256XRegClass) ||
36408 RC.hasSuperClassEq(&X86::VR512RegClass);
36411 std::pair<unsigned, const TargetRegisterClass *>
36412 X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
36413 StringRef Constraint,
36414 MVT VT) const {
36415 // First, see if this is a constraint that directly corresponds to an LLVM
36416 // register class.
36417 if (Constraint.size() == 1) {
36418 // GCC Constraint Letters
36419 switch (Constraint[0]) {
36421 // TODO: Slight differences here in allocation order and leaving
36422 // RIP in the class. Do they matter any more here than they do
36423 // in the normal allocation?
36424 case 'k':
36425 if (Subtarget.hasAVX512()) {
36426 // Only supported in AVX512 or later.
36427 switch (VT.SimpleTy) {
36428 default: break;
36429 case MVT::i32:
36430 return std::make_pair(0U, &X86::VK32RegClass);
36431 case MVT::i16:
36432 return std::make_pair(0U, &X86::VK16RegClass);
36433 case MVT::i8:
36434 return std::make_pair(0U, &X86::VK8RegClass);
36435 case MVT::i1:
36436 return std::make_pair(0U, &X86::VK1RegClass);
36437 case MVT::i64:
36438 return std::make_pair(0U, &X86::VK64RegClass);
36442 case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
36443 if (Subtarget.is64Bit()) {
36444 if (VT == MVT::i32 || VT == MVT::f32)
36445 return std::make_pair(0U, &X86::GR32RegClass);
36446 if (VT == MVT::i16)
36447 return std::make_pair(0U, &X86::GR16RegClass);
36448 if (VT == MVT::i8 || VT == MVT::i1)
36449 return std::make_pair(0U, &X86::GR8RegClass);
36450 if (VT == MVT::i64 || VT == MVT::f64)
36451 return std::make_pair(0U, &X86::GR64RegClass);
36455 // 32-bit fallthrough
36456 case 'Q': // Q_REGS
36457 if (VT == MVT::i32 || VT == MVT::f32)
36458 return std::make_pair(0U, &X86::GR32_ABCDRegClass);
36459 if (VT == MVT::i16)
36460 return std::make_pair(0U, &X86::GR16_ABCDRegClass);
36461 if (VT == MVT::i8 || VT == MVT::i1)
36462 return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
36463 if (VT == MVT::i64)
36464 return std::make_pair(0U, &X86::GR64_ABCDRegClass);
36466 case 'r': // GENERAL_REGS
36467 case 'l': // INDEX_REGS
36468 if (VT == MVT::i8 || VT == MVT::i1)
36469 return std::make_pair(0U, &X86::GR8RegClass);
36470 if (VT == MVT::i16)
36471 return std::make_pair(0U, &X86::GR16RegClass);
36472 if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
36473 return std::make_pair(0U, &X86::GR32RegClass);
36474 return std::make_pair(0U, &X86::GR64RegClass);
36475 case 'R': // LEGACY_REGS
36476 if (VT == MVT::i8 || VT == MVT::i1)
36477 return std::make_pair(0U, &X86::GR8_NOREXRegClass);
36478 if (VT == MVT::i16)
36479 return std::make_pair(0U, &X86::GR16_NOREXRegClass);
36480 if (VT == MVT::i32 || !Subtarget.is64Bit())
36481 return std::make_pair(0U, &X86::GR32_NOREXRegClass);
36482 return std::make_pair(0U, &X86::GR64_NOREXRegClass);
36483 case 'f': // FP Stack registers.
36484 // If SSE is enabled for this VT, use f80 to ensure the isel moves the
36485 // value to the correct fpstack register class.
36486 if (VT == MVT::f32 && !isScalarFPTypeInSSEReg(VT))
36487 return std::make_pair(0U, &X86::RFP32RegClass);
36488 if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
36489 return std::make_pair(0U, &X86::RFP64RegClass);
36490 return std::make_pair(0U, &X86::RFP80RegClass);
36491 case 'y': // MMX_REGS if MMX allowed.
36492 if (!Subtarget.hasMMX()) break;
36493 return std::make_pair(0U, &X86::VR64RegClass);
36494 case 'Y': // SSE_REGS if SSE2 allowed
36495 if (!Subtarget.hasSSE2()) break;
36498 case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
36499 if (!Subtarget.hasSSE1()) break;
36500 bool VConstraint = (Constraint[0] == 'v');
36502 switch (VT.SimpleTy) {
36504 // Scalar SSE types.
36507 if (VConstraint && Subtarget.hasAVX512() && Subtarget.hasVLX())
36508 return std::make_pair(0U, &X86::FR32XRegClass);
36509 return std::make_pair(0U, &X86::FR32RegClass);
36512 if (VConstraint && Subtarget.hasVLX())
36513 return std::make_pair(0U, &X86::FR64XRegClass);
36514 return std::make_pair(0U, &X86::FR64RegClass);
36515 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36523 if (VConstraint && Subtarget.hasVLX())
36524 return std::make_pair(0U, &X86::VR128XRegClass);
36525 return std::make_pair(0U, &X86::VR128RegClass);
36533 if (VConstraint && Subtarget.hasVLX())
36534 return std::make_pair(0U, &X86::VR256XRegClass);
36535 return std::make_pair(0U, &X86::VR256RegClass);
36540 return std::make_pair(0U, &X86::VR512RegClass);
36544 } else if (Constraint.size() == 2 && Constraint[0] == 'Y') {
36545 switch (Constraint[1]) {
36549 // This register class doesn't allocate k0 for masked vector operation.
36550 if (Subtarget.hasAVX512()) { // Only supported in AVX512.
36551 switch (VT.SimpleTy) {
36552 default: break;
36553 case MVT::i32:
36554 return std::make_pair(0U, &X86::VK32WMRegClass);
36555 case MVT::i16:
36556 return std::make_pair(0U, &X86::VK16WMRegClass);
36557 case MVT::i8:
36558 return std::make_pair(0U, &X86::VK8WMRegClass);
36559 case MVT::i1:
36560 return std::make_pair(0U, &X86::VK1WMRegClass);
36561 case MVT::i64:
36562 return std::make_pair(0U, &X86::VK64WMRegClass);
36569 // Use the default implementation in TargetLowering to convert the register
36570 // constraint into a member of a register class.
36571 std::pair<unsigned, const TargetRegisterClass*> Res;
36572 Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
36574 // Not found as a standard register?
36576 // Map st(0) -> st(7) -> ST0
36577 if (Constraint.size() == 7 && Constraint[0] == '{' &&
36578 tolower(Constraint[1]) == 's' &&
36579 tolower(Constraint[2]) == 't' &&
36580 Constraint[3] == '(' &&
36581 (Constraint[4] >= '0' && Constraint[4] <= '7') &&
36582 Constraint[5] == ')' &&
36583 Constraint[6] == '}') {
36585 Res.first = X86::FP0+Constraint[4]-'0';
36586 Res.second = &X86::RFP80RegClass;
36590 // GCC allows "st(0)" to be called just plain "st".
36591 if (StringRef("{st}").equals_lower(Constraint)) {
36592 Res.first = X86::FP0;
36593 Res.second = &X86::RFP80RegClass;
36598 if (StringRef("{flags}").equals_lower(Constraint)) {
36599 Res.first = X86::EFLAGS;
36600 Res.second = &X86::CCRRegClass;
36604 // 'A' means [ER]AX + [ER]DX.
36605 if (Constraint == "A") {
36606 if (Subtarget.is64Bit()) {
36607 Res.first = X86::RAX;
36608 Res.second = &X86::GR64_ADRegClass;
36610 assert((Subtarget.is32Bit() || Subtarget.is16Bit()) &&
36611 "Expecting 64, 32 or 16 bit subtarget");
36612 Res.first = X86::EAX;
36613 Res.second = &X86::GR32_ADRegClass;
36620 // Otherwise, check to see if this is a register class of the wrong value
36621 // type. For example, we want to map "{ax},i32" -> {eax}, we don't want it to
36622 // turn into {ax},{dx}.
36623 // MVT::Other is used to specify clobber names.
36624 if (TRI->isTypeLegalForClass(*Res.second, VT) || VT == MVT::Other)
36625 return Res; // Correct type already, nothing to do.
36627 // Get a matching integer of the correct size. i.e. "ax" with MVT::i32 should
36628 // return "eax". This should even work for things like getting 64bit integer
36629 // registers when given an f64 type.
36630 const TargetRegisterClass *Class = Res.second;
36631 // The generic code will match the first register class that contains the
36632 // given register. Thus, based on the ordering of the tablegened file,
36633 // the "plain" GR classes might not come first.
36634 // Therefore, use a helper method.
36635 if (isGRClass(*Class)) {
36636 unsigned Size = VT.getSizeInBits();
36637 if (Size == 1) Size = 8;
36638 unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
36639 if (DestReg > 0) {
36640 Res.first = DestReg;
36641 Res.second = Size == 8 ? &X86::GR8RegClass
36642 : Size == 16 ? &X86::GR16RegClass
36643 : Size == 32 ? &X86::GR32RegClass
36644 : &X86::GR64RegClass;
36645 assert(Res.second->contains(Res.first) && "Register in register class");
36646 } else {
36647 // No register found/type mismatch.
36648 Res.first = 0;
36649 Res.second = nullptr;
36650 }
36651 } else if (isFRClass(*Class)) {
36652 // Handle references to XMM physical registers that got mapped into the
36653 // wrong class. This can happen with constraints like {xmm0} where the
36654 // target independent register mapper will just pick the first match it can
36655 // find, ignoring the required type.
36657 // TODO: Handle f128 and i128 in FR128RegClass after it is tested well.
36658 if (VT == MVT::f32 || VT == MVT::i32)
36659 Res.second = &X86::FR32RegClass;
36660 else if (VT == MVT::f64 || VT == MVT::i64)
36661 Res.second = &X86::FR64RegClass;
36662 else if (TRI->isTypeLegalForClass(X86::VR128RegClass, VT))
36663 Res.second = &X86::VR128RegClass;
36664 else if (TRI->isTypeLegalForClass(X86::VR256RegClass, VT))
36665 Res.second = &X86::VR256RegClass;
36666 else if (TRI->isTypeLegalForClass(X86::VR512RegClass, VT))
36667 Res.second = &X86::VR512RegClass;
36669 // Type mismatch and not a clobber: Return an error;
36670 Res.first = 0;
36671 Res.second = nullptr;
36678 int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
36679 const AddrMode &AM, Type *Ty,
36680 unsigned AS) const {
36681 // Scaling factors are not free at all.
36682 // An indexed folded instruction, i.e., inst (reg1, reg2, scale),
36683 // will take 2 allocations in the out of order engine instead of 1
36684 // for plain addressing mode, i.e. inst (reg1).
36686 // vaddps (%rsi,%rdx), %ymm0, %ymm1
36687 // Requires two allocations (one for the load, one for the computation)
36689 // vaddps (%rsi), %ymm0, %ymm1
36690 // Requires just 1 allocation, i.e., freeing allocations for other operations
36691 // and having less micro operations to execute.
36693 // For some X86 architectures, this is even worse because for instance for
36694 // stores, the complex addressing mode forces the instruction to use the
36695 // "load" ports instead of the dedicated "store" port.
36696 // E.g., on Haswell:
36697 // vmovaps %ymm1, (%r8, %rdi) can use port 2 or 3.
36698 // vmovaps %ymm1, (%r8) can use port 2, 3, or 7.
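// Editor's note (illustrative, not from the original source): e.g. an AddrMode
// of the form base + index*4 reports a cost of 1 below (AM.Scale != 0), while
// a plain base-register mode reports 0.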
36699 if (isLegalAddressingMode(DL, AM, Ty, AS))
36700 // Scale represents reg2 * scale, thus account for 1
36701 // as soon as we use a second register.
36702 return AM.Scale != 0;
36703 return -1;
36706 bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
36707 // Integer division on x86 is expensive. However, when aggressively optimizing
36708 // for code size, we prefer to use a div instruction, as it is usually smaller
36709 // than the alternative sequence.
36710 // The exception to this is vector division. Since x86 doesn't have vector
36711 // integer division, leaving the division as-is is a loss even in terms of
36712 // size, because it will have to be scalarized, while the alternative code
36713 // sequence can be performed in vector form.
36714 bool OptSize =
36715 Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
36716 return OptSize && !VT.isVector();
36719 void X86TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
36720 if (!Subtarget.is64Bit())
36721 return;
36723 // Update IsSplitCSR in X86MachineFunctionInfo.
36724 X86MachineFunctionInfo *AFI =
36725 Entry->getParent()->getInfo<X86MachineFunctionInfo>();
36726 AFI->setIsSplitCSR(true);
36729 void X86TargetLowering::insertCopiesSplitCSR(
36730 MachineBasicBlock *Entry,
36731 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
36732 const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
36733 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
36734 if (!IStart)
36735 return;
36737 const TargetInstrInfo *TII = Subtarget.getInstrInfo();
36738 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
36739 MachineBasicBlock::iterator MBBI = Entry->begin();
36740 for (const MCPhysReg *I = IStart; *I; ++I) {
36741 const TargetRegisterClass *RC = nullptr;
36742 if (X86::GR64RegClass.contains(*I))
36743 RC = &X86::GR64RegClass;
36744 else
36745 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
36747 unsigned NewVR = MRI->createVirtualRegister(RC);
36748 // Create copy from CSR to a virtual register.
36749 // FIXME: this currently does not emit CFI pseudo-instructions, it works
36750 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
36751 // nounwind. If we want to generalize this later, we may need to emit
36752 // CFI pseudo-instructions.
36753 assert(Entry->getParent()->getFunction()->hasFnAttribute(
36754 Attribute::NoUnwind) &&
36755 "Function should be nounwind in insertCopiesSplitCSR!");
36756 Entry->addLiveIn(*I);
36757 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
36758 .addReg(*I);
36760 // Insert the copy-back instructions right before the terminator.
36761 for (auto *Exit : Exits)
36762 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
36763 TII->get(TargetOpcode::COPY), *I)
36764 .addReg(NewVR);
36768 bool X86TargetLowering::supportSwiftError() const {
36769 return Subtarget.is64Bit();
36772 /// Returns the name of the symbol used to emit stack probes or the empty
36773 /// string if not applicable.
36774 StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
36775 // If the function specifically requests stack probes, emit them.
36776 if (MF.getFunction()->hasFnAttribute("probe-stack"))
36777 return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
36779 // Generally, if we aren't on Windows, the platform ABI does not include
36780 // support for stack probes, so don't emit them.
36781 if (!Subtarget.isOSWindows() || Subtarget.isTargetMachO())
36782 return "";
36784 // We need a stack probe to conform to the Windows ABI. Choose the right
36785 // symbol.
36786 if (Subtarget.is64Bit())
36787 return Subtarget.isTargetCygMing() ? "___chkstk_ms" : "__chkstk";
36788 return Subtarget.isTargetCygMing() ? "_alloca" : "_chkstk";